CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0.

Nits.

Tests performed successfully in both Debug and Release modes (for all instructions involved).
LDj3SNuD 2020-10-30 05:23:25 +01:00
parent c248bf9fb4
commit 961b49e923
6 changed files with 80 additions and 27 deletions
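
For context: the F16C fast paths below lean on VCVTPH2PS/VCVTPS2PH, which perform the same IEEE 754 binary16 <-> binary32 conversions that .NET 5's System.Half exposes in managed code. A minimal sketch of the per-lane semantics (illustrative only; this is not the emitter's code path):

using System;

class F16cSemanticsDemo
{
    static void Main()
    {
        // VCVTPH2PS per lane: binary16 -> binary32. Widening is exact,
        // since every binary16 value is representable as a binary32.
        float widened = (float)(Half)1.5f; // 1.5f

        // VCVTPS2PH per lane: binary32 -> binary16, with the
        // round-to-nearest-even behavior the emitter selects via imm8.
        Half narrowed = (Half)0.1f; // 0.1f is inexact in binary16; rounds to nearest

        Console.WriteLine($"{widened} {narrowed}");
    }
}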

@@ -270,6 +270,8 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Unpcklps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f14, InstructionFlags.Vex));
             Add(X86Instruction.Vblendvpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4b, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
             Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
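
Note on the two new table entries: the packed constants hold the escape and opcode bytes (0F 38 13 for VCVTPH2PS, 0F 3A 1D for VCVTPS2PH), and Vcvtps2ph goes in the first InstructionInfo slot, which the visible entries suggest is the imm8 form. A hypothetical helper showing how such a constant unpacks (the field layout is inferred from the entries above, not confirmed against the assembler):

static (byte Opcode, uint Escape) UnpackOpcode(uint packed)
{
    // Low byte is the primary opcode; the bytes above it are the
    // 0F / 0F 38 / 0F 3A escape sequence.
    return ((byte)packed, packed >> 8);
}

// UnpackOpcode(0x000f3813) => (0x13, 0x0F38) -> VCVTPH2PS (VEX.66.0F38 13)
// UnpackOpcode(0x000f3a1d) => (0x1D, 0x0F3A) -> VCVTPS2PH (VEX.66.0F3A 1D /imm8)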

@@ -162,6 +162,8 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Unpckhps, new IntrinsicInfo(X86Instruction.Unpckhps, IntrinsicType.Binary));
             Add(Intrinsic.X86Unpcklpd, new IntrinsicInfo(X86Instruction.Unpcklpd, IntrinsicType.Binary));
             Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
+            Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
+            Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
             Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
             Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary));
         }
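
Vcvtph2ps registers as Unary (one source operand), while Vcvtps2ph is BinaryImm: its second operand is the imm8 rounding control defined in the Intel SDM. For reference, the imm8 layout (values from the SDM, not from this codebase):

// VCVTPS2PH imm8: if bit 2 is set, MXCSR.RC is used; otherwise bits 1:0 pick the mode.
const byte RoundToNearestEven = 0b000;
const byte RoundDown          = 0b001; // toward -infinity
const byte RoundUp            = 0b010; // toward +infinity
const byte RoundTowardZero    = 0b011;
const byte UseMxcsrRounding   = 0b100;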

@@ -199,6 +199,8 @@ namespace ARMeilleure.CodeGen.X86
         Unpcklps,
         Vblendvpd,
         Vblendvps,
+        Vcvtph2ps,
+        Vcvtps2ph,
         Vpblendvb,
         Xor,
         Xorpd,

@@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions
             }
             else if (op.Size == 0 && op.Opc == 3) // Single -> Half.
             {
-                Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
-
-                Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
-
-                res = context.ZeroExtend16(OperandType.I64, res);
-
-                context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                if (Optimizations.UseF16c)
+                {
+                    Debug.Assert(!Optimizations.ForceLegacySse);
+
+                    Operand n = GetVec(op.Rn);
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                    res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112()
+                    res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14));
+
+                    context.Copy(GetVec(op.Rd), res);
+                }
+                else
+                {
+                    Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+
+                    Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+
+                    res = context.ZeroExtend16(OperandType.I64, res);
+
+                    context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                }
             }
             else if (op.Size == 3 && op.Opc == 0) // Half -> Single.
             {
-                Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
-
-                Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
-
-                context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                if (Optimizations.UseF16c)
+                {
+                    Debug.Assert(!Optimizations.ForceLegacySse);
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn));
+                    res = context.VectorZeroUpper96(res);
+
+                    context.Copy(GetVec(op.Rd), res);
+                }
+                else
+                {
+                    Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
+
+                    Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
+
+                    context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                }
             }
             else if (op.Size == 1 && op.Opc == 3) // Double -> Half.
             {
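
A managed model of what the Single -> Half fast path above computes, assuming System.Half and BitConverter.HalfToInt16Bits (both available on .NET 5) stand in for VCVTPS2PH's round-to-nearest conversion:

using System;

static class FcvtScalarModel
{
    // FCVT Hd, Sn: one binary32 -> binary16, round to nearest even;
    // lanes 1..7 of the 128-bit destination end up zero, which is what
    // the Pslldq/Psrldq-by-14 pair achieves after VCVTPS2PH.
    public static ushort[] SingleToHalf(float sn)
    {
        var d = new ushort[8]; // 128-bit register as eight 16-bit lanes
        d[0] = (ushort)BitConverter.HalfToInt16Bits((Half)sn);
        return d;
    }
}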
@@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions
             if (Optimizations.UseSse2 && sizeF == 1)
             {
                 Operand n = GetVec(op.Rn);
-                Operand res;
-
-                if (op.RegisterSize == RegisterSize.Simd128)
-                {
-                    res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
-                }
-                else
-                {
-                    res = n;
-                }
-
-                res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
+
+                Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+                res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseF16c && sizeF == 0)
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
+
+                Operand n = GetVec(op.Rn);
+
+                Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+                res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);

                 context.Copy(GetVec(op.Rd), res);
             }
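
The new F16c arm covers FCVTL/FCVTL2 with sizeF == 0 (Half -> Single): Movhlps selects the upper four half-precision lanes for the FCVTL2 form, then a single VCVTPH2PS widens all four at once. A managed sketch of the semantics (hypothetical helper, not emitter code):

using System;

static class FcvtlModel
{
    // part == 0 models FCVTL (low 64 bits of Vn); part == 1 models FCVTL2
    // (high 64 bits, reached above via Movhlps when RegisterSize is Simd128).
    public static float[] Fcvtl(Half[] n /* eight halves = 128 bits */, int part)
    {
        var res = new float[4];
        for (int i = 0; i < 4; i++)
        {
            res[i] = (float)n[part * 4 + i]; // binary16 -> binary32 is exact
        }
        return res;
    }
}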
@@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions
             {
                 Operand d = GetVec(op.Rd);

-                Operand res = context.VectorZeroUpper64(d);
-
-                Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
-
-                nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
-
-                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
-                    ? Intrinsic.X86Movlhps
-                    : Intrinsic.X86Movhlps;
-
-                res = context.AddIntrinsic(movInst, res, nInt);
+                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
+
+                Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
+                nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+
+                Operand res = context.VectorZeroUpper64(d);
+                res = context.AddIntrinsic(movInst, res, nInt);
+
+                context.Copy(d, res);
+            }
+            else if (Optimizations.UseF16c && sizeF == 0)
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+
+                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
+
+                Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+
+                Operand res = context.VectorZeroUpper64(d);
+                res = context.AddIntrinsic(movInst, res, nInt);

                 context.Copy(d, res);
             }
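
Likewise for FCVTN/FCVTN2 with sizeF == 0 (Single -> Half): VCVTPS2PH narrows four singles with round-to-nearest, and movInst routes the 64-bit result into the low half of Vd (FCVTN, upper half zeroed) or the high half (FCVTN2, low half preserved). A managed sketch under the same assumptions:

using System;

static class FcvtnModel
{
    // part == 0 models FCVTN; part == 1 models FCVTN2.
    public static Half[] Fcvtn(Half[] d /* current Vd */, float[] n /* four singles */, int part)
    {
        var res = new Half[8];
        if (part == 1)
        {
            Array.Copy(d, res, 4); // FCVTN2 keeps Vd's low 64 bits
        }
        for (int i = 0; i < 4; i++)
        {
            res[part * 4 + i] = (Half)n[i]; // round to nearest even
        }
        return res; // for FCVTN (part == 0) the high 64 bits stay zero
    }
}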

@@ -151,6 +151,8 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Unpckhps,
         X86Unpcklpd,
         X86Unpcklps,
+        X86Vcvtph2ps,
+        X86Vcvtps2ph,
         X86Xorpd,
         X86Xorps
     }

@@ -1973,15 +1973,18 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }

-        [Test, Pairwise] [Explicit]
+        [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
         public void F_Cvt_S_SH([ValueSource("_F_Cvt_S_SH_")] uint opcodes,
-                               [ValueSource("_1S_F_")] ulong a)
+                               [ValueSource("_1S_F_")] ulong a,
+                               [Values(RMode.Rn)] RMode rMode)
         {
             ulong z = TestContext.CurrentContext.Random.NextULong();
             V128 v0 = MakeVectorE0E1(z, z);
             V128 v1 = MakeVectorE0(a);

-            SingleOpcode(opcodes, v0: v0, v1: v1);
+            int fpcr = (int)rMode << (int)Fpcr.RMode;
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr);

             CompareAgainstUnicorn();
         }
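
The test now sets FPCR explicitly instead of relying on the default rounding mode. On AArch64, FPCR.RMode occupies bits [23:22], so Fpcr.RMode here is presumably the shift amount 22 and the computation expands to:

// RMode encodings (AArch64): 0b00 = RN (nearest), 0b01 = RP (+inf),
// 0b10 = RM (-inf), 0b11 = RZ (toward zero).
int fpcr = (int)RMode.Rn << 22; // == 0: round to nearest, matching Unicorn's default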
@@ -2134,7 +2137,7 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc);
         }

-        [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
+        [Test, Pairwise] [Explicit]
         public void F_Cvtn_V_2D2S_2D4S([ValueSource("_F_Cvtn_V_2D2S_2D4S_")] uint opcodes,
                                        [Values(0u)] uint rd,
                                        [Values(1u, 0u)] uint rn,