CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0.
Nits. Tests pass in both Debug and Release modes (for all instructions involved).
This commit is contained in:
parent c248bf9fb4
commit 961b49e923

6 changed files with 80 additions and 27 deletions
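Note: F16C provides two VEX-encoded conversion instructions, VCVTPH2PS (FP16 -> FP32) and VCVTPS2PH (FP32 -> FP16), which let these fast paths replace the SoftFloat16_32/SoftFloat32_16 helper calls with a single hardware instruction. As a minimal standalone sketch (illustrative C#, not part of this commit), this is the binary16 -> binary32 mapping that both the hardware and the software fallback compute, ignoring the FPCR-driven flushing and exception flags the real helper also handles:

    using System;

    static float HalfBitsToSingle(ushort h)
    {
        uint sign = (uint)(h >> 15) & 1u;    // 1 sign bit
        uint exp  = (uint)(h >> 10) & 0x1Fu; // 5 exponent bits, bias 15
        uint frac = (uint)h & 0x3FFu;        // 10 fraction bits

        uint bits;
        if (exp == 0x1Fu) // infinity or NaN: top exponent, payload widened
        {
            bits = (sign << 31) | 0x7F800000u | (frac << 13);
        }
        else if (exp != 0u) // normal: rebias 15 -> 127, widen the fraction
        {
            bits = (sign << 31) | ((exp + 112u) << 23) | (frac << 13);
        }
        else if (frac == 0u) // signed zero
        {
            bits = sign << 31;
        }
        else // half subnormal: renormalize into a single-precision normal
        {
            uint e = 0;
            while ((frac & 0x400u) == 0) { frac <<= 1; e++; }
            bits = (sign << 31) | ((113u - e) << 23) | ((frac & 0x3FFu) << 13);
        }
        return BitConverter.Int32BitsToSingle((int)bits);
    }

Every binary16 value is exactly representable in binary32, so this direction never rounds; the Single -> Half direction does round, which is why the emitted VCVTPS2PH below takes an explicit rounding-control immediate.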
@@ -270,6 +270,8 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Unpcklps,  new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f14, InstructionFlags.Vex));
             Add(X86Instruction.Vblendvpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4b, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Xor,       new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
             Add(X86Instruction.Xorpd,     new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
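Note: per the Intel SDM (not visible in this diff), VCVTPH2PS xmm, xmm/m64 encodes as VEX.128.66.0F38 13 /r and VCVTPS2PH xmm/m64, xmm, imm8 as VEX.128.66.0F3A 1D /r ib, which is what the 0x000f3813 / 0x000f3a1d constants and the Vex | Prefix66 flags express; Vcvtps2ph appears in the first column because its destination is the r/m operand. A throwaway sketch (hypothetical helper, not ARMeilleure code) of how such packed constants read as opcode bytes, assuming the low three bytes hold the escape/opcode sequence as the entries above suggest:

    using System.Collections.Generic;

    static byte[] OpcodeBytes(uint packed)
    {
        // 0x000f3813 -> { 0x0F, 0x38, 0x13 }; 0x00000031 -> { 0x31 }.
        // Caution: entries such as 0x06000083 appear to pack a ModRM
        // opcode extension (/6) into the high byte, which this ignores.
        var bytes = new List<byte>();
        for (int shift = 16; shift >= 0; shift -= 8)
        {
            byte b = (byte)(packed >> shift);
            if (b != 0 || bytes.Count > 0)
            {
                bytes.Add(b);
            }
        }
        return bytes.ToArray();
    }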
@@ -162,6 +162,8 @@ namespace ARMeilleure.CodeGen.X86
            Add(Intrinsic.X86Unpckhps,  new IntrinsicInfo(X86Instruction.Unpckhps,  IntrinsicType.Binary));
            Add(Intrinsic.X86Unpcklpd,  new IntrinsicInfo(X86Instruction.Unpcklpd,  IntrinsicType.Binary));
            Add(Intrinsic.X86Unpcklps,  new IntrinsicInfo(X86Instruction.Unpcklps,  IntrinsicType.Binary));
+           Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
+           Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
            Add(Intrinsic.X86Xorpd,     new IntrinsicInfo(X86Instruction.Xorpd,     IntrinsicType.Binary));
            Add(Intrinsic.X86Xorps,     new IntrinsicInfo(X86Instruction.Xorps,     IntrinsicType.Binary));
        }
@@ -199,6 +199,8 @@ namespace ARMeilleure.CodeGen.X86
        Unpcklps,
        Vblendvpd,
        Vblendvps,
+       Vcvtph2ps,
+       Vcvtps2ph,
        Vpblendvb,
        Xor,
        Xorpd,
@@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions
            }
            else if (op.Size == 0 && op.Opc == 3) // Single -> Half.
            {
-               Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+               if (Optimizations.UseF16c)
+               {
+                   Debug.Assert(!Optimizations.ForceLegacySse);
 
-               Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+                   Operand n = GetVec(op.Rn);
 
-               res = context.ZeroExtend16(OperandType.I64, res);
+                   Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                   res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112()
+                   res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14));
 
-               context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                   context.Copy(GetVec(op.Rd), res);
+               }
+               else
+               {
+                   Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+
+                   Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+
+                   res = context.ZeroExtend16(OperandType.I64, res);
+
+                   context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+               }
            }
            else if (op.Size == 3 && op.Opc == 0) // Half -> Single.
            {
-               Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
+               if (Optimizations.UseF16c)
+               {
+                   Debug.Assert(!Optimizations.ForceLegacySse);
 
-               Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
+                   Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn));
+                   res = context.VectorZeroUpper96(res);
 
-               context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                   context.Copy(GetVec(op.Rd), res);
+               }
+               else
+               {
+                   Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
+
+                   Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
+
+                   context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+               }
            }
            else if (op.Size == 1 && op.Opc == 3) // Double -> Half.
            {
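Note on the Pslldq/Psrldq pair: scalar FCVT (Single -> Half) must write the 16-bit result to element 0 of Rd and zero the remaining 112 bits, but VCVTPS2PH converts all four single lanes, so results from the three unwanted lanes land in bits [63:16] and must be cleared. Shifting the whole 128-bit lane left and then right by 14 bytes keeps only the low 16 bits, hence the VectorZeroUpper112() comment. A standalone sketch of the same trick with .NET intrinsics (illustrative, not the IR above):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    // Keep only the low 16 bits of the vector, zeroing the upper 112.
    static Vector128<byte> ZeroUpper112(Vector128<byte> v)
    {
        v = Sse2.ShiftLeftLogical128BitLane(v, 14);     // PSLLDQ xmm, 14
        return Sse2.ShiftRightLogical128BitLane(v, 14); // PSRLDQ xmm, 14
    }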
@@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions
            if (Optimizations.UseSse2 && sizeF == 1)
            {
                Operand n = GetVec(op.Rn);
-               Operand res;
 
-               if (op.RegisterSize == RegisterSize.Simd128)
-               {
-                   res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
-               }
-               else
-               {
-                   res = n;
-               }
-
-               res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
+               Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+               res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
 
                context.Copy(GetVec(op.Rd), res);
            }
+           else if (Optimizations.UseF16c && sizeF == 0)
+           {
+               Debug.Assert(!Optimizations.ForceLegacySse);
+
+               Operand n = GetVec(op.Rn);
+
+               Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+               res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);
+
+               context.Copy(GetVec(op.Rd), res);
+           }
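Note: the new UseF16c branch deliberately mirrors the SSE2 one above it. FCVTL widens the low half of the source vector and FCVTL2 (RegisterSize.Simd128) the high half, so the "2" form first brings the upper 64 bits down with MOVHLPS and then converts in place. A sketch of the sizeF == 1 pattern with .NET intrinsics (the sizeF == 0 path has the same shape with VCVTPH2PS, for which no managed intrinsic exists to show here):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    // FCVTL2: widen the two upper singles of n to doubles.
    static Vector128<double> Fcvtl2(Vector128<float> n)
    {
        Vector128<float> hi = Sse.MoveHighToLow(n, n); // X86Movhlps: high qword -> low qword
        return Sse2.ConvertToVector128Double(hi);      // X86Cvtps2pd: convert the two low singles
    }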
@@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions
            {
                Operand d = GetVec(op.Rd);
 
-               Operand res = context.VectorZeroUpper64(d);
+               Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
 
                Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
+               nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
 
-               nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
-
-               Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
-                   ? Intrinsic.X86Movlhps
-                   : Intrinsic.X86Movhlps;
-
-               res = context.AddIntrinsic(movInst, res, nInt);
+               Operand res = context.VectorZeroUpper64(d);
+               res = context.AddIntrinsic(movInst, res, nInt);
 
                context.Copy(d, res);
            }
+           else if (Optimizations.UseF16c && sizeF == 0)
+           {
+               Debug.Assert(!Optimizations.ForceLegacySse);
+
+               Operand d = GetVec(op.Rd);
+               Operand n = GetVec(op.Rn);
+
+               Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
+
+               Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+               nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+
+               Operand res = context.VectorZeroUpper64(d);
+               res = context.AddIntrinsic(movInst, res, nInt);
+
+               context.Copy(d, res);
+           }
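Note: both narrowing branches now share one combine sequence. FCVTN writes the narrowed results to the low 64 bits of Rd and zeroes the top, while FCVTN2 writes them to the high 64 bits and preserves the low; that is why movInst selects X86Movlhps for Simd128 and X86Movhlps otherwise, applied against VectorZeroUpper64(d). A sketch of the FCVTN2 case with .NET intrinsics (double -> single shown; the FP16 variant swaps in VCVTPS2PH):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    // FCVTN2: narrow n to two singles and insert them into the upper
    // 64 bits of d, keeping d's lower 64 bits intact.
    static Vector128<float> Fcvtn2(Vector128<float> d, Vector128<double> n)
    {
        Vector128<float> nInt = Sse2.ConvertToVector128Single(n); // X86Cvtpd2ps: two singles in the low qword
        nInt = Sse.MoveLowToHigh(nInt, nInt);                     // X86Movlhps: duplicate the low qword

        Vector128<float> res = d.WithUpper(Vector64<float>.Zero); // VectorZeroUpper64(d)
        return Sse.MoveLowToHigh(res, nInt);                      // movInst for the "2" form
    }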
@@ -151,6 +151,8 @@ namespace ARMeilleure.IntermediateRepresentation
        X86Unpckhps,
        X86Unpcklpd,
        X86Unpcklps,
+       X86Vcvtph2ps,
+       X86Vcvtps2ph,
        X86Xorpd,
        X86Xorps
    }
@@ -1973,15 +1973,18 @@ namespace Ryujinx.Tests.Cpu
            CompareAgainstUnicorn();
        }
 
-       [Test, Pairwise] [Explicit]
+       [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
        public void F_Cvt_S_SH([ValueSource("_F_Cvt_S_SH_")] uint opcodes,
-                              [ValueSource("_1S_F_")] ulong a)
+                              [ValueSource("_1S_F_")] ulong a,
+                              [Values(RMode.Rn)] RMode rMode)
        {
            ulong z = TestContext.CurrentContext.Random.NextULong();
            V128 v0 = MakeVectorE0E1(z, z);
            V128 v1 = MakeVectorE0(a);
 
-           SingleOpcode(opcodes, v0: v0, v1: v1);
+           int fpcr = (int)rMode << (int)Fpcr.RMode;
+
+           SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr);
 
            CompareAgainstUnicorn();
        }
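Note on the fpcr plumbing: in the AArch64 FPCR, the RMode field occupies bits [23:22] and RMode.Rn (round to nearest) is encoding 0, so Fpcr.RMode here must be the shift amount 22; the test composes an otherwise-zero FPCR with the requested rounding mode. A sketch of the composition (enum values assumed to match the architectural encodings, not shown in this diff):

    enum RMode { Rn = 0, Rp = 1, Rm = 2, Rz = 3 } // nearest, +Inf, -Inf, zero

    static int MakeFpcr(RMode rMode)
    {
        const int rModeShift = 22; // FPCR.RMode occupies bits [23:22]
        return (int)rMode << rModeShift;
    }

With only RMode.Rn in [Values], the test still pins the mode explicitly, matching the comment that Unicorn defaults every rounding mode to RMode.Rn.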
@@ -2134,7 +2137,7 @@ namespace Ryujinx.Tests.Cpu
            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc);
        }
 
-       [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
+       [Test, Pairwise] [Explicit]
        public void F_Cvtn_V_2D2S_2D4S([ValueSource("_F_Cvtn_V_2D2S_2D4S_")] uint opcodes,
                                       [Values(0u)] uint rd,
                                       [Values(1u, 0u)] uint rn,