Jit64: Emulate FMA accurately when determinism is enabled

When determinism is enabled, we either want all CPUs to use FMA or
we want no CPUs to use FMA. Until now, Jit64 has been doing
the latter. However, this is inaccurate behavior; all CPUs since
Haswell support FMA, and getting JitArm64 to match the exact
inaccurate rounding used by Jit64 would be a bit annoying. This
commit switches us over to using FMA on all CPUs when determinism
is enabled, with older CPUs calling the std::fma function.
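
For context, an illustration that is not part of the commit: the nondeterminism
comes from double rounding. A fused multiply-add rounds a*c + b once, while a
separate multiply and add round twice, so the two strategies can differ in the
last bit. A minimal standalone C++ sketch (values chosen purely for
illustration):

#include <cfloat>
#include <cmath>
#include <cstdio>

int main()
{
  // (1 + eps)^2 = 1 + 2*eps + eps^2 exactly; rounding the product to a double
  // drops the eps^2 term, but a fused multiply-add keeps it.
  const double a = 1.0 + DBL_EPSILON;
  const double c = 1.0 + DBL_EPSILON;
  const double b = -(1.0 + 2.0 * DBL_EPSILON);

  const double unfused = a * c + b;        // two roundings: prints 0
  const double fused = std::fma(a, c, b);  // one rounding: prints ~4.9e-32
  std::printf("unfused = %g, fused = %g\n", unfused, fused);
}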
Author: JosJuice
Date:   2021-05-23 23:00:57 +02:00
Parent: 9bc5bd83a9
Commit: 2c38d6419e
2 changed files with 186 additions and 108 deletions


@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <cmath>
+#include <limits>
 #include <vector>
 
 #include "Common/Assert.h"
@@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   JITDISABLE(bJITFloatingPointOff);
   FALLBACK_IF(inst.Rc);
 
+  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
+  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
+
   int a = inst.FA;
   int b = inst.FB;
   int c = inst.FC;
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
-                                   js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
+  bool packed =
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
+                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
 
-  // While we don't know if any games are actually affected (replays seem to work with all the usual
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
-  // be extra careful and don't use FMA, even if in theory it might be okay.
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
-  // instances on different computers giving identical results.
-  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
-
-  // For use_fma == true:
-  // Statistics suggests b is a lot less likely to be unbound in practice, so
-  // if we have to pick one of a or b to bind, let's make it b.
-  RCOpArg Ra = fpr.Use(a, RCMode::Read);
-  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-  RCOpArg Rc = fpr.Use(c, RCMode::Read);
-  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-  RegCache::Realize(Ra, Rb, Rc, Rd);
-
-  switch (inst.SUBOP5)
-  {
-  case 14:
-    MOVDDUP(XMM1, Rc);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  case 15:
-    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  default:
-    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
-    X64Reg tmp1 = special ? XMM0 : XMM1;
-    X64Reg tmp2 = special ? XMM1 : XMM0;
-    if (single && round_input)
-      Force25BitPrecision(tmp1, Rc, tmp2);
-    else
-      MOVAPD(tmp1, Rc);
-    break;
-  }
-
-  if (use_fma)
-  {
-    switch (inst.SUBOP5)
-    {
-    case 28:  // msub
-      if (packed)
-        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 14:  // madds0
-    case 15:  // madds1
-    case 29:  // madd
-      if (packed)
-        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    // PowerPC and x86 define NMADD/NMSUB differently
-    // x86: D = -A*C (+/-) B
-    // PPC: D = -(A*C (+/-) B)
-    // so we have to swap them; the ADD/SUB here isn't a typo.
-    case 30:  // nmsub
-      if (packed)
-        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 31:  // nmadd
-      if (packed)
-        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    }
-  }
-  else if (inst.SUBOP5 == 30)  // nmsub
-  {
-    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
-    // separately.
-    MOVAPD(XMM1, Rb);
-    if (packed)
-    {
-      MULPD(XMM0, Ra);
-      SUBPD(XMM1, R(XMM0));
-    }
-    else
-    {
-      MULSD(XMM0, Ra);
-      SUBSD(XMM1, R(XMM0));
-    }
-  }
-  else
-  {
-    if (packed)
-    {
-      MULPD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)  // msub
-        SUBPD(XMM1, Rb);
-      else  //(n)madd(s[01])
-        ADDPD(XMM1, Rb);
-    }
-    else
-    {
-      MULSD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)
-        SUBSD(XMM1, Rb);
-      else
-        ADDSD(XMM1, Rb);
-    }
-    if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
-  }
+  RCOpArg Ra;
+  RCOpArg Rb;
+  RCOpArg Rc;
+  RCX64Reg Rd;
+  RCX64Reg scratch_guard;
+  if (software_fma)
+  {
+    scratch_guard = fpr.Scratch(XMM2);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
+  }
+  else
+  {
+    // For cpu_info.bFMA == true:
+    // Statistics suggests b is a lot less likely to be unbound in practice, so
+    // if we have to pick one of a or b to bind, let's make it b.
+    Ra = fpr.Use(a, RCMode::Read);
+    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd);
+  }
+
+  X64Reg result_reg = XMM0;
+  if (software_fma)
+  {
+    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
+    {
+      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      {
+        if (round_input)
+          Force25BitPrecision(XMM1, Rc, XMM2);
+        else
+          MOVSD(XMM1, Rc);
+      }
+      else
+      {
+        MOVHLPS(XMM1, Rc.GetSimpleReg());
+        if (round_input)
+          Force25BitPrecision(XMM1, R(XMM1), XMM2);
+      }
+
+      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      if (packed && i == 0)
+        MOVLHPS(Rd, XMM0);
+
+      if (i == 0)
+      {
+        MOVSD(XMM0, Ra);
+        MOVSD(XMM2, Rb);
+      }
+      else
+      {
+        MOVHLPS(XMM0, Ra.GetSimpleReg());
+        MOVHLPS(XMM2, Rb.GetSimpleReg());
+      }
+
+      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+        XORPS(XMM2, MConst(psSignBits));
+
+      BitSet32 registers_in_use = CallerSavedRegistersInUse();
+      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
+      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
+      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
+    }
+
+    if (packed)
+    {
+      MOVSD(Rd, XMM0);
+      result_reg = Rd;
+    }
+
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
+      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
+  }
+  else
+  {
+    switch (inst.SUBOP5)
+    {
+    case 14:  // madds0
+      MOVDDUP(XMM0, Rc);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    case 15:  // madds1
+      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    default:
+      if (single && round_input)
+        Force25BitPrecision(XMM0, Rc, XMM1);
+      else
+        MOVAPD(XMM0, Rc);
+      break;
+    }
+
+    if (cpu_info.bFMA)
+    {
+      switch (inst.SUBOP5)
+      {
+      case 28:  // msub
+        if (packed)
+          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 14:  // madds0
+      case 15:  // madds1
+      case 29:  // madd
+        if (packed)
+          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      // PowerPC and x86 define NMADD/NMSUB differently
+      // x86: D = -A*C (+/-) B
+      // PPC: D = -(A*C (+/-) B)
+      // so we have to swap them; the ADD/SUB here isn't a typo.
+      case 30:  // nmsub
+        if (packed)
+          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 31:  // nmadd
+        if (packed)
+          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      }
+    }
+    else
+    {
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
+      // do the multiplication and addition/subtraction in two separate operations for performance.
+
+      if (inst.SUBOP5 == 30)  // nmsub
+      {
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
+        // so handle it separately.
+        MOVAPD(XMM1, Rb);
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          SUBPD(XMM1, R(XMM0));
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          SUBSD(XMM1, R(XMM0));
+        }
+        result_reg = XMM1;
+      }
+      else
+      {
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)  // msub
+            SUBPD(XMM0, Rb);
+          else  //(n)madd(s[01])
+            ADDPD(XMM0, Rb);
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)
+            SUBSD(XMM0, Rb);
+          else
+            ADDSD(XMM0, Rb);
+        }
+
+        if (inst.SUBOP5 == 31)  // nmadd
+          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
+      }
+    }
+  }
 
   if (single)
   {
-    HandleNaNs(inst, Rd, XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
   }
   else
   {
-    HandleNaNs(inst, XMM1, XMM1);
-    MOVSD(Rd, R(XMM1));
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
+    MOVSD(Rd, R(result_reg));
   }
   SetFPRFIfNeeded(Rd);
 }
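
A note on the software FMA path above, per lane: the emitted code flips b's
sign bit for msub/nmsub, makes one correctly rounded std::fma call through the
ABI, and finally flips the result's sign for nmsub/nmadd. Roughly, in C++ (a
reference sketch, not code from the commit; the helper name is made up, and the
Force25BitPrecision input rounding and NaN handling are omitted):

#include <cmath>
#include <cstdio>

// subop5: 28 = msub, 29 = madd, 30 = nmsub, 31 = nmadd. madds0/madds1 (14/15)
// behave like madd once the right element of c has been selected.
double madd_lane(int subop5, double a, double b, double c)
{
  // msub/nmsub subtract b; the JIT does this by XORing b's sign bit (XORPS).
  if (subop5 == 28 || subop5 == 30)
    b = -b;

  // One correctly rounded fused operation, identical on every host CPU.
  double result = std::fma(a, c, b);

  // nmsub/nmadd negate the final result, matching PPC's D = -(A*C +/- B);
  // the JIT does this with a final sign-bit XOR (XORPD).
  if (subop5 == 30 || subop5 == 31)
    result = -result;

  return result;
}

int main()
{
  std::printf("%g\n", madd_lane(31, 2.0, 4.0, 3.0));  // nmadd: -(2*3 + 4) = -10
}

Since std::fma is correctly rounded on every conforming implementation, this
gives bit-identical results whether or not the host CPU has hardware FMA.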

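One detail in the call sequence above that is easy to miss: the static_cast in
ABI_CallFunction is needed because std::fma is an overload set (float, double,
long double), so taking its address is ambiguous until one signature is
selected. A minimal standalone illustration:

#include <cmath>

// Select the double overload; a bare &std::fma would be ambiguous here.
using Fma64 = double (*)(double, double, double);

int main()
{
  const Fma64 fma64 = static_cast<Fma64>(&std::fma);
  return fma64(2.0, 3.0, -6.0) == 0.0 ? 0 : 1;  // 2*3 + (-6) == 0
}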

@@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
    else
    {
      (this->*sseOp)(XMM0, arg2, imm);
-      MOVAPD(regOp, R(XMM0));
+      if (regOp != XMM0)
+        MOVAPD(regOp, R(XMM0));
    }
  }
  else
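
The guard above appears to exist because the new non-FMA madds1 path in
fmaddXX now calls avx_op with XMM0 as the destination register; without the
check, the SSE fallback would emit a useless MOVAPD from XMM0 to itself.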