diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index fa5d9d6d72..ead87a64df 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.

 #include
+#include <cmath>
+#include <limits>
 #include

 #include "Common/Assert.h"
@@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   JITDISABLE(bJITFloatingPointOff);
   FALLBACK_IF(inst.Rc);

+  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
+  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
+
   int a = inst.FA;
   int b = inst.FB;
   int c = inst.FC;
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
-                                   js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
+  bool packed =
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
+                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);

-  // While we don't know if any games are actually affected (replays seem to work with all the usual
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
-  // be extra careful and don't use FMA, even if in theory it might be okay.
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
-  // instances on different computers giving identical results.
-  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
-
-  // For use_fma == true:
-  // Statistics suggests b is a lot less likely to be unbound in practice, so
-  // if we have to pick one of a or b to bind, let's make it b.
-  RCOpArg Ra = fpr.Use(a, RCMode::Read);
-  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-  RCOpArg Rc = fpr.Use(c, RCMode::Read);
-  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-  RegCache::Realize(Ra, Rb, Rc, Rd);
-
-  switch (inst.SUBOP5)
+  RCOpArg Ra;
+  RCOpArg Rb;
+  RCOpArg Rc;
+  RCX64Reg Rd;
+  RCX64Reg scratch_guard;
+  if (software_fma)
   {
-  case 14:
-    MOVDDUP(XMM1, Rc);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  case 15:
-    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  default:
-    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
-    X64Reg tmp1 = special ? XMM0 : XMM1;
-    X64Reg tmp2 = special ? XMM1 : XMM0;
-    if (single && round_input)
-      Force25BitPrecision(tmp1, Rc, tmp2);
-    else
-      MOVAPD(tmp1, Rc);
-    break;
-  }
-
-  if (use_fma)
-  {
-    switch (inst.SUBOP5)
-    {
-    case 28:  // msub
-      if (packed)
-        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 14:  // madds0
-    case 15:  // madds1
-    case 29:  // madd
-      if (packed)
-        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    // PowerPC and x86 define NMADD/NMSUB differently
-    // x86: D = -A*C (+/-) B
-    // PPC: D = -(A*C (+/-) B)
-    // so we have to swap them; the ADD/SUB here isn't a typo.
-    case 30:  // nmsub
-      if (packed)
-        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 31:  // nmadd
-      if (packed)
-        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    }
-  }
-  else if (inst.SUBOP5 == 30)  // nmsub
-  {
-    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
-    // separately.
-    MOVAPD(XMM1, Rb);
-    if (packed)
-    {
-      MULPD(XMM0, Ra);
-      SUBPD(XMM1, R(XMM0));
-    }
-    else
-    {
-      MULSD(XMM0, Ra);
-      SUBSD(XMM1, R(XMM0));
-    }
+    scratch_guard = fpr.Scratch(XMM2);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
   }
   else
   {
+    // For cpu_info.bFMA == true:
+    // Statistics suggests b is a lot less likely to be unbound in practice, so
+    // if we have to pick one of a or b to bind, let's make it b.
+    Ra = fpr.Use(a, RCMode::Read);
+    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd);
+  }
+
+  X64Reg result_reg = XMM0;
+  if (software_fma)
+  {
+    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
+    {
+      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      {
+        if (round_input)
+          Force25BitPrecision(XMM1, Rc, XMM2);
+        else
+          MOVSD(XMM1, Rc);
+      }
+      else
+      {
+        MOVHLPS(XMM1, Rc.GetSimpleReg());
+        if (round_input)
+          Force25BitPrecision(XMM1, R(XMM1), XMM2);
+      }
+
+      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      if (packed && i == 0)
+        MOVLHPS(Rd, XMM0);
+
+      if (i == 0)
+      {
+        MOVSD(XMM0, Ra);
+        MOVSD(XMM2, Rb);
+      }
+      else
+      {
+        MOVHLPS(XMM0, Ra.GetSimpleReg());
+        MOVHLPS(XMM2, Rb.GetSimpleReg());
+      }
+
+      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+        XORPS(XMM2, MConst(psSignBits));
+
+      BitSet32 registers_in_use = CallerSavedRegistersInUse();
+      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
+      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
+      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
+    }
+
     if (packed)
     {
-      MULPD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)  // msub
-        SUBPD(XMM1, Rb);
-      else  //(n)madd(s[01])
-        ADDPD(XMM1, Rb);
+      MOVSD(Rd, XMM0);
+      result_reg = Rd;
+    }
+
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
+      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
+  }
+  else
+  {
+    switch (inst.SUBOP5)
+    {
+    case 14:  // madds0
+      MOVDDUP(XMM0, Rc);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    case 15:  // madds1
+      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    default:
+      if (single && round_input)
+        Force25BitPrecision(XMM0, Rc, XMM1);
+      else
+        MOVAPD(XMM0, Rc);
+      break;
+    }
+
+    if (cpu_info.bFMA)
+    {
+      switch (inst.SUBOP5)
+      {
+      case 28:  // msub
+        if (packed)
+          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 14:  // madds0
+      case 15:  // madds1
+      case 29:  // madd
+        if (packed)
+          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      // PowerPC and x86 define NMADD/NMSUB differently
+      // x86: D = -A*C (+/-) B
+      // PPC: D = -(A*C (+/-) B)
+      // so we have to swap them; the ADD/SUB here isn't a typo.
+      case 30:  // nmsub
+        if (packed)
+          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 31:  // nmadd
+        if (packed)
+          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      }
     }
     else
     {
-      MULSD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)
-        SUBSD(XMM1, Rb);
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
+      // do the multiplication and addition/subtraction in two separate operations for performance.
+
+      if (inst.SUBOP5 == 30)  // nmsub
+      {
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
+        // so handle it separately.
+        MOVAPD(XMM1, Rb);
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          SUBPD(XMM1, R(XMM0));
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          SUBSD(XMM1, R(XMM0));
+        }
+        result_reg = XMM1;
+      }
       else
-        ADDSD(XMM1, Rb);
+      {
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)  // msub
+            SUBPD(XMM0, Rb);
+          else  //(n)madd(s[01])
+            ADDPD(XMM0, Rb);
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)
+            SUBSD(XMM0, Rb);
+          else
+            ADDSD(XMM0, Rb);
+        }
+        if (inst.SUBOP5 == 31)  // nmadd
+          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
+      }
     }
-    if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
   }

   if (single)
   {
-    HandleNaNs(inst, Rd, XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
   }
   else
   {
-    HandleNaNs(inst, XMM1, XMM1);
-    MOVSD(Rd, R(XMM1));
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
+    MOVSD(Rd, R(result_reg));
   }
   SetFPRFIfNeeded(Rd);
 }
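Note on the software-FMA path above: in the x86-64 calling convention the three double arguments are passed in XMM0, XMM1 and XMM2, so after the MOVSD/MOVHLPS shuffles the emitted call computes std::fma(a, c, b). msub and nmsub first flip the sign of b (the XORPS on XMM2), and nmsub/nmadd flip the sign of the result (the final XORPD), matching the PPC definitions msub = a*c - b and nmsub = -(a*c - b). Below is a minimal standalone sketch of the per-lane arithmetic, and of why one fused rounding can differ from a separate multiply and add; GekkoMadd is an invented name for illustration, not Dolphin code.

#include <cmath>
#include <cstdio>

// Hypothetical per-lane reference for the software-FMA path above.
static double GekkoMadd(int subop5, double a, double c, double b)
{
  // msub (28) and nmsub (30) negate the addend, like the XORPS on XMM2.
  const bool negate_addend = subop5 == 28 || subop5 == 30;
  // nmsub (30) and nmadd (31) negate the result, like the final XORPD.
  const bool negate_result = subop5 == 30 || subop5 == 31;

  const double result = std::fma(a, c, negate_addend ? -b : b);
  return negate_result ? -result : result;
}

int main()
{
  // (1 + 2^-52) * (1 - 2^-52) = 1 - 2^-104 exactly, which doesn't fit in a
  // double, so the fused and unfused forms round differently once b = -1
  // cancels the leading 1.
  const double a = 1.0 + 0x1p-52;
  const double c = 1.0 - 0x1p-52;
  const double b = -1.0;
  std::printf("fused:   %a\n", GekkoMadd(29, a, c, b));  // madd: -0x1p-104
  std::printf("unfused: %a\n", a * c + b);               // 0x0p+0
}

The two printed values differ, which is exactly the divergence between FMA-capable and FMA-less CPUs that this patch removes from deterministic mode.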
diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
index 1de9547b89..dbd2cd3497 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
     else
     {
       (this->*sseOp)(XMM0, arg2, imm);
-      MOVAPD(regOp, R(XMM0));
+      if (regOp != XMM0)
+        MOVAPD(regOp, R(XMM0));
     }
   }
   else
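The EmuCodeBlock.cpp change is a small follow-up to the fmaddXX rewrite: the non-FMA path now builds its result in XMM0 (for example, the madds1 case calls avx_op with regOp == XMM0), and avx_op's SSE fallback would previously have emitted a useless movapd xmm0, xmm0 in that situation. A standalone sketch of the pattern, with invented names rather than the real EmuCodeBlock interface:

#include <cstdio>

enum Reg { XMM0, XMM1 };

// Stands in for emitting a MOVAPD instruction into the code buffer.
static void EmitMovapd(Reg dst, Reg src)
{
  std::printf("movapd xmm%d, xmm%d\n", dst, src);
}

// The SSE fallback produces its result in the fixed scratch register XMM0;
// copy it to the destination only when that is actually a different register.
static void CopyResultFromScratch(Reg regOp)
{
  if (regOp != XMM0)
    EmitMovapd(regOp, XMM0);
}

int main()
{
  CopyResultFromScratch(XMM1);  // emits: movapd xmm1, xmm0
  CopyResultFromScratch(XMM0);  // emits nothing; formerly movapd xmm0, xmm0
}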