diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index fa5d9d6d72..ead87a64df 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.

 #include
+#include <cmath>
+#include <limits>
 #include

 #include "Common/Assert.h"
@@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   JITDISABLE(bJITFloatingPointOff);
   FALLBACK_IF(inst.Rc);

+  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
+  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
+
   int a = inst.FA;
   int b = inst.FB;
   int c = inst.FC;
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
-                                   js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
+  bool packed =
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
+                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);

-  // While we don't know if any games are actually affected (replays seem to work with all the usual
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
-  // be extra careful and don't use FMA, even if in theory it might be okay.
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
-  // instances on different computers giving identical results.
-  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
-
-  // For use_fma == true:
-  // Statistics suggests b is a lot less likely to be unbound in practice, so
-  // if we have to pick one of a or b to bind, let's make it b.
-  RCOpArg Ra = fpr.Use(a, RCMode::Read);
-  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-  RCOpArg Rc = fpr.Use(c, RCMode::Read);
-  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-  RegCache::Realize(Ra, Rb, Rc, Rd);
-
-  switch (inst.SUBOP5)
+  RCOpArg Ra;
+  RCOpArg Rb;
+  RCOpArg Rc;
+  RCX64Reg Rd;
+  RCX64Reg scratch_guard;
+  if (software_fma)
   {
-  case 14:
-    MOVDDUP(XMM1, Rc);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  case 15:
-    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  default:
-    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
-    X64Reg tmp1 = special ? XMM0 : XMM1;
-    X64Reg tmp2 = special ? XMM1 : XMM0;
-    if (single && round_input)
-      Force25BitPrecision(tmp1, Rc, tmp2);
-    else
-      MOVAPD(tmp1, Rc);
-    break;
-  }
-
-  if (use_fma)
-  {
-    switch (inst.SUBOP5)
-    {
-    case 28:  // msub
-      if (packed)
-        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 14:  // madds0
-    case 15:  // madds1
-    case 29:  // madd
-      if (packed)
-        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    // PowerPC and x86 define NMADD/NMSUB differently
-    // x86: D = -A*C (+/-) B
-    // PPC: D = -(A*C (+/-) B)
-    // so we have to swap them; the ADD/SUB here isn't a typo.
-    case 30:  // nmsub
-      if (packed)
-        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 31:  // nmadd
-      if (packed)
-        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    }
-  }
-  else if (inst.SUBOP5 == 30)  // nmsub
-  {
-    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
-    // separately.
-    MOVAPD(XMM1, Rb);
-    if (packed)
-    {
-      MULPD(XMM0, Ra);
-      SUBPD(XMM1, R(XMM0));
-    }
-    else
-    {
-      MULSD(XMM0, Ra);
-      SUBSD(XMM1, R(XMM0));
-    }
+    scratch_guard = fpr.Scratch(XMM2);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
   }
   else
   {
+    // For cpu_info.bFMA == true:
+    // Statistics suggests b is a lot less likely to be unbound in practice, so
+    // if we have to pick one of a or b to bind, let's make it b.
+    Ra = fpr.Use(a, RCMode::Read);
+    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd);
+  }
+
+  X64Reg result_reg = XMM0;
+  if (software_fma)
+  {
+    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
+    {
+      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      {
+        if (round_input)
+          Force25BitPrecision(XMM1, Rc, XMM2);
+        else
+          MOVSD(XMM1, Rc);
+      }
+      else
+      {
+        MOVHLPS(XMM1, Rc.GetSimpleReg());
+        if (round_input)
+          Force25BitPrecision(XMM1, R(XMM1), XMM2);
+      }
+
+      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      if (packed && i == 0)
+        MOVLHPS(Rd, XMM0);
+
+      if (i == 0)
+      {
+        MOVSD(XMM0, Ra);
+        MOVSD(XMM2, Rb);
+      }
+      else
+      {
+        MOVHLPS(XMM0, Ra.GetSimpleReg());
+        MOVHLPS(XMM2, Rb.GetSimpleReg());
+      }
+
+      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+        XORPS(XMM2, MConst(psSignBits));
+
+      BitSet32 registers_in_use = CallerSavedRegistersInUse();
+      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
+      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
+      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
+    }
+
     if (packed)
     {
-      MULPD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)  // msub
-        SUBPD(XMM1, Rb);
-      else  //(n)madd(s[01])
-        ADDPD(XMM1, Rb);
+      MOVSD(Rd, XMM0);
+      result_reg = Rd;
+    }
+
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
+      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
+  }
+  else
+  {
+    switch (inst.SUBOP5)
+    {
+    case 14:  // madds0
+      MOVDDUP(XMM0, Rc);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    case 15:  // madds1
+      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    default:
+      if (single && round_input)
+        Force25BitPrecision(XMM0, Rc, XMM1);
+      else
+        MOVAPD(XMM0, Rc);
+      break;
+    }
+
+    if (cpu_info.bFMA)
+    {
+      switch (inst.SUBOP5)
+      {
+      case 28:  // msub
+        if (packed)
+          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 14:  // madds0
+      case 15:  // madds1
+      case 29:  // madd
+        if (packed)
+          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      // PowerPC and x86 define NMADD/NMSUB differently
+      // x86: D = -A*C (+/-) B
+      // PPC: D = -(A*C (+/-) B)
+      // so we have to swap them; the ADD/SUB here isn't a typo.
+      case 30:  // nmsub
+        if (packed)
+          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 31:  // nmadd
+        if (packed)
+          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      }
     }
     else
     {
-      MULSD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)
-        SUBSD(XMM1, Rb);
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
+      // do the multiplication and addition/subtraction in two separate operations for performance.
+
+      if (inst.SUBOP5 == 30)  // nmsub
+      {
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
+        // so handle it separately.
+        MOVAPD(XMM1, Rb);
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          SUBPD(XMM1, R(XMM0));
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          SUBSD(XMM1, R(XMM0));
+        }
+        result_reg = XMM1;
+      }
       else
-        ADDSD(XMM1, Rb);
+      {
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)  // msub
+            SUBPD(XMM0, Rb);
+          else  //(n)madd(s[01])
+            ADDPD(XMM0, Rb);
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)
+            SUBSD(XMM0, Rb);
+          else
+            ADDSD(XMM0, Rb);
+        }
+        if (inst.SUBOP5 == 31)  // nmadd
+          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
+      }
     }
-    if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
   }

   if (single)
   {
-    HandleNaNs(inst, Rd, XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
   }
   else
   {
-    HandleNaNs(inst, XMM1, XMM1);
-    MOVSD(Rd, R(XMM1));
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
+    MOVSD(Rd, R(result_reg));
   }
   SetFPRFIfNeeded(Rd);
 }
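Note on the software-FMA path above: in the x86-64 calling convention the three double arguments are passed in XMM0, XMM1 and XMM2, so after the MOVSD/MOVHLPS shuffles the emitted call computes std::fma(a, c, b). msub and nmsub first flip the sign of b (the XORPS on XMM2), and nmsub/nmadd flip the sign of the result (the final XORPD), matching the PPC definitions msub = a*c - b and nmsub = -(a*c - b). Below is a minimal standalone sketch of the per-lane arithmetic, and of why one fused rounding can differ from a separate multiply and add; GekkoMadd is an invented name for illustration, not Dolphin code.

#include <cmath>
#include <cstdio>

// Hypothetical per-lane reference for the software-FMA path above.
static double GekkoMadd(int subop5, double a, double c, double b)
{
  // msub (28) and nmsub (30) negate the addend, like the XORPS on XMM2.
  const bool negate_addend = subop5 == 28 || subop5 == 30;
  // nmsub (30) and nmadd (31) negate the result, like the final XORPD.
  const bool negate_result = subop5 == 30 || subop5 == 31;

  const double result = std::fma(a, c, negate_addend ? -b : b);
  return negate_result ? -result : result;
}

int main()
{
  // (1 + 2^-52) * (1 - 2^-52) = 1 - 2^-104 exactly, which doesn't fit in a
  // double, so the fused and unfused forms round differently once b = -1
  // cancels the leading 1.
  const double a = 1.0 + 0x1p-52;
  const double c = 1.0 - 0x1p-52;
  const double b = -1.0;
  std::printf("fused:   %a\n", GekkoMadd(29, a, c, b));  // madd: -0x1p-104
  std::printf("unfused: %a\n", a * c + b);               // 0x0p+0
}

The two printed values differ, which is exactly the divergence between FMA-capable and FMA-less CPUs that this patch removes from deterministic mode.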
diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
index 1de9547b89..dbd2cd3497 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
     else
     {
       (this->*sseOp)(XMM0, arg2, imm);
-      MOVAPD(regOp, R(XMM0));
+      if (regOp != XMM0)
+        MOVAPD(regOp, R(XMM0));
     }
   }
   else
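The EmuCodeBlock.cpp change is a small follow-up to the fmaddXX rewrite: the non-FMA path now builds its result in XMM0 (for example, the madds1 case calls avx_op with regOp == XMM0), and avx_op's SSE fallback would previously have emitted a useless movapd xmm0, xmm0 in that situation. A standalone sketch of the pattern, with invented names rather than the real EmuCodeBlock interface:

#include <cstdio>

enum Reg { XMM0, XMM1 };

// Stands in for emitting a MOVAPD instruction into the code buffer.
static void EmitMovapd(Reg dst, Reg src)
{
  std::printf("movapd xmm%d, xmm%d\n", dst, src);
}

// The SSE fallback produces its result in the fixed scratch register XMM0;
// copy it to the destination only when that is actually a different register.
static void CopyResultFromScratch(Reg regOp)
{
  if (regOp != XMM0)
    EmitMovapd(regOp, XMM0);
}

int main()
{
  CopyResultFromScratch(XMM1);  // emits: movapd xmm1, xmm0
  CopyResultFromScratch(XMM0);  // emits nothing; formerly movapd xmm0, xmm0
}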