Jit64: Emulate FMA accurately when determinism is enabled

When determinism is enabled, we either want all CPUs to use FMA or
we want no CPUs to use FMA. Until now, Jit64 has been doing
the latter. However, this is inaccurate behavior; all CPUs since
Haswell support FMA, and getting JitArm64 to match the exact
inaccurate rounding used by Jit64 would be a bit annoying. This
commit switches us over to using FMA on all CPUs when determinism
is enabled, with older CPUs calling the std::fma function.
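
For context, an illustration that is not part of the commit: the nondeterminism
comes from double rounding. A fused multiply-add rounds a*c + b once, while a
separate multiply and add round twice, so the two strategies can differ in the
last bit. A minimal standalone C++ sketch (values chosen purely for
illustration):

#include <cfloat>
#include <cmath>
#include <cstdio>

int main()
{
  // (1 + eps)^2 = 1 + 2*eps + eps^2 exactly; rounding the product to a double
  // drops the eps^2 term, but a fused multiply-add keeps it.
  const double a = 1.0 + DBL_EPSILON;
  const double c = 1.0 + DBL_EPSILON;
  const double b = -(1.0 + 2.0 * DBL_EPSILON);

  const double unfused = a * c + b;        // two roundings: prints 0
  const double fused = std::fma(a, c, b);  // one rounding: prints ~4.9e-32
  std::printf("unfused = %g, fused = %g\n", unfused, fused);
}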
Author: JosJuice
Date:   2021-05-23 23:00:57 +02:00
Parent: 9bc5bd83a9
Commit: 2c38d6419e
2 changed files with 186 additions and 108 deletions


@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <cmath>
+#include <limits>
 #include <vector>
 
 #include "Common/Assert.h"
@@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   JITDISABLE(bJITFloatingPointOff);
   FALLBACK_IF(inst.Rc);
 
+  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
+  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
+
   int a = inst.FA;
   int b = inst.FB;
   int c = inst.FC;
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
-                                   js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
+  bool packed =
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
+                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
 
-  // While we don't know if any games are actually affected (replays seem to work with all the usual
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
-  // be extra careful and don't use FMA, even if in theory it might be okay.
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
-  // instances on different computers giving identical results.
-  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
-
-  // For use_fma == true:
-  // Statistics suggests b is a lot less likely to be unbound in practice, so
-  // if we have to pick one of a or b to bind, let's make it b.
-  RCOpArg Ra = fpr.Use(a, RCMode::Read);
-  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-  RCOpArg Rc = fpr.Use(c, RCMode::Read);
-  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-  RegCache::Realize(Ra, Rb, Rc, Rd);
-
-  switch (inst.SUBOP5)
-  {
-  case 14:
-    MOVDDUP(XMM1, Rc);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  case 15:
-    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
-    if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
-    break;
-  default:
-    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
-    X64Reg tmp1 = special ? XMM0 : XMM1;
-    X64Reg tmp2 = special ? XMM1 : XMM0;
-    if (single && round_input)
-      Force25BitPrecision(tmp1, Rc, tmp2);
-    else
-      MOVAPD(tmp1, Rc);
-    break;
-  }
-
-  if (use_fma)
-  {
-    switch (inst.SUBOP5)
-    {
-    case 28:  // msub
-      if (packed)
-        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 14:  // madds0
-    case 15:  // madds1
-    case 29:  // madd
-      if (packed)
-        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    // PowerPC and x86 define NMADD/NMSUB differently
-    // x86: D = -A*C (+/-) B
-    // PPC: D = -(A*C (+/-) B)
-    // so we have to swap them; the ADD/SUB here isn't a typo.
-    case 30:  // nmsub
-      if (packed)
-        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    case 31:  // nmadd
-      if (packed)
-        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
-      else
-        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
-      break;
-    }
-  }
-  else if (inst.SUBOP5 == 30)  // nmsub
-  {
-    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
-    // separately.
-    MOVAPD(XMM1, Rb);
-    if (packed)
-    {
-      MULPD(XMM0, Ra);
-      SUBPD(XMM1, R(XMM0));
-    }
-    else
-    {
-      MULSD(XMM0, Ra);
-      SUBSD(XMM1, R(XMM0));
-    }
-  }
-  else
-  {
-    if (packed)
-    {
-      MULPD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)  // msub
-        SUBPD(XMM1, Rb);
-      else  //(n)madd(s[01])
-        ADDPD(XMM1, Rb);
-    }
-    else
-    {
-      MULSD(XMM1, Ra);
-      if (inst.SUBOP5 == 28)
-        SUBSD(XMM1, Rb);
-      else
-        ADDSD(XMM1, Rb);
-    }
-    if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
-  }
+  RCOpArg Ra;
+  RCOpArg Rb;
+  RCOpArg Rc;
+  RCX64Reg Rd;
+  RCX64Reg scratch_guard;
+  if (software_fma)
+  {
+    scratch_guard = fpr.Scratch(XMM2);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
+  }
+  else
+  {
+    // For cpu_info.bFMA == true:
+    // Statistics suggests b is a lot less likely to be unbound in practice, so
+    // if we have to pick one of a or b to bind, let's make it b.
+    Ra = fpr.Use(a, RCMode::Read);
+    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd);
+  }
+
+  X64Reg result_reg = XMM0;
+  if (software_fma)
+  {
+    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
+    {
+      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      {
+        if (round_input)
+          Force25BitPrecision(XMM1, Rc, XMM2);
+        else
+          MOVSD(XMM1, Rc);
+      }
+      else
+      {
+        MOVHLPS(XMM1, Rc.GetSimpleReg());
+        if (round_input)
+          Force25BitPrecision(XMM1, R(XMM1), XMM2);
+      }
+
+      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      if (packed && i == 0)
+        MOVLHPS(Rd, XMM0);
+
+      if (i == 0)
+      {
+        MOVSD(XMM0, Ra);
+        MOVSD(XMM2, Rb);
+      }
+      else
+      {
+        MOVHLPS(XMM0, Ra.GetSimpleReg());
+        MOVHLPS(XMM2, Rb.GetSimpleReg());
+      }
+
+      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+        XORPS(XMM2, MConst(psSignBits));
+
+      BitSet32 registers_in_use = CallerSavedRegistersInUse();
+      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
+      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
+      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
+    }
+
+    if (packed)
+    {
+      MOVSD(Rd, XMM0);
+      result_reg = Rd;
+    }
+
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
+      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
+  }
+  else
+  {
+    switch (inst.SUBOP5)
+    {
+    case 14:  // madds0
+      MOVDDUP(XMM0, Rc);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    case 15:  // madds1
+      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
+      if (round_input)
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
+      break;
+    default:
+      if (single && round_input)
+        Force25BitPrecision(XMM0, Rc, XMM1);
+      else
+        MOVAPD(XMM0, Rc);
+      break;
+    }
+
+    if (cpu_info.bFMA)
+    {
+      switch (inst.SUBOP5)
+      {
+      case 28:  // msub
+        if (packed)
+          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 14:  // madds0
+      case 15:  // madds1
+      case 29:  // madd
+        if (packed)
+          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      // PowerPC and x86 define NMADD/NMSUB differently
+      // x86: D = -A*C (+/-) B
+      // PPC: D = -(A*C (+/-) B)
+      // so we have to swap them; the ADD/SUB here isn't a typo.
+      case 30:  // nmsub
+        if (packed)
+          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      case 31:  // nmadd
+        if (packed)
+          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
+        else
+          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
+        break;
+      }
+    }
+    else
+    {
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
+      // do the multiplication and addition/subtraction in two separate operations for performance.
+
+      if (inst.SUBOP5 == 30)  // nmsub
+      {
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
+        // so handle it separately.
+        MOVAPD(XMM1, Rb);
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          SUBPD(XMM1, R(XMM0));
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          SUBSD(XMM1, R(XMM0));
+        }
+        result_reg = XMM1;
+      }
+      else
+      {
+        if (packed)
+        {
+          MULPD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)  // msub
+            SUBPD(XMM0, Rb);
+          else  //(n)madd(s[01])
+            ADDPD(XMM0, Rb);
+        }
+        else
+        {
+          MULSD(XMM0, Ra);
+          if (inst.SUBOP5 == 28)
+            SUBSD(XMM0, Rb);
+          else
+            ADDSD(XMM0, Rb);
+        }
+
+        if (inst.SUBOP5 == 31)  // nmadd
+          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
+      }
+    }
+  }
 
   if (single)
   {
-    HandleNaNs(inst, Rd, XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
   }
   else
   {
-    HandleNaNs(inst, XMM1, XMM1);
-    MOVSD(Rd, R(XMM1));
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
+    MOVSD(Rd, R(result_reg));
   }
   SetFPRFIfNeeded(Rd);
 }
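
A note on the software FMA path above, per lane: the emitted code flips b's
sign bit for msub/nmsub, makes one correctly rounded std::fma call through the
ABI, and finally flips the result's sign for nmsub/nmadd. Roughly, in C++ (a
reference sketch, not code from the commit; the helper name is made up, and the
Force25BitPrecision input rounding and NaN handling are omitted):

#include <cmath>
#include <cstdio>

// subop5: 28 = msub, 29 = madd, 30 = nmsub, 31 = nmadd. madds0/madds1 (14/15)
// behave like madd once the right element of c has been selected.
double madd_lane(int subop5, double a, double b, double c)
{
  // msub/nmsub subtract b; the JIT does this by XORing b's sign bit (XORPS).
  if (subop5 == 28 || subop5 == 30)
    b = -b;

  // One correctly rounded fused operation, identical on every host CPU.
  double result = std::fma(a, c, b);

  // nmsub/nmadd negate the final result, matching PPC's D = -(A*C +/- B);
  // the JIT does this with a final sign-bit XOR (XORPD).
  if (subop5 == 30 || subop5 == 31)
    result = -result;

  return result;
}

int main()
{
  std::printf("%g\n", madd_lane(31, 2.0, 4.0, 3.0));  // nmadd: -(2*3 + 4) = -10
}

Since std::fma is correctly rounded on every conforming implementation, this
gives bit-identical results whether or not the host CPU has hardware FMA.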

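One detail in the call sequence above that is easy to miss: the static_cast in
ABI_CallFunction is needed because std::fma is an overload set (float, double,
long double), so taking its address is ambiguous until one signature is
selected. A minimal standalone illustration:

#include <cmath>

// Select the double overload; a bare &std::fma would be ambiguous here.
using Fma64 = double (*)(double, double, double);

int main()
{
  const Fma64 fma64 = static_cast<Fma64>(&std::fma);
  return fma64(2.0, 3.0, -6.0) == 0.0 ? 0 : 1;  // 2*3 + (-6) == 0
}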

@@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
    else
    {
      (this->*sseOp)(XMM0, arg2, imm);
-      MOVAPD(regOp, R(XMM0));
+      if (regOp != XMM0)
+        MOVAPD(regOp, R(XMM0));
    }
  }
  else
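
The guard above appears to exist because the new non-FMA madds1 path in
fmaddXX now calls avx_op with XMM0 as the destination register; without the
check, the SSE fallback would emit a useless MOVAPD from XMM0 to itself.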