Jit64: optionally accurate NaNs

When AccurateNaNs is enabled, the JIT checks floating-point results for NaN. If one of the inputs was a NaN, the input that PowerPC's operand precedence selects is used as the result; otherwise x86's generated -QNaN is replaced with the +QNaN that PowerPC generates.
2024-09-21 11:51:48 +02:00 · 2015-06-07 14:38:09 +02:00 · 2015-06-07 14:38:09 +02:00 · aec38466d9
commit aec38466d9
parent 881f6db2ab
7 changed files with 183 additions and 42 deletions
--- a/Source/Core/Core/BootManager.cpp
+++ b/Source/Core/Core/BootManager.cpp
@ -47,7 +47,7 @@ namespace BootManager
 // Apply fire liberally
 struct ConfigCache
 {
-	bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread,
+	bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bAccurateNaNs, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread,
 	     bSyncGPU, bFastDiscSpeed, bDSPHLE, bHLE_BS2, bProgressive;
 	int iCPUCore, Volume;
 	int iWiimoteSource[MAX_BBMOTES];
@ -106,6 +106,7 @@ bool BootCore(const std::string& _rFilename)
 		config_cache.bSyncGPUOnSkipIdleHack = StartUp.bSyncGPUOnSkipIdleHack;
 		config_cache.iCPUCore = StartUp.iCPUCore;
 		config_cache.bFPRF = StartUp.bFPRF;
+		config_cache.bAccurateNaNs = StartUp.bAccurateNaNs;
 		config_cache.bMMU = StartUp.bMMU;
 		config_cache.bDCBZOFF = StartUp.bDCBZOFF;
 		config_cache.bSyncGPU = StartUp.bSyncGPU;
@ -146,6 +147,7 @@ bool BootCore(const std::string& _rFilename)
 		core_section->Get("SkipIdle",         &StartUp.bSkipIdle, StartUp.bSkipIdle);
 		core_section->Get("SyncOnSkipIdle",   &StartUp.bSyncGPUOnSkipIdleHack, StartUp.bSyncGPUOnSkipIdleHack);
 		core_section->Get("FPRF",             &StartUp.bFPRF, StartUp.bFPRF);
+		core_section->Get("AccurateNaNs",     &StartUp.bAccurateNaNs, StartUp.bAccurateNaNs);
 		core_section->Get("MMU",              &StartUp.bMMU, StartUp.bMMU);
 		core_section->Get("DCBZ",             &StartUp.bDCBZOFF, StartUp.bDCBZOFF);
 		core_section->Get("SyncGPU",          &StartUp.bSyncGPU, StartUp.bSyncGPU);
@ -273,6 +275,7 @@ void Stop()
 		StartUp.bSyncGPUOnSkipIdleHack = config_cache.bSyncGPUOnSkipIdleHack;
 		StartUp.iCPUCore = config_cache.iCPUCore;
 		StartUp.bFPRF = config_cache.bFPRF;
+		StartUp.bAccurateNaNs = config_cache.bAccurateNaNs;
 		StartUp.bMMU = config_cache.bMMU;
 		StartUp.bDCBZOFF = config_cache.bDCBZOFF;
 		StartUp.bSyncGPU = config_cache.bSyncGPU;
--- a/Source/Core/Core/CoreParameter.cpp
+++ b/Source/Core/Core/CoreParameter.cpp
@ -33,7 +33,7 @@ SCoreStartupParameter::SCoreStartupParameter()
  bJITPairedOff(false), bJITSystemRegistersOff(false),
  bJITBranchOff(false),
  bJITILTimeProfiling(false), bJITILOutputIR(false),
-  bFPRF(false),
+  bFPRF(false), bAccurateNaNs(false),
  bCPUThread(true), bDSPThread(false), bDSPHLE(true),
  bSkipIdle(true), bSyncGPUOnSkipIdleHack(true), bNTSC(false), bForceNTSCJ(false),
  bHLE_BS2(true), bEnableCheats(false),
@ -78,6 +78,7 @@ void SCoreStartupParameter::LoadDefaults()
 	bDSPHLE = true;
 	bFastmem = true;
 	bFPRF = false;
+	bAccurateNaNs = false;
 	bMMU = false;
 	bDCBZOFF = false;
 	iBBDumpPort = -1;
--- a/Source/Core/Core/CoreParameter.h
+++ b/Source/Core/Core/CoreParameter.h
@ -163,6 +163,7 @@ struct SCoreStartupParameter

 	bool bFastmem;
 	bool bFPRF;
+	bool bAccurateNaNs;

 	bool bCPUThread;
 	bool bDSPThread;
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -135,13 +135,18 @@ public:
 	Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
 	void SetFPRFIfNeeded(Gen::X64Reg xmm);

+	void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in);
+
 	void MultiplyImmediate(u32 imm, int a, int d, bool overflow);

 	typedef u32 (*Operation)(u32 a, u32 b);
-	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
-		          bool Rc = false, bool carry = false);
-	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
-	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), bool packed = false, bool roundRHS = false);
+	void regimmop(int d, int a, bool binary, u32 value, Operation doop,
+	              void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
+	              bool Rc = false, bool carry = false);
+	Gen::X64Reg fp_tri_op(int d, int a, int b, bool reversible, bool single,
+	                      void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
+	                      void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&),
+	                      bool packed, bool preserve_inputs, bool roundRHS = false);
 	void FloatCompare(UGeckoInstruction inst, bool upper = false);

 	// OPCODES
--- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h
+++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h
@ -138,6 +138,20 @@ public:
 		LockX(args...);
 	}

+	template<typename T> // Single-register form: clears the locked flag of xregs[x].
+	void UnlockX(T x)
+	{
+		if (!xregs[x].locked) // Unlocking an entry that is not locked is reported rather than ignored.
+			PanicAlert("RegCache: x %i already unlocked!", x);
+		xregs[x].locked = false; // Cleared unconditionally, i.e. also after the alert above.
+	}
+	template<typename T, typename... Args> // Variadic form: unlocks every listed register, first to last.
+	void UnlockX(T first, Args... args)
+	{
+		UnlockX(first);
+		UnlockX(args...); // Recurses on the remaining arguments until the single-argument overload is reached.
+	}
+
 	void UnlockAll();
 	void UnlockAllX();

--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -10,38 +10,37 @@

 using namespace Gen;

-static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
-static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
-static const u64 GC_ALIGNED16(psAbsMask[2])  = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
-static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
+static const u64 GC_ALIGNED16(psSignBits[2])      = {0x8000000000000000ULL, 0x0000000000000000ULL};
+static const u64 GC_ALIGNED16(psSignBits2[2])     = {0x8000000000000000ULL, 0x8000000000000000ULL};
+static const u64 GC_ALIGNED16(psAbsMask[2])       = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+static const u64 GC_ALIGNED16(psAbsMask2[2])      = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
+static const u64 GC_ALIGNED16(psGeneratedQNaN[2]) = {0x7FF8000000000000ULL, 0x7FF8000000000000ULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};

-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
-                      void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool roundRHS)
+X64Reg Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
+                        void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool preserve_inputs, bool roundRHS)
 { // New version: emits only the arithmetic and returns the register holding the raw result.
 	fpr.Lock(d, a, b); // Not released in the new version (UnlockAll is removed below); the caller has to unlock.
 	fpr.BindToRegister(d, d == a || d == b || !single);
+	X64Reg dest = preserve_inputs ? XMM1 : fpr.RX(d); // With preserve_inputs the operation targets XMM1 instead of d's register.
 	if (roundRHS)
 	{
-		if (d == a)
+		if (d == a && !preserve_inputs) // This path writes d's register in place, so it is skipped when inputs must be preserved.
 		{
 			Force25BitPrecision(XMM0, fpr.R(b), XMM1);
 			(this->*sseOp)(fpr.RX(d), R(XMM0));
 		}
 		else
 		{
-			Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
-			(this->*sseOp)(fpr.RX(d), fpr.R(a));
+			Force25BitPrecision(dest, fpr.R(b), XMM0); // XMM0 is passed as the third (scratch) argument here.
+			(this->*sseOp)(dest, fpr.R(a));
 		}
 	}
 	else
 	{
-		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
+		avx_op(avxOp, sseOp, dest, fpr.R(a), fpr.R(b), packed, reversible);
 	}
-	if (single)
-		ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
-	SetFPRFIfNeeded(fpr.RX(d));
-	fpr.UnlockAll();
+	return dest; // ForceSinglePrecision / SetFPRFIfNeeded / UnlockAll are no longer emitted here.
 }

 // We can avoid calculating FPRF if it's not needed; every float operation resets it, so
@ -56,6 +55,112 @@ void Jit64::SetFPRFIfNeeded(X64Reg xmm)
 		SetFPRF(xmm);
 }

+void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm) // Emits NaN fix-up for the result in xmm (xmm may be overwritten); the final value ends up in xmm_out.
+{
+	//                      | PowerPC  | x86
+	// ---------------------+----------+---------
+	// input NaN precedence | 1*3 + 2  | 1*2 + 3
+	// generated QNaN       | positive | negative
+	//
+	// Dragon Ball: Revenge of King Piccolo requires generated NaNs
+	// to be positive, so we'll have to handle them manually.

+	if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs) // Option off: only move the result into place, no NaN checks are emitted.
+	{
+		if (xmm_out != xmm)
+			MOVAPD(xmm_out, R(xmm));
+		return;
+	}

+	_assert_(xmm != XMM0); // XMM0 is overwritten as a mask/scratch by the paired paths below. NOTE(review): ps_sum (Jit_Paired.cpp) sets tmp = XMM0 on its SHUFPD path and passes it here - confirm.

+	std::vector<u32> inputs; // Distinct source register numbers, collected in FA, FB, FC order.
+	u32 a = inst.FA, b = inst.FB, c = inst.FC;
+	for (u32 i : {a, b, c})
+	{
+		if (!js.op->fregsIn[i]) // Skip numbers not flagged in fregsIn (presumably: not read by this instruction).
+			continue;
+		if (std::find(inputs.begin(), inputs.end(), i) == inputs.end()) // Keep each register only once.
+			inputs.push_back(i);
+	}
+	if (inst.OPCD != 4)
+	{
+		// not paired-single
+		UCOMISD(xmm, R(xmm)); // Self-compare: parity is set only when the low double is NaN.
+		FixupBranch handle_nan = J_CC(CC_P, true);
+		SwitchToFarCode();
+			SetJumpTarget(handle_nan);
+			std::vector<FixupBranch> fixups;
+			for (u32 x : inputs)
+			{
+				MOVDDUP(xmm, fpr.R(x)); // Try the inputs in order; the first one that is NaN is kept as the result.
+				UCOMISD(xmm, R(xmm));
+				fixups.push_back(J_CC(CC_P));
+			}
+			MOVDDUP(xmm, M(psGeneratedQNaN)); // No input was NaN: use the positive QNaN constant (0x7FF8000000000000).
+			for (FixupBranch fixup : fixups)
+				SetJumpTarget(fixup);
+			FixupBranch done = J(true);
+		SwitchToNearCode();
+		SetJumpTarget(done);
+	}
+	else
+	{
+		// paired-single. NOTE(review): both paths below copy an input's NaN lane without re-checking that the same result lane is NaN - confirm for callers that shuffle lanes (ps_sum, ps_muls).
+		std::reverse(inputs.begin(), inputs.end()); // Later blends overwrite earlier ones, so reversing makes the first input win.
+		if (cpu_info.bSSE4_1)
+		{
+			avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, R(xmm), R(xmm), CMP_UNORD); // XMM0 = per-lane mask of NaN lanes in the result.
+			PTEST(XMM0, R(XMM0));
+			FixupBranch handle_nan = J_CC(CC_NZ, true); // Taken when at least one lane is NaN.
+			SwitchToFarCode();
+				SetJumpTarget(handle_nan);
+				BLENDVPD(xmm, M(psGeneratedQNaN)); // NaN lanes start out as the generated QNaN (BLENDVPD takes its mask from XMM0).
+				for (u32 x : inputs)
+				{
+					avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, fpr.R(x), fpr.R(x), CMP_UNORD); // XMM0 = mask of NaN lanes in this input.
+					BLENDVPD(xmm, fpr.R(x)); // Those lanes take the input's value.
+				}
+				FixupBranch done = J(true);
+			SwitchToNearCode();
+			SetJumpTarget(done);
+		}
+		else
+		{
+			// SSE2 fallback
+			X64Reg tmp = fpr.GetFreeXReg(); // Extra scratch register; released by UnlockX(tmp) at the end of this branch.
+			fpr.FlushLockX(tmp);
+			MOVAPD(XMM0, R(xmm));
+			CMPPD(XMM0, R(XMM0), CMP_UNORD); // XMM0 = per-lane mask of NaN lanes in the result.
+			MOVMSKPD(RSCRATCH, R(XMM0));
+			TEST(32, R(RSCRATCH), R(RSCRATCH));
+			FixupBranch handle_nan = J_CC(CC_NZ, true); // Taken when at least one lane is NaN.
+			SwitchToFarCode();
+				SetJumpTarget(handle_nan);
+				MOVAPD(tmp, R(XMM0)); // Manual blend: xmm = (mask & generated QNaN) | (~mask & xmm).
+				PANDN(XMM0, R(xmm));
+				PAND(tmp, M(psGeneratedQNaN));
+				POR(tmp, R(XMM0));
+				MOVAPD(xmm, R(tmp));
+				for (u32 x : inputs)
+				{
+					MOVAPD(XMM0, fpr.R(x));
+					CMPPD(XMM0, R(XMM0), CMP_ORD); // XMM0 = mask of lanes where this input is not NaN.
+					MOVAPD(tmp, R(XMM0));
+					PANDN(XMM0, fpr.R(x)); // XMM0 = the input's NaN lanes only.
+					PAND(xmm, R(tmp)); // Keep xmm where the input is not NaN...
+					POR(xmm, R(XMM0)); // ...and take the input's value where it is NaN.
+				}
+				FixupBranch done = J(true);
+			SwitchToNearCode();
+			SetJumpTarget(done);
+			fpr.UnlockX(tmp);
+		}
+	}
+	if (xmm_out != xmm) // Copy the fixed-up value into the requested output register.
+		MOVAPD(xmm_out, R(xmm));
+}
+
 void Jit64::fp_arith(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@ -80,20 +185,27 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 		packed = false;

 	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+	bool preserve_inputs = SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs;

+	X64Reg dest = INVALID_REG;
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
-	                   packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break;
-	case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
-	                   packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break;
-	case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
-	                   packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break;
-	case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
-	                   packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break;
+	case 18: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
+	                          packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed, preserve_inputs); break;
+	case 20: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
+	                          packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed, preserve_inputs); break;
+	case 21: dest = fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
+	                          packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed, preserve_inputs); break;
+	case 25: dest = fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
+	                          packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, preserve_inputs, round_input); break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
+	HandleNaNs(inst, fpr.RX(d), dest);
+	if (single)
+		ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
+	SetFPRFIfNeeded(fpr.RX(d));
+	fpr.UnlockAll();
 }

 void Jit64::fmaddXX(UGeckoInstruction inst)
@ -220,13 +332,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 		if (inst.SUBOP5 == 31) //nmadd
 			PXOR(XMM1, M(packed ? psSignBits2 : psSignBits));
 	}
-
 	fpr.BindToRegister(d, !single);
-
 	if (single)
-		ForceSinglePrecision(fpr.RX(d), R(XMM1), packed, true);
+	{
+		HandleNaNs(inst, fpr.RX(d), XMM1);
+		ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
+	}
 	else
+	{
+		HandleNaNs(inst, XMM1, XMM1);
 		MOVSD(fpr.RX(d), R(XMM1));
+	}
 	SetFPRFIfNeeded(fpr.RX(d));
 	fpr.UnlockAll();
 }
@ -379,7 +495,6 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
 	}
 	else
 	{
-		// Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
 		UCOMISD(fpr.RX(b), fpr.R(a));
 	}

--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@ -38,7 +38,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
 	fpr.Lock(a, b, c, d);
 	OpArg op_a = fpr.R(a);
 	fpr.BindToRegister(d, d == b || d == c);
-	X64Reg tmp = XMM0;
+	X64Reg tmp = XMM1;
 	MOVDDUP(tmp, op_a);   // {a.ps0, a.ps0}
 	ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
 	switch (inst.SUBOP5)
@ -55,9 +55,9 @@ void Jit64::ps_sum(UGeckoInstruction inst)
 			}
 			else
 			{
-				MOVAPD(XMM1, fpr.R(c));
-				SHUFPD(XMM1, R(tmp), 2);
-				tmp = XMM1;
+				MOVAPD(XMM0, fpr.R(c));
+				SHUFPD(XMM0, R(tmp), 2);
+				tmp = XMM0;
 			}
 		}
 		else
@ -68,7 +68,8 @@ void Jit64::ps_sum(UGeckoInstruction inst)
 	default:
 		PanicAlert("ps_sum WTF!!!");
 	}
-	ForceSinglePrecision(fpr.RX(d), R(tmp));
+	HandleNaNs(inst, fpr.RX(d), tmp);
+	ForceSinglePrecision(fpr.RX(d), fpr.R(d));
 	SetFPRFIfNeeded(fpr.RX(d));
 	fpr.UnlockAll();
 }
@ -88,19 +89,20 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	switch (inst.SUBOP5)
 	{
 	case 12: // ps_muls0
-		MOVDDUP(XMM0, fpr.R(c));
+		MOVDDUP(XMM1, fpr.R(c));
 		break;
 	case 13: // ps_muls1
-		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
+		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, fpr.R(c), fpr.R(c), 3);
 		break;
 	default:
 		PanicAlert("ps_muls WTF!!!");
 	}
 	if (round_input)
-		Force25BitPrecision(XMM0, R(XMM0), XMM1);
-	MULPD(XMM0, fpr.R(a));
+		Force25BitPrecision(XMM1, R(XMM1), XMM0);
+	MULPD(XMM1, fpr.R(a));
 	fpr.BindToRegister(d, false);
-	ForceSinglePrecision(fpr.RX(d), R(XMM0));
+	HandleNaNs(inst, fpr.RX(d), XMM1);
+	ForceSinglePrecision(fpr.RX(d), fpr.R(d));
 	SetFPRFIfNeeded(fpr.RX(d));
 	fpr.UnlockAll();
 }