Rationalize temporary register usage.

Rather than using a variety of registers — including RSI, ABI_PARAM1
(RCX on Windows, RDI on Unix), RCX, and RDX — the rules are:

- RDI and RSI are never used.  This allows them to be allocated on Unix,
bringing parity with Windows.

- RDX is a permanent temporary register along with RAX (and is thus not
FlushLocked).  It's used frequently enough that allocating it would
probably be a bad idea, as it would constantly get flushed.

- RCX is allocatable, but is flushed in two situations:
    - Non-immediate shifts (rlwnm), because x86 requires the variable
    shift count to be in CL (the low byte of RCX).
    - Paired single loads and stores, because they require three
    temporary registers: the helper functions take two integer
    arguments, and another register is used as an index to compute the
    helper function's address.
These cases should be relatively rare.

While we're at it, in stores, use the registers directly where possible
rather than always using temporaries (by making SafeWriteRegToReg
clobber less).  The address doesn't need to be clobbered in the usual
case, and on CPUs with MOVBE, neither does the value.

Oh, and get rid of a useless MEMCHECK.

This commit does not actually add new registers to the allocation order;
it is intended to test for any performance or correctness issues
separately.
This commit is contained in:
comex 2014-09-02 18:54:46 -04:00
parent 67cdb6e07a
commit 8dea26762d
13 changed files with 179 additions and 172 deletions

View file

@ -107,10 +107,9 @@ public:
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg);
// Reads a given bit of a given CR register part. Clobbers ABI_PARAM1,
// don't forget to xlock it before.
// Reads a given bit of a given CR register part.
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
// Clobbers ABI_PARAM1, xlock it before.
// Clobbers RDX.
void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
// Generates a branch that will check if a given bit of a CR register part

View file

@ -9,13 +9,12 @@
using namespace Gen;
//GLOBAL STATIC ALLOCATIONS x86
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//GLOBAL STATIC ALLOCATIONS x64
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//RBX - Base pointer of memory
//R15 - Pointer to array of block pointers
// GLOBAL STATIC ALLOCATIONS x64
// RAX - ubiquitous scratch register - EVERYBODY scratches this
// RDX - second scratch register
// RBX - Base pointer of memory
// R15 - Pointer to array of block pointers
// RBP - Pointer to ppcState+0x80
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
@ -73,8 +72,8 @@ void Jit64AsmRoutineManager::Generate()
no_mem = J_CC(CC_NZ);
}
AND(32, R(EAX), Imm32(JIT_ICACHE_MASK));
MOV(64, R(RSI), Imm64((u64)jit->GetBlockCache()->iCache));
MOV(32, R(EAX), MComplex(RSI, EAX, SCALE_1, 0));
MOV(64, R(RDX), Imm64((u64)jit->GetBlockCache()->iCache));
MOV(32, R(EAX), MComplex(RDX, EAX, SCALE_1, 0));
if (Core::g_CoreStartupParameter.bWii || Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack)
{
@ -86,8 +85,8 @@ void Jit64AsmRoutineManager::Generate()
TEST(32, R(EAX), Imm32(JIT_ICACHE_VMEM_BIT));
FixupBranch no_vmem = J_CC(CC_Z);
AND(32, R(EAX), Imm32(JIT_ICACHE_MASK));
MOV(64, R(RSI), Imm64((u64)jit->GetBlockCache()->iCacheVMEM));
MOV(32, R(EAX), MComplex(RSI, EAX, SCALE_1, 0));
MOV(64, R(RDX), Imm64((u64)jit->GetBlockCache()->iCacheVMEM));
MOV(32, R(EAX), MComplex(RDX, EAX, SCALE_1, 0));
if (Core::g_CoreStartupParameter.bWii) exit_vmem = J();
SetJumpTarget(no_vmem);
@ -97,8 +96,8 @@ void Jit64AsmRoutineManager::Generate()
TEST(32, R(EAX), Imm32(JIT_ICACHE_EXRAM_BIT));
FixupBranch no_exram = J_CC(CC_Z);
AND(32, R(EAX), Imm32(JIT_ICACHEEX_MASK));
MOV(64, R(RSI), Imm64((u64)jit->GetBlockCache()->iCacheEx));
MOV(32, R(EAX), MComplex(RSI, EAX, SCALE_1, 0));
MOV(64, R(RDX), Imm64((u64)jit->GetBlockCache()->iCacheEx));
MOV(32, R(EAX), MComplex(RDX, EAX, SCALE_1, 0));
SetJumpTarget(no_exram);
}

View file

@ -442,8 +442,8 @@ void Jit64::cmpXX(UGeckoInstruction inst)
if (!comparand.IsImm())
{
MOVSX(64, 32, ABI_PARAM1, comparand);
comparand = R(ABI_PARAM1);
MOVSX(64, 32, RDX, comparand);
comparand = R(RDX);
}
}
else
@ -454,11 +454,11 @@ void Jit64::cmpXX(UGeckoInstruction inst)
MOVZX(64, 32, RAX, gpr.R(a));
if (comparand.IsImm())
MOV(32, R(ABI_PARAM1), comparand);
MOV(32, R(RDX), comparand);
else
MOVZX(64, 32, ABI_PARAM1, comparand);
MOVZX(64, 32, RDX, comparand);
comparand = R(ABI_PARAM1);
comparand = R(RDX);
}
SUB(64, R(RAX), comparand);
MOV(64, PPCSTATE(cr_val[crf]), R(RAX));
@ -1170,7 +1170,6 @@ void Jit64::mulhwXx(UGeckoInstruction inst)
}
else
{
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
if (gpr.RX(d) == EDX)
@ -1288,7 +1287,6 @@ void Jit64::divwux(UGeckoInstruction inst)
}
else
{
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
MOV(32, R(EAX), gpr.R(a));
@ -1349,7 +1347,6 @@ void Jit64::divwx(UGeckoInstruction inst)
}
else
{
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
MOV(32, R(EAX), gpr.R(a));
@ -1881,8 +1878,8 @@ void Jit64::srawx(UGeckoInstruction inst)
int a = inst.RA;
int b = inst.RB;
int s = inst.RS;
gpr.Lock(a, s, b);
gpr.FlushLockX(ECX);
gpr.Lock(a, s, b);
gpr.BindToRegister(a, (a == s || a == b), true);
JitClearCA();
MOV(32, R(ECX), gpr.R(b));

View file

@ -197,14 +197,13 @@ void Jit64::lXXx(UGeckoInstruction inst)
else
{
// In this case we need an extra temporary register.
gpr.FlushLockX(ABI_PARAM1);
opAddress = R(ABI_PARAM1);
opAddress = R(RDX);
storeAddress = true;
if (use_constant_offset)
{
if (gpr.R(a).IsSimpleReg() && offset != 0)
{
LEA(32, ABI_PARAM1, MDisp(gpr.RX(a), offset));
LEA(32, RDX, MDisp(gpr.RX(a), offset));
}
else
{
@ -215,7 +214,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
}
else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
LEA(32, RDX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
@ -232,7 +231,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
if (update && storeAddress)
{
// We need to save the (usually scratch) address register for the update.
registersInUse |= (1 << ABI_PARAM1);
registersInUse |= (1 << RDX);
}
SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend);
@ -339,8 +338,7 @@ void Jit64::stX(UGeckoInstruction inst)
// Helps external systems know which instruction triggered the write
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(s));
MOV(32, R(EDX), gpr.R(s));
if (update)
gpr.SetImmediate32(a, addr);
@ -396,24 +394,31 @@ void Jit64::stX(UGeckoInstruction inst)
}
}
gpr.FlushLockX(ECX, EDX);
gpr.Lock(s, a);
MOV(32, R(EDX), gpr.R(a));
MOV(32, R(ECX), gpr.R(s));
SafeWriteRegToReg(ECX, EDX, accessSize, offset, CallerSavedRegistersInUse());
gpr.Lock(a, s);
gpr.BindToRegister(a, true, false);
X64Reg reg_value;
if (WriteClobbersRegValue(accessSize, /* swap */ true))
{
MOV(32, R(EDX), gpr.R(s));
reg_value = EDX;
}
else
{
gpr.BindToRegister(s, true, false);
reg_value = gpr.RX(s);
}
SafeWriteRegToReg(reg_value, gpr.RX(a), accessSize, offset, CallerSavedRegistersInUse(), SAFE_LOADSTORE_CLOBBER_EAX_INSTEAD_OF_ADDR);
if (update && offset)
{
gpr.KillImmediate(a, true, true);
MEMCHECK_START
gpr.KillImmediate(a, true, true);
ADD(32, gpr.R(a), Imm32((u32)offset));
MEMCHECK_END
}
gpr.UnlockAll();
gpr.UnlockAllX();
}
else
{
@ -430,15 +435,12 @@ void Jit64::stXx(UGeckoInstruction inst)
FALLBACK_IF(!a || a == s || a == b);
gpr.Lock(a, b, s);
gpr.FlushLockX(ECX, EDX);
if (inst.SUBOP10 & 32)
{
MEMCHECK_START
gpr.BindToRegister(a, true, true);
ADD(32, gpr.R(a), gpr.R(b));
MOV(32, R(EDX), gpr.R(a));
MEMCHECK_END
}
else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
@ -468,8 +470,18 @@ void Jit64::stXx(UGeckoInstruction inst)
break;
}
MOV(32, R(ECX), gpr.R(s));
SafeWriteRegToReg(ECX, EDX, accessSize, 0, CallerSavedRegistersInUse());
X64Reg reg_value;
if (WriteClobbersRegValue(accessSize, /* swap */ true))
{
MOV(32, R(EAX), gpr.R(s));
reg_value = EAX;
}
else
{
gpr.BindToRegister(s, true, false);
reg_value = gpr.RX(s);
}
SafeWriteRegToReg(reg_value, EDX, accessSize, 0, CallerSavedRegistersInUse());
gpr.UnlockAll();
gpr.UnlockAllX();
@ -482,13 +494,12 @@ void Jit64::lmw(UGeckoInstruction inst)
JITDISABLE(bJITLoadStoreOff);
// TODO: This doesn't handle rollback on DSI correctly
gpr.FlushLockX(ECX);
MOV(32, R(ECX), Imm32((u32)(s32)inst.SIMM_16));
MOV(32, R(EDX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(ECX), gpr.R(inst.RA));
ADD(32, R(EDX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
SafeLoadToReg(EAX, R(ECX), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << ECX), false);
SafeLoadToReg(EAX, R(EDX), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << ECX), false);
gpr.BindToRegister(i, false, true);
MOV(32, gpr.R(i), R(EAX));
}
@ -501,15 +512,14 @@ void Jit64::stmw(UGeckoInstruction inst)
JITDISABLE(bJITLoadStoreOff);
// TODO: This doesn't handle rollback on DSI correctly
gpr.FlushLockX(ECX);
for (int i = inst.RD; i < 32; i++)
{
if (inst.RA)
MOV(32, R(EAX), gpr.R(inst.RA));
else
XOR(32, R(EAX), R(EAX));
MOV(32, R(ECX), gpr.R(i));
SafeWriteRegToReg(ECX, EAX, 32, (i - inst.RD) * 4 + (u32)(s32)inst.SIMM_16, CallerSavedRegistersInUse());
MOV(32, R(EDX), gpr.R(i));
SafeWriteRegToReg(EDX, EAX, 32, (i - inst.RD) * 4 + (u32)(s32)inst.SIMM_16, CallerSavedRegistersInUse());
}
gpr.UnlockAllX();
}

View file

@ -96,24 +96,23 @@ void Jit64::stfXXX(UGeckoInstruction inst)
FALLBACK_IF(!indexed && !a);
s32 offset = 0;
gpr.FlushLockX(ABI_PARAM1);
if (indexed)
{
if (update)
{
gpr.BindToRegister(a, true, true);
ADD(32, gpr.R(a), gpr.R(b));
MOV(32, R(ABI_PARAM1), gpr.R(a));
MOV(32, R(RDX), gpr.R(a));
}
else
{
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
LEA(32, RDX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
else
{
MOV(32, R(ABI_PARAM1), gpr.R(b));
MOV(32, R(RDX), gpr.R(b));
if (a)
ADD(32, R(ABI_PARAM1), gpr.R(a));
ADD(32, R(RDX), gpr.R(a));
}
}
}
@ -128,14 +127,14 @@ void Jit64::stfXXX(UGeckoInstruction inst)
{
offset = (s32)(s16)inst.SIMM_16;
}
MOV(32, R(ABI_PARAM1), gpr.R(a));
MOV(32, R(RDX), gpr.R(a));
}
if (single)
{
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse());
SafeWriteF32ToReg(XMM0, RDX, offset, CallerSavedRegistersInUse());
fpr.UnlockAll();
}
else
@ -144,7 +143,7 @@ void Jit64::stfXXX(UGeckoInstruction inst)
MOVQ_xmm(R(RAX), fpr.RX(s));
else
MOV(64, R(RAX), fpr.R(s));
SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse());
SafeWriteRegToReg(RAX, RDX, 64, offset, CallerSavedRegistersInUse());
}
gpr.UnlockAll();
gpr.UnlockAllX();
@ -160,15 +159,14 @@ void Jit64::stfiwx(UGeckoInstruction inst)
int a = inst.RA;
int b = inst.RB;
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(b));
MOV(32, R(RDX), gpr.R(b));
if (a)
ADD(32, R(ABI_PARAM1), gpr.R(a));
ADD(32, R(RDX), gpr.R(a));
if (fpr.R(s).IsSimpleReg())
MOVD_xmm(R(EAX), fpr.RX(s));
else
MOV(32, R(EAX), fpr.R(s));
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, CallerSavedRegistersInUse());
SafeWriteRegToReg(EAX, RDX, 32, 0, CallerSavedRegistersInUse());
gpr.UnlockAllX();
}

View file

@ -28,8 +28,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
int a = inst.RA;
int s = inst.RS; // Fp numbers
gpr.FlushLockX(EAX, EDX);
gpr.FlushLockX(ECX);
gpr.FlushLockX(EAX, ECX);
if (update)
gpr.BindToRegister(inst.RA, true, true);
fpr.BindToRegister(inst.RS, true, false);
@ -73,8 +72,7 @@ void Jit64::psq_l(UGeckoInstruction inst)
bool update = inst.OPCD == 57;
int offset = inst.SIMM_12;
gpr.FlushLockX(EAX, EDX);
gpr.FlushLockX(ECX);
gpr.FlushLockX(EAX, ECX);
gpr.BindToRegister(inst.RA, true, update && offset);
fpr.BindToRegister(inst.RS, false, true);
if (offset)

View file

@ -42,40 +42,40 @@ void Jit64::GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate)
void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in)
{
MOV(64, R(ABI_PARAM1), PPCSTATE(cr_val[field]));
MOV(64, R(RDX), PPCSTATE(cr_val[field]));
MOVZX(32, 8, in, R(in));
switch (bit)
{
case CR_SO_BIT: // set bit 61 to input
BTR(64, R(ABI_PARAM1), Imm8(61));
BTR(64, R(RDX), Imm8(61));
SHL(64, R(in), Imm8(61));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
case CR_EQ_BIT: // clear low 32 bits, set bit 0 to !input
SHR(64, R(ABI_PARAM1), Imm8(32));
SHL(64, R(ABI_PARAM1), Imm8(32));
SHR(64, R(RDX), Imm8(32));
SHL(64, R(RDX), Imm8(32));
XOR(32, R(in), Imm8(1));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
case CR_GT_BIT: // set bit 63 to !input
BTR(64, R(ABI_PARAM1), Imm8(63));
BTR(64, R(RDX), Imm8(63));
NOT(32, R(in));
SHL(64, R(in), Imm8(63));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
case CR_LT_BIT: // set bit 62 to input
BTR(64, R(ABI_PARAM1), Imm8(62));
BTR(64, R(RDX), Imm8(62));
SHL(64, R(in), Imm8(62));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
}
BTS(64, R(ABI_PARAM1), Imm8(32));
MOV(64, PPCSTATE(cr_val[field]), R(ABI_PARAM1));
BTS(64, R(RDX), Imm8(32));
MOV(64, PPCSTATE(cr_val[field]), R(RDX));
}
FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
@ -308,8 +308,7 @@ void Jit64::mfcr(UGeckoInstruction inst)
gpr.BindToRegister(d, false, true);
XOR(32, gpr.R(d), gpr.R(d));
gpr.FlushLockX(ABI_PARAM1);
X64Reg cr_val = ABI_PARAM1;
X64Reg cr_val = RDX;
// we only need to zero the high bits of EAX once
XOR(32, R(EAX), R(EAX));
for (int i = 0; i < 8; i++)
@ -439,9 +438,8 @@ void Jit64::crXXX(UGeckoInstruction inst)
// crnand or crnor
bool negateB = inst.SUBOP10 == 225 || inst.SUBOP10 == 33;
gpr.FlushLockX(ABI_PARAM1);
GetCRFieldBit(inst.CRBA >> 2, 3 - (inst.CRBA & 3), ABI_PARAM1, negateA);
GetCRFieldBit(inst.CRBB >> 2, 3 - (inst.CRBB & 3), EAX, negateB);
GetCRFieldBit(inst.CRBA >> 2, 3 - (inst.CRBA & 3), DL, negateA);
GetCRFieldBit(inst.CRBB >> 2, 3 - (inst.CRBB & 3), AL, negateB);
// Compute combined bit
switch (inst.SUBOP10)
@ -449,23 +447,23 @@ void Jit64::crXXX(UGeckoInstruction inst)
case 33: // crnor: ~(A || B) == (~A && ~B)
case 129: // crandc
case 257: // crand
AND(8, R(EAX), R(ABI_PARAM1));
AND(8, R(AL), R(DL));
break;
case 193: // crxor
case 289: // creqv
XOR(8, R(EAX), R(ABI_PARAM1));
XOR(8, R(AL), R(DL));
break;
case 225: // crnand: ~(A && B) == (~A || ~B)
case 417: // crorc
case 449: // cror
OR(8, R(EAX), R(ABI_PARAM1));
OR(8, R(AL), R(DL));
break;
}
// Store result bit in CRBD
SetCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3), EAX);
SetCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3), AL);
gpr.UnlockAllX();
}

View file

@ -157,7 +157,9 @@ static void fregSpill(RegInfo& RI, X64Reg reg)
RI.fregs[reg] = nullptr;
}
// ECX is scratch, so we don't allocate it
// RAX and RDX are scratch, so we don't allocate them
// (TODO: if we could lock RCX here too then we could allocate it - needed for
// shifts)
// 64-bit - calling conventions differ between linux & windows, so...
#ifdef _WIN32
@ -602,9 +604,9 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size)
{
auto info = regBuildMemAddress(RI, I, getOp2(I), 2, Size, nullptr);
if (info.first.IsImm())
RI.Jit->MOV(32, R(ECX), info.first);
RI.Jit->MOV(32, R(EDX), info.first);
else
RI.Jit->LEA(32, ECX, MDisp(info.first.GetSimpleReg(), info.second));
RI.Jit->LEA(32, EDX, MDisp(info.first.GetSimpleReg(), info.second));
regSpill(RI, EAX);
@ -617,7 +619,7 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size)
RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
}
RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
RI.Jit->SafeWriteRegToReg(EAX, EDX, Size, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
}
@ -675,9 +677,9 @@ static void regEmitCmp(RegInfo& RI, InstLoc I)
static void regEmitICmpInst(RegInfo& RI, InstLoc I, CCFlags flag)
{
regEmitCmp(RI, I);
RI.Jit->SETcc(flag, R(ECX)); // Caution: SETCC uses 8-bit regs!
RI.Jit->SETcc(flag, R(EDX)); // Caution: SETCC uses 8-bit regs!
X64Reg reg = regBinReg(RI, I);
RI.Jit->MOVZX(32, 8, reg, R(ECX));
RI.Jit->MOVZX(32, 8, reg, R(EDX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
}
@ -1111,11 +1113,11 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
}
case StoreFPRF:
{
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->AND(32, R(ECX), Imm8(0x1F));
Jit->SHL(32, R(ECX), Imm8(12));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
Jit->AND(32, R(EDX), Imm8(0x1F));
Jit->SHL(32, R(EDX), Imm8(12));
Jit->AND(32, PPCSTATE(fpscr), Imm32(~(0x1F << 12)));
Jit->OR(32, PPCSTATE(fpscr), R(ECX));
Jit->OR(32, PPCSTATE(fpscr), R(EDX));
regNormalRegClear(RI, I);
break;
}
@ -1155,8 +1157,8 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
break;
X64Reg reg = regUReg(RI, I);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->MOVSX(32, 8, reg, R(ECX));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
Jit->MOVSX(32, 8, reg, R(EDX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1178,9 +1180,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
break;
X64Reg reg = regUReg(RI, I);
Jit->MOV(32, R(ECX), Imm32(63));
Jit->MOV(32, R(EDX), Imm32(63));
Jit->BSR(32, reg, regLocForInst(RI, getOp1(I)));
Jit->CMOVcc(32, reg, R(ECX), CC_Z);
Jit->CMOVcc(32, reg, R(EDX), CC_Z);
Jit->XOR(32, R(reg), Imm8(31));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
@ -1422,30 +1424,30 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->XOR(32, R(EAX), R(EAX));
// SO: Bit 61 set.
Jit->MOV(64, R(RCX), R(cr_val));
Jit->SHR(64, R(RCX), Imm8(61));
Jit->AND(32, R(ECX), Imm8(1));
Jit->OR(32, R(EAX), R(ECX));
Jit->MOV(64, R(RDX), R(cr_val));
Jit->SHR(64, R(RDX), Imm8(61));
Jit->AND(32, R(EDX), Imm8(1));
Jit->OR(32, R(EAX), R(EDX));
// EQ: Bits 31-0 == 0.
Jit->XOR(32, R(ECX), R(ECX));
Jit->XOR(32, R(EDX), R(EDX));
Jit->TEST(32, R(cr_val), R(cr_val));
Jit->SETcc(CC_Z, R(ECX));
Jit->SHL(32, R(ECX), Imm8(1));
Jit->OR(32, R(EAX), R(ECX));
Jit->SETcc(CC_Z, R(EDX));
Jit->SHL(32, R(EDX), Imm8(1));
Jit->OR(32, R(EAX), R(EDX));
// GT: Value > 0.
Jit->XOR(32, R(ECX), R(ECX));
Jit->XOR(32, R(EDX), R(EDX));
Jit->TEST(64, R(cr_val), R(cr_val));
Jit->SETcc(CC_G, R(ECX));
Jit->SHL(32, R(ECX), Imm8(2));
Jit->OR(32, R(EAX), R(ECX));
Jit->SETcc(CC_G, R(EDX));
Jit->SHL(32, R(EDX), Imm8(2));
Jit->OR(32, R(EAX), R(EDX));
// LT: Bit 62 set.
Jit->MOV(64, R(ECX), R(cr_val));
Jit->SHR(64, R(ECX), Imm8(62 - 3));
Jit->AND(32, R(ECX), Imm8(0x8));
Jit->OR(32, R(EAX), R(ECX));
Jit->MOV(64, R(EDX), R(cr_val));
Jit->SHR(64, R(EDX), Imm8(62 - 3));
Jit->AND(32, R(EDX), Imm8(0x8));
Jit->OR(32, R(EAX), R(EDX));
Jit->MOV(32, R(cr_val), R(EAX));
RI.regs[cr_val] = I;
@ -1460,34 +1462,34 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
X64Reg cr_val = regUReg(RI, I);
Jit->MOV(64, R(cr_val), regLocForInst(RI, getOp1(I)));
Jit->MOV(64, R(RCX), Imm64(1ull << 32));
Jit->MOV(64, R(RDX), Imm64(1ull << 32));
// SO
Jit->MOV(64, R(RAX), R(cr_val));
Jit->SHL(64, R(RAX), Imm8(63));
Jit->SHR(64, R(RAX), Imm8(63 - 61));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
// EQ
Jit->MOV(64, R(RAX), R(cr_val));
Jit->NOT(64, R(RAX));
Jit->AND(64, R(RAX), Imm8(CR_EQ));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
// GT
Jit->MOV(64, R(RAX), R(cr_val));
Jit->NOT(64, R(RAX));
Jit->AND(64, R(RAX), Imm8(CR_GT));
Jit->SHL(64, R(RAX), Imm8(63 - 2));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
// LT
Jit->MOV(64, R(RAX), R(cr_val));
Jit->AND(64, R(RAX), Imm8(CR_LT));
Jit->SHL(64, R(RAX), Imm8(62 - 3));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
Jit->MOV(64, R(cr_val), R(RCX));
Jit->MOV(64, R(cr_val), R(RDX));
RI.regs[cr_val] = I;
regNormalRegClear(RI, I);
@ -1553,9 +1555,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->SafeLoadToReg(ECX, R(ECX), 32, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVD_xmm(reg, R(ECX));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
RI.Jit->SafeLoadToReg(EDX, R(EDX), 32, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVD_xmm(reg, R(EDX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1567,9 +1569,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
X64Reg reg = fregFindFreeReg(RI);
const OpArg loc = regLocForInst(RI, getOp1(I));
Jit->MOV(32, R(ECX), loc);
RI.Jit->SafeLoadToReg(RCX, R(ECX), 64, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVQ_xmm(reg, R(RCX));
Jit->MOV(32, R(EDX), loc);
RI.Jit->SafeLoadToReg(RDX, R(EDX), 64, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVQ_xmm(reg, R(RDX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1591,11 +1593,10 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
// 0b0011111100000111, or 0x3F07.
Jit->MOV(32, R(EAX), Imm32(0x3F07));
Jit->AND(32, R(EAX), M(((char *)&GQR(quantreg)) + 2));
Jit->MOVZX(32, 8, EDX, R(AL));
Jit->OR(32, R(EDX), Imm8(w << 3));
Jit->OR(32, R(EAX), Imm8(w << 3));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->CALLptr(MScaled(EDX, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
Jit->CALLptr(MScaled(EAX, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
Jit->MOVAPD(reg, R(XMM0));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
@ -1610,8 +1611,8 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
else
Jit->MOV(32, R(EAX), loc1);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, EDX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
@ -1626,8 +1627,8 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
OpArg address = regLocForInst(RI, getOp2(I));
Jit->MOVAPD(XMM0, value);
Jit->MOVQ_xmm(R(RAX), XMM0);
Jit->MOV(32, R(ECX), address);
RI.Jit->SafeWriteRegToReg(RAX, ECX, 64, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOV(32, R(EDX), address);
RI.Jit->SafeWriteRegToReg(RAX, EDX, 64, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
@ -1644,7 +1645,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->AND(32, R(EAX), PPCSTATE(spr[SPR_GQR0 + quantreg]));
Jit->MOVZX(32, 8, EDX, R(AL));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp2(I)));
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->CALLptr(MScaled(EDX, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
if (RI.IInfo[I - RI.FirstI] & 4)
@ -1790,9 +1791,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
X64Reg reg = fregFindFreeReg(RI);
unsigned ppcreg = *I >> 8;
char *p = (char*)&(PowerPC::ppcState.ps[ppcreg][0]);
Jit->MOV(32, R(ECX), M(p+4));
Jit->AND(32, R(ECX), Imm32(0x7ff00000));
Jit->CMP(32, R(ECX), Imm32(0x38000000));
Jit->MOV(32, R(EDX), M(p+4));
Jit->AND(32, R(EDX), Imm32(0x7ff00000));
Jit->CMP(32, R(EDX), Imm32(0x38000000));
FixupBranch ok = Jit->J_CC(CC_AE);
Jit->AND(32, M(p+4), Imm32(0x80000000));
Jit->MOV(32, M(p), Imm32(0));
@ -2204,10 +2205,10 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
const u32 mask = 0x87C0FFFF;
// MSR = (MSR & ~mask) | (SRR1 & mask);
Jit->MOV(32, R(EAX), PPCSTATE(msr));
Jit->MOV(32, R(ECX), PPCSTATE_SRR1);
Jit->MOV(32, R(EDX), PPCSTATE_SRR1);
Jit->AND(32, R(EAX), Imm32(~mask));
Jit->AND(32, R(ECX), Imm32(mask));
Jit->OR(32, R(EAX), R(ECX));
Jit->AND(32, R(EDX), Imm32(mask));
Jit->OR(32, R(EAX), R(EDX));
// MSR &= 0xFFFBFFFF; // Mask used to clear the bit MSR[13]
Jit->AND(32, R(EAX), Imm32(0xFFFBFFFF));
Jit->MOV(32, PPCSTATE(msr), R(EAX));

View file

@ -9,7 +9,7 @@
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLER_SAVED & ~((1 << RAX) | (1 << RCX) | (1 << RDX) | \
#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLER_SAVED & ~((1 << RAX) | (1 << RCX) | \
(1 << (XMM0+16)) | (1 << (XMM1+16))))
using namespace Gen;
@ -18,19 +18,15 @@ static int temp32;
void CommonAsmRoutines::GenFifoWrite(int size)
{
// Assume value in ABI_PARAM1
// Assume value in EDX
PUSH(ESI);
if (size != 32)
PUSH(EDX);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
SwapAndStore(size, MComplex(RAX, RSI, 1, 0), EDX);
ADD(32, R(ESI), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
if (size != 32)
POP(EDX);
POP(ESI);
RET();
}
@ -39,7 +35,6 @@ void CommonAsmRoutines::GenFifoFloatWrite()
{
// Assume value in XMM0
PUSH(ESI);
PUSH(EDX);
MOVSS(M(&temp32), XMM0);
MOV(32, R(EDX), M(&temp32));
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
@ -47,7 +42,6 @@ void CommonAsmRoutines::GenFifoFloatWrite()
SwapAndStore(32, MComplex(RAX, RSI, 1, 0), EDX);
ADD(32, R(ESI), Imm8(4));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(EDX);
POP(ESI);
RET();
}

View file

@ -59,6 +59,7 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
// It ought to be necessary to align the stack here. Since it seems to not
// affect anybody, I'm not going to add it just to be completely safe about
// performance.
ABI_PushRegistersAndAdjustStack(registersInUse, true);
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
@ -66,7 +67,6 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
if (info.displacement)
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
ABI_PushRegistersAndAdjustStack(registersInUse, true);
switch (info.operandSize)
{
case 4:
@ -115,6 +115,8 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
MOV(32, PPCSTATE(pc), Imm32(pc));
ABI_PushRegistersAndAdjustStack(registersInUse, true);
MOVTwo(64, ABI_PARAM1, dataReg, ABI_PARAM2, addrReg, ABI_PARAM3);
if (info.displacement)
@ -122,7 +124,6 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
ABI_PushRegistersAndAdjustStack(registersInUse, true);
switch (info.operandSize)
{
case 8:

View file

@ -5,7 +5,6 @@
#include <emmintrin.h>
#include "Common/Common.h"
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Core/HW/MMIO.h"
@ -248,13 +247,11 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
}
}
// Always clobbers EAX. Preserves the address.
// Preserves the value if the load fails and js.memcheck is enabled.
void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
{
if (!jit->js.memcheck)
{
registersInUse &= ~(1 << RAX | 1 << reg_value);
registersInUse &= ~(1 << reg_value);
}
if (!Core::g_CoreStartupParameter.bMMU &&
Core::g_CoreStartupParameter.bFastmem &&
@ -395,11 +392,6 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
u8 *EmuCodeBlock::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset, bool swap)
{
if (accessSize == 8 && reg_value >= 4)
{
PanicAlert("WARNING: likely incorrect use of UnsafeWriteRegToReg!");
}
u8* result = GetWritableCodePtr();
OpArg dest = MComplex(RBX, reg_addr, SCALE_1, offset);
if (swap)
@ -410,7 +402,8 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acc
}
else
{
BSWAP(accessSize, reg_value);
if (accessSize > 8)
BSWAP(accessSize, reg_value);
result = GetWritableCodePtr();
MOV(accessSize, dest, R(reg_value));
}
@ -423,10 +416,8 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acc
return result;
}
// Destroys both arg registers
void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags)
{
registersInUse &= ~(1 << RAX);
if (!Core::g_CoreStartupParameter.bMMU &&
Core::g_CoreStartupParameter.bFastmem &&
!(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM))
@ -449,7 +440,17 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
}
if (offset)
ADD(32, R(reg_addr), Imm32((u32)offset));
{
if (flags & SAFE_LOADSTORE_CLOBBER_EAX_INSTEAD_OF_ADDR)
{
LEA(32, EAX, MDisp(reg_addr, (u32)offset));
reg_addr = EAX;
}
else
{
ADD(32, R(reg_addr), Imm32((u32)offset));
}
}
u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;

View file

@ -6,6 +6,7 @@
#include <unordered_map>
#include "Common/CPUDetect.h"
#include "Common/x64Emitter.h"
namespace MMIO { class Mapping; }
@ -52,11 +53,21 @@ public:
{
SAFE_LOADSTORE_NO_SWAP = 1,
SAFE_LOADSTORE_NO_PROLOG = 2,
SAFE_LOADSTORE_NO_FASTMEM = 4
SAFE_LOADSTORE_NO_FASTMEM = 4,
SAFE_LOADSTORE_CLOBBER_EAX_INSTEAD_OF_ADDR = 8
};
void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
// Clobbers EAX or reg_addr depending on the relevant flag. Preserves
// reg_value if the load fails and js.memcheck is enabled.
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
// applies to safe and unsafe WriteRegToReg
bool WriteClobbersRegValue(int accessSize, bool swap)
{
return swap && !cpu_info.bMOVBE && accessSize > 8;
}
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);

View file

@ -40,7 +40,7 @@ instruction and generates code. Dead code elimination works in this step,
by simply skipping unused instructions. The register allocator is a dumb,
greedy allocator: at the moment, it's really a bit too dumb, but it's
actually not as bad as it looks: unless a block is relatively long, spills
are rarely needed. ECX is used as a scratch register: requiring a scratch
are rarely needed. EDX is used as a scratch register: requiring a scratch
register isn't ideal, but the register allocator is too dumb to handle
instructions that need a specific register at the moment.