Somewhat faster CR flag storage. It doesn't make much of a difference on its own, but it opens up the possibility of efficiently merging cmp instructions with the conditional branches that follow them.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1549 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
hrydgard 2008-12-15 20:41:59 +00:00
parent 5c831a934b
commit a44c421d01
7 changed files with 68 additions and 38 deletions

View file

@ -326,19 +326,18 @@ void GenerateCommon()
{
// USES_CR
computeRc = AlignCode16();
AND(32, M(&PowerPC::ppcState.cr), Imm32(0x0FFFFFFF));
CMP(32, R(EAX), Imm8(0));
FixupBranch pLesser = J_CC(CC_L);
FixupBranch pGreater = J_CC(CC_G);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x20000000)); // _x86Reg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
RET();
SetJumpTarget(pGreater);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x40000000)); // _x86Reg > 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
RET();
SetJumpTarget(pLesser);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000)); // _x86Reg < 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
RET();
fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();

View file

@ -125,7 +125,7 @@ namespace Jit64
if ((inst.BO & 16) == 0) // Test a CR bit
{
TEST(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000 >> inst.BI));
TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3)));
if (inst.BO & 8) // Conditional branch
branch = CC_NZ;
else

View file

@ -202,11 +202,9 @@ namespace Jit64
fpr.Lock(a,b);
if (a != b)
{
fpr.LoadToX64(a, true);
}
// USES_CR
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> shift)));
if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else
@ -214,19 +212,17 @@ namespace Jit64
FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A);
// _x86Reg == 0
MOV(32, R(EAX), Imm32(0x20000000));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1);
SetJumpTarget(continue2);
SHR(32, R(EAX), Imm8(shift));
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
fpr.UnlockAll();
}

View file

@ -174,23 +174,21 @@ namespace Jit64
}
gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4))));
CMP(32, gpr.R(a), comparand);
FixupBranch pLesser = J_CC(less_than);
FixupBranch pGreater = J_CC(greater_than);
MOV(32, R(EAX), Imm32(0x20000000 >> shift)); // _x86Reg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0
FixupBranch continue1 = J();
SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000 >> shift)); // _x86Reg > 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0
FixupBranch continue2 = J();
SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000 >> shift));// _x86Reg < 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0
SetJumpTarget(continue1);
SetJumpTarget(continue2);
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
// TODO: Add extra code at the end for the "taken" case. Jump to it from the matching branches.
// Since it's the last block, some liberties can be taken.
@ -221,23 +219,21 @@ namespace Jit64
}
gpr.Lock(a, b);
gpr.LoadToX64(a, true, false);
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4))));
CMP(32, gpr.R(a), comparand);
FixupBranch pLesser = J_CC(less_than);
FixupBranch pGreater = J_CC(greater_than);
// _x86Reg == 0
MOV(32, R(EAX), Imm32(0x20000000 >> shift));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0
FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000 >> shift));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0
FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000 >> shift));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0
SetJumpTarget(continue1);
SetJumpTarget(continue2);
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
gpr.UnlockAll();
}

View file

@ -163,20 +163,39 @@ namespace Jit64
// USES_CR
int d = inst.RD;
gpr.LoadToX64(d, false, true);
MOV(32, gpr.R(d), M(&PowerPC::ppcState.cr));
MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0]));
SHL(32, R(EAX), Imm8(4));
for (int i = 1; i < 7; i++) {
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i]));
SHL(32, R(EAX), Imm8(4));
}
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7]));
MOV(32, gpr.R(d), R(EAX));
}
void mtcrf(UGeckoInstruction inst)
{
//Default(inst);
//return;
// USES_CR
u32 mask = 0;
u32 crm = inst.CRM;
gpr.FlushLockX(ECX);
if (crm == 0xFF) {
gpr.FlushLockX(ECX);
MOV(32, R(EAX), gpr.R(inst.RS));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
for (int i = 0; i < 8; i++) {
MOV(32, R(ECX), R(EAX));
SHR(32, R(ECX), Imm8(28 - (i * 4)));
AND(32, R(ECX), Imm32(0xF));
MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX));
}
gpr.UnlockAllX();
} else {
//TODO: use lookup table? probably not worth it
Default(inst);
return;
// TODO: translate this to work in new CR model.
for (int i = 0; i < 8; i++) {
if (crm & (1 << i))
mask |= 0xF << (i*4);
@ -188,9 +207,6 @@ namespace Jit64
OR(32, R(EAX), R(ECX));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
}
gpr.UnlockAllX();
}
}
} // namespace

View file

@ -42,6 +42,22 @@ volatile CPUState state = CPU_STEPPING;
static CoreMode mode;
void CompactCR()
{
ppcState.cr = 0;
for (int i = 0; i < 8; i++) {
ppcState.cr |= ppcState.cr_fast[i] << (28 - i * 4);
}
}
// Split the packed 32-bit CR into the eight 4-bit cr_fast[] fields —
// the inverse of CompactCR (CR0 comes from the most significant nibble).
void ExpandCR()
{
	int shift = 28;
	for (int field = 0; field < 8; field++, shift -= 4)
		ppcState.cr_fast[field] = (ppcState.cr >> shift) & 0xF;
}
void DoState(PointerWrap &p)
{
p.Do(ppcState);

View file

@ -46,7 +46,9 @@ struct GC_ALIGNED64(PowerPCState)
u32 pc; // program counter
u32 npc;
u32 cr; // flags
u32 cr; // flags
u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 6, 1, 3, 5, 7 so that we can make Compact and Expand super fast?
u32 msr; // machine specific register
u32 fpscr; // floating point flags/status bits
@ -86,6 +88,9 @@ void Start();
void Pause();
void Stop();
void CompactCR();
void ExpandCR();
void OnIdle(u32 _uThreadAddr);
// Easy register access macros.
@ -127,23 +132,25 @@ void OnIdle(u32 _uThreadAddr);
// These are intended to stay fast, probably become faster, and are not likely to slow down much if at all.
// Write a 4-bit value into CR field cr_field (0 = CR0 ... 7 = CR7).
// With the split representation this is a single byte store; the packed
// ppcState.cr is only rebuilt on demand by CompactCR().
// (The stale packed-cr read-modify-write left over from the old
// representation has been removed — keeping it would clobber the
// packed cr with a value that is never kept in sync anymore.)
inline void SetCRField(int cr_field, int value) {
	PowerPC::ppcState.cr_fast[cr_field] = value;
}
// Read one 4-bit CR field (0 = CR0 ... 7 = CR7); result is in [0, 0xF].
// Removed the unreachable second return that read the packed cr — under
// the split representation cr_fast[] is the authoritative copy.
inline u32 GetCRField(int cr_field) {
	return PowerPC::ppcState.cr_fast[cr_field];
}
// Read a single CR bit (0 = CR0[LT] ... 31 = CR7[SO]).
// bit >> 2 selects the 4-bit field; 3 - (bit & 3) selects the bit inside
// it, since CR bit numbering puts bit 0 at the MSB of its field.
// Removed the unreachable second return that read the packed cr — under
// the split representation cr_fast[] is the authoritative copy.
inline u32 GetCRBit(int bit) {
	return (PowerPC::ppcState.cr_fast[bit >> 2] >> (3 - (bit & 3))) & 1;
}
// SetCR and GetCR may become fairly slow soon. Should be avoided if possible.
// Replace the whole CR at once. Slow path: stores the packed 32-bit value
// and then re-splits it into cr_fast[] via ExpandCR() so the two
// representations stay consistent.
inline void SetCR(u32 new_cr) {
PowerPC::ppcState.cr = new_cr;
PowerPC::ExpandCR();
}
// Read the whole CR at once. Slow path: rebuilds the packed 32-bit value
// from the authoritative cr_fast[] fields via CompactCR() before
// returning it.
inline u32 GetCR() {
PowerPC::CompactCR();
return PowerPC::ppcState.cr;
}