Somewhat faster CR flag storage. Doesn't really make that much of a difference on its own, but it opens up the possibility of efficiently merging cmp instructions with the conditional branches that follow them.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1549 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
hrydgard 2008-12-15 20:41:59 +00:00
parent 5c831a934b
commit a44c421d01
7 changed files with 68 additions and 38 deletions

View file

@ -326,17 +326,16 @@ void GenerateCommon()
{ {
// USES_CR // USES_CR
computeRc = AlignCode16(); computeRc = AlignCode16();
AND(32, M(&PowerPC::ppcState.cr), Imm32(0x0FFFFFFF));
CMP(32, R(EAX), Imm8(0)); CMP(32, R(EAX), Imm8(0));
FixupBranch pLesser = J_CC(CC_L); FixupBranch pLesser = J_CC(CC_L);
FixupBranch pGreater = J_CC(CC_G); FixupBranch pGreater = J_CC(CC_G);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x20000000)); // _x86Reg == 0 MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
RET(); RET();
SetJumpTarget(pGreater); SetJumpTarget(pGreater);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x40000000)); // _x86Reg > 0 MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
RET(); RET();
SetJumpTarget(pLesser); SetJumpTarget(pLesser);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000)); // _x86Reg < 0 MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
RET(); RET();
fifoDirectWrite8 = AlignCode4(); fifoDirectWrite8 = AlignCode4();

View file

@ -125,7 +125,7 @@ namespace Jit64
if ((inst.BO & 16) == 0) // Test a CR bit if ((inst.BO & 16) == 0) // Test a CR bit
{ {
TEST(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000 >> inst.BI)); TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3)));
if (inst.BO & 8) // Conditional branch if (inst.BO & 8) // Conditional branch
branch = CC_NZ; branch = CC_NZ;
else else

View file

@ -202,11 +202,9 @@ namespace Jit64
fpr.Lock(a,b); fpr.Lock(a,b);
if (a != b) if (a != b)
{
fpr.LoadToX64(a, true); fpr.LoadToX64(a, true);
}
// USES_CR // USES_CR
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> shift)));
if (ordered) if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else else
@ -214,19 +212,17 @@ namespace Jit64
FixupBranch pLesser = J_CC(CC_B); FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A); FixupBranch pGreater = J_CC(CC_A);
// _x86Reg == 0 // _x86Reg == 0
MOV(32, R(EAX), Imm32(0x20000000)); MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J(); FixupBranch continue1 = J();
// _x86Reg > 0 // _x86Reg > 0
SetJumpTarget(pGreater); SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000)); MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J(); FixupBranch continue2 = J();
// _x86Reg < 0 // _x86Reg < 0
SetJumpTarget(pLesser); SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000)); MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1); SetJumpTarget(continue1);
SetJumpTarget(continue2); SetJumpTarget(continue2);
SHR(32, R(EAX), Imm8(shift));
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
fpr.UnlockAll(); fpr.UnlockAll();
} }

View file

@ -174,23 +174,21 @@ namespace Jit64
} }
gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4))));
CMP(32, gpr.R(a), comparand); CMP(32, gpr.R(a), comparand);
FixupBranch pLesser = J_CC(less_than); FixupBranch pLesser = J_CC(less_than);
FixupBranch pGreater = J_CC(greater_than); FixupBranch pGreater = J_CC(greater_than);
MOV(32, R(EAX), Imm32(0x20000000 >> shift)); // _x86Reg == 0 MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0
FixupBranch continue1 = J(); FixupBranch continue1 = J();
SetJumpTarget(pGreater); SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000 >> shift)); // _x86Reg > 0 MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0
FixupBranch continue2 = J(); FixupBranch continue2 = J();
SetJumpTarget(pLesser); SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000 >> shift));// _x86Reg < 0 MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0
SetJumpTarget(continue1); SetJumpTarget(continue1);
SetJumpTarget(continue2); SetJumpTarget(continue2);
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
// TODO: Add extra code at the end for the "taken" case. Jump to it from the matching branches. // TODO: Add extra code at the end for the "taken" case. Jump to it from the matching branches.
// Since it's the last block, some liberties can be taken. // Since it's the last block, some liberties can be taken.
@ -221,23 +219,21 @@ namespace Jit64
} }
gpr.Lock(a, b); gpr.Lock(a, b);
gpr.LoadToX64(a, true, false); gpr.LoadToX64(a, true, false);
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4))));
CMP(32, gpr.R(a), comparand); CMP(32, gpr.R(a), comparand);
FixupBranch pLesser = J_CC(less_than); FixupBranch pLesser = J_CC(less_than);
FixupBranch pGreater = J_CC(greater_than); FixupBranch pGreater = J_CC(greater_than);
// _x86Reg == 0 // _x86Reg == 0
MOV(32, R(EAX), Imm32(0x20000000 >> shift)); MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0
FixupBranch continue1 = J(); FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater); SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000 >> shift)); MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0
FixupBranch continue2 = J(); FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser); SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000 >> shift)); MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0
SetJumpTarget(continue1); SetJumpTarget(continue1);
SetJumpTarget(continue2); SetJumpTarget(continue2);
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
gpr.UnlockAll(); gpr.UnlockAll();
} }

View file

@ -163,20 +163,39 @@ namespace Jit64
// USES_CR // USES_CR
int d = inst.RD; int d = inst.RD;
gpr.LoadToX64(d, false, true); gpr.LoadToX64(d, false, true);
MOV(32, gpr.R(d), M(&PowerPC::ppcState.cr)); MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0]));
SHL(32, R(EAX), Imm8(4));
for (int i = 1; i < 7; i++) {
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i]));
SHL(32, R(EAX), Imm8(4));
}
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7]));
MOV(32, gpr.R(d), R(EAX));
} }
void mtcrf(UGeckoInstruction inst) void mtcrf(UGeckoInstruction inst)
{ {
//Default(inst);
//return;
// USES_CR // USES_CR
u32 mask = 0; u32 mask = 0;
u32 crm = inst.CRM; u32 crm = inst.CRM;
gpr.FlushLockX(ECX);
if (crm == 0xFF) { if (crm == 0xFF) {
gpr.FlushLockX(ECX);
MOV(32, R(EAX), gpr.R(inst.RS)); MOV(32, R(EAX), gpr.R(inst.RS));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX)); for (int i = 0; i < 8; i++) {
MOV(32, R(ECX), R(EAX));
SHR(32, R(ECX), Imm8(28 - (i * 4)));
AND(32, R(ECX), Imm32(0xF));
MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX));
}
gpr.UnlockAllX();
} else { } else {
//TODO: use lookup table? probably not worth it Default(inst);
return;
// TODO: translate this to work in new CR model.
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
if (crm & (1 << i)) if (crm & (1 << i))
mask |= 0xF << (i*4); mask |= 0xF << (i*4);
@ -188,9 +207,6 @@ namespace Jit64
OR(32, R(EAX), R(ECX)); OR(32, R(EAX), R(ECX));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX)); MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
} }
gpr.UnlockAllX();
} }
} // namespace
}

View file

@ -42,6 +42,22 @@ volatile CPUState state = CPU_STEPPING;
static CoreMode mode; static CoreMode mode;
// Repack the eight 4-bit cr_fast[] fields into the architectural 32-bit CR
// word (field 0 occupies the top nibble, bits 28-31).
void CompactCR()
{
	ppcState.cr = 0;
	for (int field = 7; field >= 0; field--)
		ppcState.cr |= ppcState.cr_fast[field] << ((7 - field) * 4);
}
// Unpack the architectural 32-bit CR word into one byte per 4-bit field
// (cr_fast[]), the layout the JIT reads and writes directly.
void ExpandCR()
{
	for (int field = 8; field-- > 0;)
		ppcState.cr_fast[field] = (ppcState.cr >> ((7 - field) * 4)) & 0xF;
}
void DoState(PointerWrap &p) void DoState(PointerWrap &p)
{ {
p.Do(ppcState); p.Do(ppcState);

View file

@ -47,6 +47,8 @@ struct GC_ALIGNED64(PowerPCState)
u32 npc; u32 npc;
u32 cr; // flags u32 cr; // flags
u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 8, 1, 3, 5, 7 so that we can make Compact and Expand super fast?
u32 msr; // machine specific register u32 msr; // machine specific register
u32 fpscr; // floating point flags/status bits u32 fpscr; // floating point flags/status bits
@ -86,6 +88,9 @@ void Start();
void Pause(); void Pause();
void Stop(); void Stop();
void CompactCR();
void ExpandCR();
void OnIdle(u32 _uThreadAddr); void OnIdle(u32 _uThreadAddr);
// Easy register access macros. // Easy register access macros.
@ -127,23 +132,25 @@ void OnIdle(u32 _uThreadAddr);
// These are intended to stay fast, probably become faster, and are not likely to slow down much if at all. // These are intended to stay fast, probably become faster, and are not likely to slow down much if at all.
inline void SetCRField(int cr_field, int value) { inline void SetCRField(int cr_field, int value) {
PowerPC::ppcState.cr = (PowerPC::ppcState.cr & (~(0xF0000000 >> (cr_field * 4)))) | (value << ((7 - cr_field) * 4)); PowerPC::ppcState.cr_fast[cr_field] = value;
} }
inline u32 GetCRField(int cr_field) { inline u32 GetCRField(int cr_field) {
return (PowerPC::ppcState.cr >> (4 * cr_field)) & 0xF; return PowerPC::ppcState.cr_fast[cr_field];
} }
inline u32 GetCRBit(int bit) { inline u32 GetCRBit(int bit) {
return (PowerPC::ppcState.cr >> (31 - bit)) & 1; return (PowerPC::ppcState.cr_fast[bit >> 2] >> (3 - (bit & 3))) & 1;
} }
// SetCR and GetCR may become fairly slow soon. Should be avoided if possible. // SetCR and GetCR may become fairly slow soon. Should be avoided if possible.
inline void SetCR(u32 new_cr) { inline void SetCR(u32 new_cr) {
PowerPC::ppcState.cr = new_cr; PowerPC::ppcState.cr = new_cr;
PowerPC::ExpandCR();
} }
inline u32 GetCR() { inline u32 GetCR() {
PowerPC::CompactCR();
return PowerPC::ppcState.cr; return PowerPC::ppcState.cr;
} }