Somewhat faster CR flag storage. It doesn't make much of a difference on its own, but it opens up the possibility of efficiently merging cmp instructions with the conditional branches that follow them.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1549 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
hrydgard 2008-12-15 20:41:59 +00:00
parent 5c831a934b
commit a44c421d01
7 changed files with 68 additions and 38 deletions

View file

@ -326,19 +326,18 @@ void GenerateCommon()
{
// USES_CR
computeRc = AlignCode16();
AND(32, M(&PowerPC::ppcState.cr), Imm32(0x0FFFFFFF));
CMP(32, R(EAX), Imm8(0));
FixupBranch pLesser = J_CC(CC_L);
FixupBranch pGreater = J_CC(CC_G);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x20000000)); // _x86Reg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
RET();
SetJumpTarget(pGreater);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x40000000)); // _x86Reg > 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
RET();
SetJumpTarget(pLesser);
OR(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000)); // _x86Reg < 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
RET();
fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();

View file

@ -125,7 +125,7 @@ namespace Jit64
if ((inst.BO & 16) == 0) // Test a CR bit
{
TEST(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000 >> inst.BI));
TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3)));
if (inst.BO & 8) // Conditional branch
branch = CC_NZ;
else

View file

@ -202,11 +202,9 @@ namespace Jit64
fpr.Lock(a,b);
if (a != b)
{
fpr.LoadToX64(a, true);
}
// USES_CR
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> shift)));
if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else
@ -214,19 +212,17 @@ namespace Jit64
FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A);
// _x86Reg == 0
MOV(32, R(EAX), Imm32(0x20000000));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1);
SetJumpTarget(continue2);
SHR(32, R(EAX), Imm8(shift));
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
fpr.UnlockAll();
}

View file

@ -174,23 +174,21 @@ namespace Jit64
}
gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4))));
CMP(32, gpr.R(a), comparand);
FixupBranch pLesser = J_CC(less_than);
FixupBranch pGreater = J_CC(greater_than);
MOV(32, R(EAX), Imm32(0x20000000 >> shift)); // _x86Reg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0
FixupBranch continue1 = J();
SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000 >> shift)); // _x86Reg > 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0
FixupBranch continue2 = J();
SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000 >> shift));// _x86Reg < 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0
SetJumpTarget(continue1);
SetJumpTarget(continue2);
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
// TODO: Add extra code at the end for the "taken" case. Jump to it from the matching branches.
// Since it's the last block, some liberties can be taken.
@ -221,23 +219,21 @@ namespace Jit64
}
gpr.Lock(a, b);
gpr.LoadToX64(a, true, false);
AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4))));
CMP(32, gpr.R(a), comparand);
FixupBranch pLesser = J_CC(less_than);
FixupBranch pGreater = J_CC(greater_than);
// _x86Reg == 0
MOV(32, R(EAX), Imm32(0x20000000 >> shift));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0
FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater);
MOV(32, R(EAX), Imm32(0x40000000 >> shift));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0
FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser);
MOV(32, R(EAX), Imm32(0x80000000 >> shift));
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0
SetJumpTarget(continue1);
SetJumpTarget(continue2);
OR(32, M(&PowerPC::ppcState.cr), R(EAX));
gpr.UnlockAll();
}

View file

@ -163,20 +163,39 @@ namespace Jit64
// USES_CR
int d = inst.RD;
gpr.LoadToX64(d, false, true);
MOV(32, gpr.R(d), M(&PowerPC::ppcState.cr));
MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0]));
SHL(32, R(EAX), Imm8(4));
for (int i = 1; i < 7; i++) {
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i]));
SHL(32, R(EAX), Imm8(4));
}
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7]));
MOV(32, gpr.R(d), R(EAX));
}
void mtcrf(UGeckoInstruction inst)
{
//Default(inst);
//return;
// USES_CR
u32 mask = 0;
u32 crm = inst.CRM;
gpr.FlushLockX(ECX);
if (crm == 0xFF) {
gpr.FlushLockX(ECX);
MOV(32, R(EAX), gpr.R(inst.RS));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
for (int i = 0; i < 8; i++) {
MOV(32, R(ECX), R(EAX));
SHR(32, R(ECX), Imm8(28 - (i * 4)));
AND(32, R(ECX), Imm32(0xF));
MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX));
}
gpr.UnlockAllX();
} else {
//TODO: use lookup table? probably not worth it
Default(inst);
return;
// TODO: translate this to work in new CR model.
for (int i = 0; i < 8; i++) {
if (crm & (1 << i))
mask |= 0xF << (i*4);
@ -188,9 +207,6 @@ namespace Jit64
OR(32, R(EAX), R(ECX));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
}
gpr.UnlockAllX();
}
}
} // namespace

View file

@ -42,6 +42,22 @@ volatile CPUState state = CPU_STEPPING;
static CoreMode mode;
void CompactCR()
{
ppcState.cr = 0;
for (int i = 0; i < 8; i++) {
ppcState.cr |= ppcState.cr_fast[i] << (28 - i * 4);
}
}
// Split the packed 32-bit CR into the eight 4-bit cr_fast[] fields —
// the inverse of CompactCR (CR0 comes from the most significant nibble).
void ExpandCR()
{
	int shift = 28;
	for (int field = 0; field < 8; field++, shift -= 4)
		ppcState.cr_fast[field] = (ppcState.cr >> shift) & 0xF;
}
void DoState(PointerWrap &p)
{
p.Do(ppcState);

View file

@ -46,7 +46,9 @@ struct GC_ALIGNED64(PowerPCState)
u32 pc; // program counter
u32 npc;
u32 cr; // flags
u32 cr; // flags
u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 6, 1, 3, 5, 7 so that we can make Compact and Expand super fast?
u32 msr; // machine specific register
u32 fpscr; // floating point flags/status bits
@ -86,6 +88,9 @@ void Start();
void Pause();
void Stop();
void CompactCR();
void ExpandCR();
void OnIdle(u32 _uThreadAddr);
// Easy register access macros.
@ -127,23 +132,25 @@ void OnIdle(u32 _uThreadAddr);
// These are intended to stay fast, probably become faster, and are not likely to slow down much if at all.
// Write a 4-bit value into CR field cr_field (0 = CR0 ... 7 = CR7).
// With the split representation this is a single byte store; the packed
// ppcState.cr is only rebuilt on demand by CompactCR().
// (The stale packed-cr read-modify-write left over from the old
// representation has been removed — keeping it would clobber the
// packed cr with a value that is never kept in sync anymore.)
inline void SetCRField(int cr_field, int value) {
	PowerPC::ppcState.cr_fast[cr_field] = value;
}
// Read one 4-bit CR field (0 = CR0 ... 7 = CR7); result is in [0, 0xF].
// Removed the unreachable second return that read the packed cr — under
// the split representation cr_fast[] is the authoritative copy.
inline u32 GetCRField(int cr_field) {
	return PowerPC::ppcState.cr_fast[cr_field];
}
// Read a single CR bit (0 = CR0[LT] ... 31 = CR7[SO]).
// bit >> 2 selects the 4-bit field; 3 - (bit & 3) selects the bit inside
// it, since CR bit numbering puts bit 0 at the MSB of its field.
// Removed the unreachable second return that read the packed cr — under
// the split representation cr_fast[] is the authoritative copy.
inline u32 GetCRBit(int bit) {
	return (PowerPC::ppcState.cr_fast[bit >> 2] >> (3 - (bit & 3))) & 1;
}
// SetCR and GetCR may become fairly slow soon. Should be avoided if possible.
// Replace the whole CR at once. Slow path: stores the packed 32-bit value
// and then re-splits it into cr_fast[] via ExpandCR() so the two
// representations stay consistent.
inline void SetCR(u32 new_cr) {
PowerPC::ppcState.cr = new_cr;
PowerPC::ExpandCR();
}
// Read the whole CR at once. Slow path: rebuilds the packed 32-bit value
// from the authoritative cr_fast[] fields via CompactCR() before
// returning it.
inline u32 GetCR() {
PowerPC::CompactCR();
return PowerPC::ppcState.cr;
}