Rationalize temporary register usage.

Rather than using a variety of registers — including RSI, ABI_PARAM1
(RCX on Windows, RDI on Unix), RCX, and RDX — the rules are:

- RDI and RSI are never used.  This allows them to be allocated on Unix,
bringing parity with Windows.

- RDX is a permanent temporary register along with RAX (and is thus not
FlushLocked).  It's used frequently enough that allocating it would
probably be a bad idea, as it would constantly get flushed.

- RCX is allocatable, but is flushed in two situations:
    - Non-immediate shifts (rlwnm), because x86 requires the variable
    shift count to be in CL (the low byte of RCX).
    - Paired single loads and stores, because they require three
    temporary registers: the helper functions take two integer
    arguments, and another register is used as an index to compute the
    helper function's address.
These cases should be relatively rare.

While we're at it, in stores, use the registers directly where possible
rather than always using temporaries (by making SafeWriteRegToReg
clobber less).  The address doesn't need to be clobbered in the usual
case, and on CPUs with MOVBE, neither does the value.

Oh, and get rid of a useless MEMCHECK.

This commit does not actually add new registers to the allocation order;
it is intended to test for any performance or correctness issues
separately.
This commit is contained in:
comex 2014-09-02 18:54:46 -04:00
parent 67cdb6e07a
commit 8dea26762d
13 changed files with 179 additions and 172 deletions

View file

@ -107,10 +107,9 @@ public:
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg);
// Reads a given bit of a given CR register part. Clobbers ABI_PARAM1,
// don't forget to xlock it before.
// Reads a given bit of a given CR register part.
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
// Clobbers ABI_PARAM1, xlock it before.
// Clobbers RDX.
void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
// Generates a branch that will check if a given bit of a CR register part

View file

@ -9,13 +9,12 @@
using namespace Gen;
//GLOBAL STATIC ALLOCATIONS x86
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//GLOBAL STATIC ALLOCATIONS x64
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//RBX - Base pointer of memory
//R15 - Pointer to array of block pointers
// GLOBAL STATIC ALLOCATIONS x64
// RAX - ubiquitous scratch register - EVERYBODY scratches this
// RDX - second scratch register
// RBX - Base pointer of memory
// R15 - Pointer to array of block pointers
// RBP - Pointer to ppcState+0x80
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
@ -73,8 +72,8 @@ void Jit64AsmRoutineManager::Generate()
no_mem = J_CC(CC_NZ);
}
AND(32, R(EAX), Imm32(JIT_ICACHE_MASK));
MOV(64, R(RSI), Imm64((u64)jit->GetBlockCache()->iCache));
MOV(32, R(EAX), MComplex(RSI, EAX, SCALE_1, 0));
MOV(64, R(RDX), Imm64((u64)jit->GetBlockCache()->iCache));
MOV(32, R(EAX), MComplex(RDX, EAX, SCALE_1, 0));
if (Core::g_CoreStartupParameter.bWii || Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack)
{
@ -86,8 +85,8 @@ void Jit64AsmRoutineManager::Generate()
TEST(32, R(EAX), Imm32(JIT_ICACHE_VMEM_BIT));
FixupBranch no_vmem = J_CC(CC_Z);
AND(32, R(EAX), Imm32(JIT_ICACHE_MASK));
MOV(64, R(RSI), Imm64((u64)jit->GetBlockCache()->iCacheVMEM));
MOV(32, R(EAX), MComplex(RSI, EAX, SCALE_1, 0));
MOV(64, R(RDX), Imm64((u64)jit->GetBlockCache()->iCacheVMEM));
MOV(32, R(EAX), MComplex(RDX, EAX, SCALE_1, 0));
if (Core::g_CoreStartupParameter.bWii) exit_vmem = J();
SetJumpTarget(no_vmem);
@ -97,8 +96,8 @@ void Jit64AsmRoutineManager::Generate()
TEST(32, R(EAX), Imm32(JIT_ICACHE_EXRAM_BIT));
FixupBranch no_exram = J_CC(CC_Z);
AND(32, R(EAX), Imm32(JIT_ICACHEEX_MASK));
MOV(64, R(RSI), Imm64((u64)jit->GetBlockCache()->iCacheEx));
MOV(32, R(EAX), MComplex(RSI, EAX, SCALE_1, 0));
MOV(64, R(RDX), Imm64((u64)jit->GetBlockCache()->iCacheEx));
MOV(32, R(EAX), MComplex(RDX, EAX, SCALE_1, 0));
SetJumpTarget(no_exram);
}

View file

@ -442,8 +442,8 @@ void Jit64::cmpXX(UGeckoInstruction inst)
if (!comparand.IsImm())
{
MOVSX(64, 32, ABI_PARAM1, comparand);
comparand = R(ABI_PARAM1);
MOVSX(64, 32, RDX, comparand);
comparand = R(RDX);
}
}
else
@ -454,11 +454,11 @@ void Jit64::cmpXX(UGeckoInstruction inst)
MOVZX(64, 32, RAX, gpr.R(a));
if (comparand.IsImm())
MOV(32, R(ABI_PARAM1), comparand);
MOV(32, R(RDX), comparand);
else
MOVZX(64, 32, ABI_PARAM1, comparand);
MOVZX(64, 32, RDX, comparand);
comparand = R(ABI_PARAM1);
comparand = R(RDX);
}
SUB(64, R(RAX), comparand);
MOV(64, PPCSTATE(cr_val[crf]), R(RAX));
@ -1170,7 +1170,6 @@ void Jit64::mulhwXx(UGeckoInstruction inst)
}
else
{
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
if (gpr.RX(d) == EDX)
@ -1288,7 +1287,6 @@ void Jit64::divwux(UGeckoInstruction inst)
}
else
{
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
MOV(32, R(EAX), gpr.R(a));
@ -1349,7 +1347,6 @@ void Jit64::divwx(UGeckoInstruction inst)
}
else
{
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
MOV(32, R(EAX), gpr.R(a));
@ -1881,8 +1878,8 @@ void Jit64::srawx(UGeckoInstruction inst)
int a = inst.RA;
int b = inst.RB;
int s = inst.RS;
gpr.Lock(a, s, b);
gpr.FlushLockX(ECX);
gpr.Lock(a, s, b);
gpr.BindToRegister(a, (a == s || a == b), true);
JitClearCA();
MOV(32, R(ECX), gpr.R(b));

View file

@ -197,14 +197,13 @@ void Jit64::lXXx(UGeckoInstruction inst)
else
{
// In this case we need an extra temporary register.
gpr.FlushLockX(ABI_PARAM1);
opAddress = R(ABI_PARAM1);
opAddress = R(RDX);
storeAddress = true;
if (use_constant_offset)
{
if (gpr.R(a).IsSimpleReg() && offset != 0)
{
LEA(32, ABI_PARAM1, MDisp(gpr.RX(a), offset));
LEA(32, RDX, MDisp(gpr.RX(a), offset));
}
else
{
@ -215,7 +214,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
}
else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
LEA(32, RDX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
@ -232,7 +231,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
if (update && storeAddress)
{
// We need to save the (usually scratch) address register for the update.
registersInUse |= (1 << ABI_PARAM1);
registersInUse |= (1 << RDX);
}
SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend);
@ -339,8 +338,7 @@ void Jit64::stX(UGeckoInstruction inst)
// Helps external systems know which instruction triggered the write
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(s));
MOV(32, R(EDX), gpr.R(s));
if (update)
gpr.SetImmediate32(a, addr);
@ -396,24 +394,31 @@ void Jit64::stX(UGeckoInstruction inst)
}
}
gpr.FlushLockX(ECX, EDX);
gpr.Lock(s, a);
MOV(32, R(EDX), gpr.R(a));
MOV(32, R(ECX), gpr.R(s));
SafeWriteRegToReg(ECX, EDX, accessSize, offset, CallerSavedRegistersInUse());
gpr.Lock(a, s);
gpr.BindToRegister(a, true, false);
X64Reg reg_value;
if (WriteClobbersRegValue(accessSize, /* swap */ true))
{
MOV(32, R(EDX), gpr.R(s));
reg_value = EDX;
}
else
{
gpr.BindToRegister(s, true, false);
reg_value = gpr.RX(s);
}
SafeWriteRegToReg(reg_value, gpr.RX(a), accessSize, offset, CallerSavedRegistersInUse(), SAFE_LOADSTORE_CLOBBER_EAX_INSTEAD_OF_ADDR);
if (update && offset)
{
gpr.KillImmediate(a, true, true);
MEMCHECK_START
gpr.KillImmediate(a, true, true);
ADD(32, gpr.R(a), Imm32((u32)offset));
MEMCHECK_END
}
gpr.UnlockAll();
gpr.UnlockAllX();
}
else
{
@ -430,15 +435,12 @@ void Jit64::stXx(UGeckoInstruction inst)
FALLBACK_IF(!a || a == s || a == b);
gpr.Lock(a, b, s);
gpr.FlushLockX(ECX, EDX);
if (inst.SUBOP10 & 32)
{
MEMCHECK_START
gpr.BindToRegister(a, true, true);
ADD(32, gpr.R(a), gpr.R(b));
MOV(32, R(EDX), gpr.R(a));
MEMCHECK_END
}
else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
@ -468,8 +470,18 @@ void Jit64::stXx(UGeckoInstruction inst)
break;
}
MOV(32, R(ECX), gpr.R(s));
SafeWriteRegToReg(ECX, EDX, accessSize, 0, CallerSavedRegistersInUse());
X64Reg reg_value;
if (WriteClobbersRegValue(accessSize, /* swap */ true))
{
MOV(32, R(EAX), gpr.R(s));
reg_value = EAX;
}
else
{
gpr.BindToRegister(s, true, false);
reg_value = gpr.RX(s);
}
SafeWriteRegToReg(reg_value, EDX, accessSize, 0, CallerSavedRegistersInUse());
gpr.UnlockAll();
gpr.UnlockAllX();
@ -482,13 +494,12 @@ void Jit64::lmw(UGeckoInstruction inst)
JITDISABLE(bJITLoadStoreOff);
// TODO: This doesn't handle rollback on DSI correctly
gpr.FlushLockX(ECX);
MOV(32, R(ECX), Imm32((u32)(s32)inst.SIMM_16));
MOV(32, R(EDX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(ECX), gpr.R(inst.RA));
ADD(32, R(EDX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
SafeLoadToReg(EAX, R(ECX), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << ECX), false);
SafeLoadToReg(EAX, R(EDX), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << ECX), false);
gpr.BindToRegister(i, false, true);
MOV(32, gpr.R(i), R(EAX));
}
@ -501,15 +512,14 @@ void Jit64::stmw(UGeckoInstruction inst)
JITDISABLE(bJITLoadStoreOff);
// TODO: This doesn't handle rollback on DSI correctly
gpr.FlushLockX(ECX);
for (int i = inst.RD; i < 32; i++)
{
if (inst.RA)
MOV(32, R(EAX), gpr.R(inst.RA));
else
XOR(32, R(EAX), R(EAX));
MOV(32, R(ECX), gpr.R(i));
SafeWriteRegToReg(ECX, EAX, 32, (i - inst.RD) * 4 + (u32)(s32)inst.SIMM_16, CallerSavedRegistersInUse());
MOV(32, R(EDX), gpr.R(i));
SafeWriteRegToReg(EDX, EAX, 32, (i - inst.RD) * 4 + (u32)(s32)inst.SIMM_16, CallerSavedRegistersInUse());
}
gpr.UnlockAllX();
}

View file

@ -96,24 +96,23 @@ void Jit64::stfXXX(UGeckoInstruction inst)
FALLBACK_IF(!indexed && !a);
s32 offset = 0;
gpr.FlushLockX(ABI_PARAM1);
if (indexed)
{
if (update)
{
gpr.BindToRegister(a, true, true);
ADD(32, gpr.R(a), gpr.R(b));
MOV(32, R(ABI_PARAM1), gpr.R(a));
MOV(32, R(RDX), gpr.R(a));
}
else
{
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
LEA(32, RDX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
else
{
MOV(32, R(ABI_PARAM1), gpr.R(b));
MOV(32, R(RDX), gpr.R(b));
if (a)
ADD(32, R(ABI_PARAM1), gpr.R(a));
ADD(32, R(RDX), gpr.R(a));
}
}
}
@ -128,14 +127,14 @@ void Jit64::stfXXX(UGeckoInstruction inst)
{
offset = (s32)(s16)inst.SIMM_16;
}
MOV(32, R(ABI_PARAM1), gpr.R(a));
MOV(32, R(RDX), gpr.R(a));
}
if (single)
{
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse());
SafeWriteF32ToReg(XMM0, RDX, offset, CallerSavedRegistersInUse());
fpr.UnlockAll();
}
else
@ -144,7 +143,7 @@ void Jit64::stfXXX(UGeckoInstruction inst)
MOVQ_xmm(R(RAX), fpr.RX(s));
else
MOV(64, R(RAX), fpr.R(s));
SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse());
SafeWriteRegToReg(RAX, RDX, 64, offset, CallerSavedRegistersInUse());
}
gpr.UnlockAll();
gpr.UnlockAllX();
@ -160,15 +159,14 @@ void Jit64::stfiwx(UGeckoInstruction inst)
int a = inst.RA;
int b = inst.RB;
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(b));
MOV(32, R(RDX), gpr.R(b));
if (a)
ADD(32, R(ABI_PARAM1), gpr.R(a));
ADD(32, R(RDX), gpr.R(a));
if (fpr.R(s).IsSimpleReg())
MOVD_xmm(R(EAX), fpr.RX(s));
else
MOV(32, R(EAX), fpr.R(s));
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, CallerSavedRegistersInUse());
SafeWriteRegToReg(EAX, RDX, 32, 0, CallerSavedRegistersInUse());
gpr.UnlockAllX();
}

View file

@ -28,8 +28,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
int a = inst.RA;
int s = inst.RS; // Fp numbers
gpr.FlushLockX(EAX, EDX);
gpr.FlushLockX(ECX);
gpr.FlushLockX(EAX, ECX);
if (update)
gpr.BindToRegister(inst.RA, true, true);
fpr.BindToRegister(inst.RS, true, false);
@ -73,8 +72,7 @@ void Jit64::psq_l(UGeckoInstruction inst)
bool update = inst.OPCD == 57;
int offset = inst.SIMM_12;
gpr.FlushLockX(EAX, EDX);
gpr.FlushLockX(ECX);
gpr.FlushLockX(EAX, ECX);
gpr.BindToRegister(inst.RA, true, update && offset);
fpr.BindToRegister(inst.RS, false, true);
if (offset)

View file

@ -42,40 +42,40 @@ void Jit64::GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate)
void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in)
{
MOV(64, R(ABI_PARAM1), PPCSTATE(cr_val[field]));
MOV(64, R(RDX), PPCSTATE(cr_val[field]));
MOVZX(32, 8, in, R(in));
switch (bit)
{
case CR_SO_BIT: // set bit 61 to input
BTR(64, R(ABI_PARAM1), Imm8(61));
BTR(64, R(RDX), Imm8(61));
SHL(64, R(in), Imm8(61));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
case CR_EQ_BIT: // clear low 32 bits, set bit 0 to !input
SHR(64, R(ABI_PARAM1), Imm8(32));
SHL(64, R(ABI_PARAM1), Imm8(32));
SHR(64, R(RDX), Imm8(32));
SHL(64, R(RDX), Imm8(32));
XOR(32, R(in), Imm8(1));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
case CR_GT_BIT: // set bit 63 to !input
BTR(64, R(ABI_PARAM1), Imm8(63));
BTR(64, R(RDX), Imm8(63));
NOT(32, R(in));
SHL(64, R(in), Imm8(63));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
case CR_LT_BIT: // set bit 62 to input
BTR(64, R(ABI_PARAM1), Imm8(62));
BTR(64, R(RDX), Imm8(62));
SHL(64, R(in), Imm8(62));
OR(64, R(ABI_PARAM1), R(in));
OR(64, R(RDX), R(in));
break;
}
BTS(64, R(ABI_PARAM1), Imm8(32));
MOV(64, PPCSTATE(cr_val[field]), R(ABI_PARAM1));
BTS(64, R(RDX), Imm8(32));
MOV(64, PPCSTATE(cr_val[field]), R(RDX));
}
FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
@ -308,8 +308,7 @@ void Jit64::mfcr(UGeckoInstruction inst)
gpr.BindToRegister(d, false, true);
XOR(32, gpr.R(d), gpr.R(d));
gpr.FlushLockX(ABI_PARAM1);
X64Reg cr_val = ABI_PARAM1;
X64Reg cr_val = RDX;
// we only need to zero the high bits of EAX once
XOR(32, R(EAX), R(EAX));
for (int i = 0; i < 8; i++)
@ -439,9 +438,8 @@ void Jit64::crXXX(UGeckoInstruction inst)
// crnand or crnor
bool negateB = inst.SUBOP10 == 225 || inst.SUBOP10 == 33;
gpr.FlushLockX(ABI_PARAM1);
GetCRFieldBit(inst.CRBA >> 2, 3 - (inst.CRBA & 3), ABI_PARAM1, negateA);
GetCRFieldBit(inst.CRBB >> 2, 3 - (inst.CRBB & 3), EAX, negateB);
GetCRFieldBit(inst.CRBA >> 2, 3 - (inst.CRBA & 3), DL, negateA);
GetCRFieldBit(inst.CRBB >> 2, 3 - (inst.CRBB & 3), AL, negateB);
// Compute combined bit
switch (inst.SUBOP10)
@ -449,23 +447,23 @@ void Jit64::crXXX(UGeckoInstruction inst)
case 33: // crnor: ~(A || B) == (~A && ~B)
case 129: // crandc
case 257: // crand
AND(8, R(EAX), R(ABI_PARAM1));
AND(8, R(AL), R(DL));
break;
case 193: // crxor
case 289: // creqv
XOR(8, R(EAX), R(ABI_PARAM1));
XOR(8, R(AL), R(DL));
break;
case 225: // crnand: ~(A && B) == (~A || ~B)
case 417: // crorc
case 449: // cror
OR(8, R(EAX), R(ABI_PARAM1));
OR(8, R(AL), R(DL));
break;
}
// Store result bit in CRBD
SetCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3), EAX);
SetCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3), AL);
gpr.UnlockAllX();
}

View file

@ -157,7 +157,9 @@ static void fregSpill(RegInfo& RI, X64Reg reg)
RI.fregs[reg] = nullptr;
}
// ECX is scratch, so we don't allocate it
// RAX and RDX are scratch, so we don't allocate them
// (TODO: if we could lock RCX here too then we could allocate it - needed for
// shifts)
// 64-bit - calling conventions differ between linux & windows, so...
#ifdef _WIN32
@ -602,9 +604,9 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size)
{
auto info = regBuildMemAddress(RI, I, getOp2(I), 2, Size, nullptr);
if (info.first.IsImm())
RI.Jit->MOV(32, R(ECX), info.first);
RI.Jit->MOV(32, R(EDX), info.first);
else
RI.Jit->LEA(32, ECX, MDisp(info.first.GetSimpleReg(), info.second));
RI.Jit->LEA(32, EDX, MDisp(info.first.GetSimpleReg(), info.second));
regSpill(RI, EAX);
@ -617,7 +619,7 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size)
RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
}
RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
RI.Jit->SafeWriteRegToReg(EAX, EDX, Size, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
}
@ -675,9 +677,9 @@ static void regEmitCmp(RegInfo& RI, InstLoc I)
static void regEmitICmpInst(RegInfo& RI, InstLoc I, CCFlags flag)
{
regEmitCmp(RI, I);
RI.Jit->SETcc(flag, R(ECX)); // Caution: SETCC uses 8-bit regs!
RI.Jit->SETcc(flag, R(EDX)); // Caution: SETCC uses 8-bit regs!
X64Reg reg = regBinReg(RI, I);
RI.Jit->MOVZX(32, 8, reg, R(ECX));
RI.Jit->MOVZX(32, 8, reg, R(EDX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
}
@ -1111,11 +1113,11 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
}
case StoreFPRF:
{
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->AND(32, R(ECX), Imm8(0x1F));
Jit->SHL(32, R(ECX), Imm8(12));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
Jit->AND(32, R(EDX), Imm8(0x1F));
Jit->SHL(32, R(EDX), Imm8(12));
Jit->AND(32, PPCSTATE(fpscr), Imm32(~(0x1F << 12)));
Jit->OR(32, PPCSTATE(fpscr), R(ECX));
Jit->OR(32, PPCSTATE(fpscr), R(EDX));
regNormalRegClear(RI, I);
break;
}
@ -1155,8 +1157,8 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
break;
X64Reg reg = regUReg(RI, I);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->MOVSX(32, 8, reg, R(ECX));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
Jit->MOVSX(32, 8, reg, R(EDX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1178,9 +1180,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
break;
X64Reg reg = regUReg(RI, I);
Jit->MOV(32, R(ECX), Imm32(63));
Jit->MOV(32, R(EDX), Imm32(63));
Jit->BSR(32, reg, regLocForInst(RI, getOp1(I)));
Jit->CMOVcc(32, reg, R(ECX), CC_Z);
Jit->CMOVcc(32, reg, R(EDX), CC_Z);
Jit->XOR(32, R(reg), Imm8(31));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
@ -1422,30 +1424,30 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->XOR(32, R(EAX), R(EAX));
// SO: Bit 61 set.
Jit->MOV(64, R(RCX), R(cr_val));
Jit->SHR(64, R(RCX), Imm8(61));
Jit->AND(32, R(ECX), Imm8(1));
Jit->OR(32, R(EAX), R(ECX));
Jit->MOV(64, R(RDX), R(cr_val));
Jit->SHR(64, R(RDX), Imm8(61));
Jit->AND(32, R(EDX), Imm8(1));
Jit->OR(32, R(EAX), R(EDX));
// EQ: Bits 31-0 == 0.
Jit->XOR(32, R(ECX), R(ECX));
Jit->XOR(32, R(EDX), R(EDX));
Jit->TEST(32, R(cr_val), R(cr_val));
Jit->SETcc(CC_Z, R(ECX));
Jit->SHL(32, R(ECX), Imm8(1));
Jit->OR(32, R(EAX), R(ECX));
Jit->SETcc(CC_Z, R(EDX));
Jit->SHL(32, R(EDX), Imm8(1));
Jit->OR(32, R(EAX), R(EDX));
// GT: Value > 0.
Jit->XOR(32, R(ECX), R(ECX));
Jit->XOR(32, R(EDX), R(EDX));
Jit->TEST(64, R(cr_val), R(cr_val));
Jit->SETcc(CC_G, R(ECX));
Jit->SHL(32, R(ECX), Imm8(2));
Jit->OR(32, R(EAX), R(ECX));
Jit->SETcc(CC_G, R(EDX));
Jit->SHL(32, R(EDX), Imm8(2));
Jit->OR(32, R(EAX), R(EDX));
// LT: Bit 62 set.
Jit->MOV(64, R(ECX), R(cr_val));
Jit->SHR(64, R(ECX), Imm8(62 - 3));
Jit->AND(32, R(ECX), Imm8(0x8));
Jit->OR(32, R(EAX), R(ECX));
Jit->MOV(64, R(EDX), R(cr_val));
Jit->SHR(64, R(EDX), Imm8(62 - 3));
Jit->AND(32, R(EDX), Imm8(0x8));
Jit->OR(32, R(EAX), R(EDX));
Jit->MOV(32, R(cr_val), R(EAX));
RI.regs[cr_val] = I;
@ -1460,34 +1462,34 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
X64Reg cr_val = regUReg(RI, I);
Jit->MOV(64, R(cr_val), regLocForInst(RI, getOp1(I)));
Jit->MOV(64, R(RCX), Imm64(1ull << 32));
Jit->MOV(64, R(RDX), Imm64(1ull << 32));
// SO
Jit->MOV(64, R(RAX), R(cr_val));
Jit->SHL(64, R(RAX), Imm8(63));
Jit->SHR(64, R(RAX), Imm8(63 - 61));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
// EQ
Jit->MOV(64, R(RAX), R(cr_val));
Jit->NOT(64, R(RAX));
Jit->AND(64, R(RAX), Imm8(CR_EQ));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
// GT
Jit->MOV(64, R(RAX), R(cr_val));
Jit->NOT(64, R(RAX));
Jit->AND(64, R(RAX), Imm8(CR_GT));
Jit->SHL(64, R(RAX), Imm8(63 - 2));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
// LT
Jit->MOV(64, R(RAX), R(cr_val));
Jit->AND(64, R(RAX), Imm8(CR_LT));
Jit->SHL(64, R(RAX), Imm8(62 - 3));
Jit->OR(64, R(RCX), R(RAX));
Jit->OR(64, R(RDX), R(RAX));
Jit->MOV(64, R(cr_val), R(RCX));
Jit->MOV(64, R(cr_val), R(RDX));
RI.regs[cr_val] = I;
regNormalRegClear(RI, I);
@ -1553,9 +1555,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->SafeLoadToReg(ECX, R(ECX), 32, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVD_xmm(reg, R(ECX));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
RI.Jit->SafeLoadToReg(EDX, R(EDX), 32, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVD_xmm(reg, R(EDX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1567,9 +1569,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
X64Reg reg = fregFindFreeReg(RI);
const OpArg loc = regLocForInst(RI, getOp1(I));
Jit->MOV(32, R(ECX), loc);
RI.Jit->SafeLoadToReg(RCX, R(ECX), 64, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVQ_xmm(reg, R(RCX));
Jit->MOV(32, R(EDX), loc);
RI.Jit->SafeLoadToReg(RDX, R(EDX), 64, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVQ_xmm(reg, R(RDX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1591,11 +1593,10 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
// 0b0011111100000111, or 0x3F07.
Jit->MOV(32, R(EAX), Imm32(0x3F07));
Jit->AND(32, R(EAX), M(((char *)&GQR(quantreg)) + 2));
Jit->MOVZX(32, 8, EDX, R(AL));
Jit->OR(32, R(EDX), Imm8(w << 3));
Jit->OR(32, R(EAX), Imm8(w << 3));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->CALLptr(MScaled(EDX, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp1(I)));
Jit->CALLptr(MScaled(EAX, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
Jit->MOVAPD(reg, R(XMM0));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
@ -1610,8 +1611,8 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
else
Jit->MOV(32, R(EAX), loc1);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, EDX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
@ -1626,8 +1627,8 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
OpArg address = regLocForInst(RI, getOp2(I));
Jit->MOVAPD(XMM0, value);
Jit->MOVQ_xmm(R(RAX), XMM0);
Jit->MOV(32, R(ECX), address);
RI.Jit->SafeWriteRegToReg(RAX, ECX, 64, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOV(32, R(EDX), address);
RI.Jit->SafeWriteRegToReg(RAX, EDX, 64, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
@ -1644,7 +1645,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->AND(32, R(EAX), PPCSTATE(spr[SPR_GQR0 + quantreg]));
Jit->MOVZX(32, 8, EDX, R(AL));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
Jit->MOV(32, R(EDX), regLocForInst(RI, getOp2(I)));
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->CALLptr(MScaled(EDX, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
if (RI.IInfo[I - RI.FirstI] & 4)
@ -1790,9 +1791,9 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
X64Reg reg = fregFindFreeReg(RI);
unsigned ppcreg = *I >> 8;
char *p = (char*)&(PowerPC::ppcState.ps[ppcreg][0]);
Jit->MOV(32, R(ECX), M(p+4));
Jit->AND(32, R(ECX), Imm32(0x7ff00000));
Jit->CMP(32, R(ECX), Imm32(0x38000000));
Jit->MOV(32, R(EDX), M(p+4));
Jit->AND(32, R(EDX), Imm32(0x7ff00000));
Jit->CMP(32, R(EDX), Imm32(0x38000000));
FixupBranch ok = Jit->J_CC(CC_AE);
Jit->AND(32, M(p+4), Imm32(0x80000000));
Jit->MOV(32, M(p), Imm32(0));
@ -2204,10 +2205,10 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
const u32 mask = 0x87C0FFFF;
// MSR = (MSR & ~mask) | (SRR1 & mask);
Jit->MOV(32, R(EAX), PPCSTATE(msr));
Jit->MOV(32, R(ECX), PPCSTATE_SRR1);
Jit->MOV(32, R(EDX), PPCSTATE_SRR1);
Jit->AND(32, R(EAX), Imm32(~mask));
Jit->AND(32, R(ECX), Imm32(mask));
Jit->OR(32, R(EAX), R(ECX));
Jit->AND(32, R(EDX), Imm32(mask));
Jit->OR(32, R(EAX), R(EDX));
// MSR &= 0xFFFBFFFF; // Mask used to clear the bit MSR[13]
Jit->AND(32, R(EAX), Imm32(0xFFFBFFFF));
Jit->MOV(32, PPCSTATE(msr), R(EAX));

View file

@ -9,7 +9,7 @@
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLER_SAVED & ~((1 << RAX) | (1 << RCX) | (1 << RDX) | \
#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLER_SAVED & ~((1 << RAX) | (1 << RCX) | \
(1 << (XMM0+16)) | (1 << (XMM1+16))))
using namespace Gen;
@ -18,19 +18,15 @@ static int temp32;
void CommonAsmRoutines::GenFifoWrite(int size)
{
// Assume value in ABI_PARAM1
// Assume value in EDX
PUSH(ESI);
if (size != 32)
PUSH(EDX);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
SwapAndStore(size, MComplex(RAX, RSI, 1, 0), EDX);
ADD(32, R(ESI), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
if (size != 32)
POP(EDX);
POP(ESI);
RET();
}
@ -39,7 +35,6 @@ void CommonAsmRoutines::GenFifoFloatWrite()
{
// Assume value in XMM0
PUSH(ESI);
PUSH(EDX);
MOVSS(M(&temp32), XMM0);
MOV(32, R(EDX), M(&temp32));
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
@ -47,7 +42,6 @@ void CommonAsmRoutines::GenFifoFloatWrite()
SwapAndStore(32, MComplex(RAX, RSI, 1, 0), EDX);
ADD(32, R(ESI), Imm8(4));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(EDX);
POP(ESI);
RET();
}

View file

@ -59,6 +59,7 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
// It ought to be necessary to align the stack here. Since it seems to not
// affect anybody, I'm not going to add it just to be completely safe about
// performance.
ABI_PushRegistersAndAdjustStack(registersInUse, true);
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
@ -66,7 +67,6 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
if (info.displacement)
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
ABI_PushRegistersAndAdjustStack(registersInUse, true);
switch (info.operandSize)
{
case 4:
@ -115,6 +115,8 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
MOV(32, PPCSTATE(pc), Imm32(pc));
ABI_PushRegistersAndAdjustStack(registersInUse, true);
MOVTwo(64, ABI_PARAM1, dataReg, ABI_PARAM2, addrReg, ABI_PARAM3);
if (info.displacement)
@ -122,7 +124,6 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
ABI_PushRegistersAndAdjustStack(registersInUse, true);
switch (info.operandSize)
{
case 8:

View file

@ -5,7 +5,6 @@
#include <emmintrin.h>
#include "Common/Common.h"
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Core/HW/MMIO.h"
@ -248,13 +247,11 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
}
}
// Always clobbers EAX. Preserves the address.
// Preserves the value if the load fails and js.memcheck is enabled.
void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
{
if (!jit->js.memcheck)
{
registersInUse &= ~(1 << RAX | 1 << reg_value);
registersInUse &= ~(1 << reg_value);
}
if (!Core::g_CoreStartupParameter.bMMU &&
Core::g_CoreStartupParameter.bFastmem &&
@ -395,11 +392,6 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
u8 *EmuCodeBlock::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset, bool swap)
{
if (accessSize == 8 && reg_value >= 4)
{
PanicAlert("WARNING: likely incorrect use of UnsafeWriteRegToReg!");
}
u8* result = GetWritableCodePtr();
OpArg dest = MComplex(RBX, reg_addr, SCALE_1, offset);
if (swap)
@ -410,7 +402,8 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acc
}
else
{
BSWAP(accessSize, reg_value);
if (accessSize > 8)
BSWAP(accessSize, reg_value);
result = GetWritableCodePtr();
MOV(accessSize, dest, R(reg_value));
}
@ -423,10 +416,8 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acc
return result;
}
// Destroys both arg registers
void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags)
{
registersInUse &= ~(1 << RAX);
if (!Core::g_CoreStartupParameter.bMMU &&
Core::g_CoreStartupParameter.bFastmem &&
!(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM))
@ -449,7 +440,17 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
}
if (offset)
ADD(32, R(reg_addr), Imm32((u32)offset));
{
if (flags & SAFE_LOADSTORE_CLOBBER_EAX_INSTEAD_OF_ADDR)
{
LEA(32, EAX, MDisp(reg_addr, (u32)offset));
reg_addr = EAX;
}
else
{
ADD(32, R(reg_addr), Imm32((u32)offset));
}
}
u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;

View file

@ -6,6 +6,7 @@
#include <unordered_map>
#include "Common/CPUDetect.h"
#include "Common/x64Emitter.h"
namespace MMIO { class Mapping; }
@ -52,11 +53,21 @@ public:
{
SAFE_LOADSTORE_NO_SWAP = 1,
SAFE_LOADSTORE_NO_PROLOG = 2,
SAFE_LOADSTORE_NO_FASTMEM = 4
SAFE_LOADSTORE_NO_FASTMEM = 4,
SAFE_LOADSTORE_CLOBBER_EAX_INSTEAD_OF_ADDR = 8
};
void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
// Clobbers EAX or reg_addr depending on the relevant flag. Preserves
// reg_value if the load fails and js.memcheck is enabled.
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
// applies to safe and unsafe WriteRegToReg
bool WriteClobbersRegValue(int accessSize, bool swap)
{
return swap && !cpu_info.bMOVBE && accessSize > 8;
}
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);

View file

@ -40,7 +40,7 @@ instruction and generates code. Dead code elimination works in this step,
by simply skipping unused instructions. The register allocator is a dumb,
greedy allocator: at the moment, it's really a bit too dumb, but it's
actually not as bad as it looks: unless a block is relatively long, spills
are rarely needed. ECX is used as a scratch register: requiring a scratch
are rarely needed. EDX is used as a scratch register: requiring a scratch
register isn't ideal, but the register allocator is too dumb to handle
instructions that need a specific register at the moment.