JitArm64: Initial implementation of the BLR optimization.

This commit is contained in:
degasus 2017-02-01 00:10:32 +01:00
parent f20113fce2
commit 384efb0cb2
8 changed files with 209 additions and 28 deletions

View file

@ -46,9 +46,7 @@ void JitArm64::Init()
UpdateMemoryOptions();
gpr.Init(this);
fpr.Init(this);
blocks.Init();
GenerateAsm();
code_block.m_stats = &js.st;
code_block.m_gpa = &js.gpa;
@ -56,6 +54,9 @@ void JitArm64::Init()
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
m_enable_blr_optimization = true;
GenerateAsm();
m_supports_cycle_counter = HasCycleCounters();
}
@ -192,8 +193,16 @@ void JitArm64::DoDownCount()
gpr.Unlock(WA, WB);
}
// Exits
void JitArm64::WriteExit(u32 destination)
// Emits code that rewinds the host stack pointer used by the BLR optimization.
// SP is set to the value saved in ppcState.stored_stack_pointer minus 16 bytes
// (presumably keeping the single 16-byte sentinel entry that GenerateAsm()
// pushes right after saving SP -- confirm). X0 is used as scratch.
// Nothing is emitted when the BLR optimization is disabled.
void JitArm64::ResetStack()
{
  if (m_enable_blr_optimization)
  {
    // Reload the saved stack pointer, then place SP 16 bytes below it.
    LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
    SUB(SP, X0, 16);
  }
}
// Emits a block exit to a compile-time-known PPC address and records it as a
// linkable exit of the current block.
//
// destination: PPC address to continue at.
// LK: true if this exit is a call (branch with link). It is ignored (forced to
//     false) when the BLR optimization is disabled.
// exit_address_after_return: PPC address to resume at once the callee returns;
//     only used when LK is set.
//
// For a call exit, a {host return address, PPC return address} pair is pushed
// on the host stack, the exit is emitted with BL instead of B, and a second,
// regular exit to exit_address_after_return is emitted behind it.
void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return)
{
Cleanup();
DoDownCount();
@ -201,31 +210,159 @@ void JitArm64::WriteExit(u32 destination)
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
// If nobody has taken care of this yet (this can be removed when all branches are done)
// Calls are only treated specially while the BLR optimization is enabled.
LK &= m_enable_blr_optimization;
if (LK)
{
// Push {ARM_PC+20; PPC_PC} on the stack
// X1 holds the PPC return address, X0 the host address 20 bytes (five
// instructions) past the ADR; the pair forms one 16-byte stack entry.
MOVI2R(X1, exit_address_after_return);
ADR(X0, 20);
STP(INDEX_PRE, X0, X1, SP, -16);
}
// Register the exit so it can be rewritten later; exitPtrs points at the
// code emitted from here on (the MOVI2R + branch below).
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = destination;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = LK;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, destination);
// NOTE(review): this unconditional branch duplicates the `!LK` branch right
// below and would make the BL path unreachable. It looks like a leftover of
// the pre-change version of this function -- confirm before relying on it.
B(dispatcher);
if (!LK)
{
B(dispatcher);
}
else
{
// Call exit: branch with link, so the callee can come back with RET.
BL(dispatcher);
// MOVI2R might only require one instruction. So the const offset of 20 bytes
// might be wrong. Be sure and just add a NOP here.
HINT(HINT_NOP);
// Write the regular exit node after the return.
// The same linkData object is reused, now describing a plain (non-call)
// exit to the return address.
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
}
}
void JitArm64::WriteExit(ARM64Reg Reg)
// Emits a block exit to a PPC address that is only known at run time.
//
// dest: register holding the PPC address to continue at. It is copied into
//     DISPATCHER_PC if it is a different register, and unlocked here.
// LK: true if this exit is a call (branch with link). It is ignored (forced to
//     false) when the BLR optimization is disabled.
// exit_address_after_return: PPC address to resume at once the callee returns;
//     only used when LK is set.
//
// The jump itself always goes through the dispatcher; only the exit emitted
// behind a call is recorded as a linkable exit of the current block.
void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return)
{
Cleanup();
DoDownCount();
// NOTE(review): the next three lines refer to `Reg`, which is not a parameter
// of this signature, and repeat what is done with `dest` further down. They
// look like leftovers of the pre-change version of this function -- confirm.
if (Reg != DISPATCHER_PC)
MOV(DISPATCHER_PC, Reg);
gpr.Unlock(Reg);
// Calls are only treated specially while the BLR optimization is enabled.
LK &= m_enable_blr_optimization;
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
gpr.Unlock(dest);
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
if (!LK)
{
B(dispatcher);
}
else
{
// Push {ARM_PC, PPC_PC} on the stack
// ADR +12 skips ADR, STP and BL (three 4-byte instructions), so the pushed
// host address is the exit node emitted right after the BL.
MOVI2R(X1, exit_address_after_return);
ADR(X0, 12);
STP(INDEX_PRE, X0, X1, SP, -16);
BL(dispatcher);
// Write the regular exit node after the return.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
}
}
// Emits a fake call record for a branch-with-link that was inlined into the
// current block (execution simply continues in the same block).
//
// exit_address_after_return: PPC address to resume at when the inlined callee
//     eventually returns.
//
// A {host address, PPC address} pair is pushed on the host stack and an exit
// node for exit_address_after_return is emitted; normal execution jumps over
// that node. Nothing is emitted when the BLR optimization is disabled.
void JitArm64::FakeLKExit(u32 exit_address_after_return)
{
if (!m_enable_blr_optimization)
return;
// We may need to fake the BLR stack on inlined CALL instructions.
// Else we can't return to this location any more.
ARM64Reg after_reg = gpr.GetReg();
ARM64Reg code_reg = gpr.GetReg();
MOVI2R(after_reg, exit_address_after_return);
// ADR +12 skips ADR, STP and BL (three 4-byte instructions), so the pushed
// host address is the exit node emitted right after the BL below.
ADR(EncodeRegTo64(code_reg), 12);
// Push the pair as one 16-byte stack entry.
STP(INDEX_PRE, EncodeRegTo64(code_reg), EncodeRegTo64(after_reg), SP, -16);
gpr.Unlock(after_reg, code_reg);
// Branch-with-link over the exit node; the target is bound by SetJumpTarget
// at the end. Presumably BL rather than B is used so the host return
// predictor pairs it with the later RET -- confirm.
FixupBranch skip_exit = BL();
// Write the regular exit node after the return.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
// Normal execution resumes here, past the exit node.
SetJumpTarget(skip_exit);
}
// Emits the block exit for a PPC return whose target address is in `dest`.
//
// dest: register holding the PPC address to return to; unlocked at the end.
//
// The emitted code pops one {host address, PPC address} entry from the host
// stack. If the popped PPC address equals `dest`, it returns straight to the
// popped host address. Otherwise it resets the stack and leaves through the
// dispatcher. When the BLR optimization is disabled, this is a plain
// WriteExit(dest).
void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest)
{
if (!m_enable_blr_optimization)
{
WriteExit(dest);
return;
}
Cleanup();
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
ARM64Reg code = gpr.GetReg();
ARM64Reg pc = gpr.GetReg();
// Check if {ARM_PC, PPC_PC} matches the current state.
// The entry is popped unconditionally (post-indexed load, SP += 16).
LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16);
CMP(pc, dest);
FixupBranch no_match = B(CC_NEQ);
// Match: account for the cycles, then return to the recorded host address.
DoDownCount();
RET(EncodeRegTo64(code));
// Mismatch: DoDownCount is emitted a second time because this is a separate
// path in the generated code.
SetJumpTarget(no_match);
DoDownCount();
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
// The remaining stack entries are discarded before entering the dispatcher.
ResetStack();
B(dispatcher);
gpr.Unlock(dest, pc, code);
}
void JitArm64::WriteExceptionExit(u32 destination, bool only_external)
@ -399,11 +536,11 @@ void JitArm64::Jit(u32)
}
JitBlock* b = blocks.AllocateBlock(em_address);
const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC);
DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}
const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
{
if (em_address == 0)
{
@ -629,5 +766,4 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
FlushIcache();
farcode.FlushIcache();
return start;
}

View file

@ -190,6 +190,8 @@ private:
// Do we support cycle counter profiling?
bool m_supports_cycle_counter;
bool m_enable_blr_optimization;
void EmitResetCycleCounters();
void EmitGetCycles(Arm64Gen::ARM64Reg reg);
@ -219,10 +221,11 @@ private:
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
void DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
void DoDownCount();
void Cleanup();
void ResetStack();
// AsmRoutines
void GenerateAsm();
@ -234,10 +237,12 @@ private:
void EndTimeProfile(JitBlock* b);
// Exits
void WriteExit(u32 destination);
void WriteExit(Arm64Gen::ARM64Reg dest);
void WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0);
void WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0);
void WriteExceptionExit(u32 destination, bool only_external = false);
void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false);
void FakeLKExit(u32 exit_address_after_return);
void WriteBLRExit(Arm64Gen::ARM64Reg dest);
FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);

View file

@ -19,21 +19,31 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const
if (dest)
{
// Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
if (distance >= -0x40000 && distance <= 0x3FFFF)
if (source.call)
{
emit.B(CC_PL, dest->normalEntry);
emit.BL(dest->checkedEntry);
}
else
{
// Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
if (distance >= -0x40000 && distance <= 0x3FFFF)
{
emit.B(CC_PL, dest->normalEntry);
}
// Use the checked entry if either downcount is smaller zero,
// or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry);
// Use the checked entry if either the downcount is less than zero,
// or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry);
}
}
else
{
emit.MOVI2R(DISPATCHER_PC, source.exitAddress);
emit.B(m_jit.GetAsmRoutines()->dispatcher);
if (source.call)
emit.BL(m_jit.GetAsmRoutines()->dispatcher);
else
emit.B(m_jit.GetAsmRoutines()->dispatcher);
}
emit.FlushIcache();
}

View file

@ -92,6 +92,13 @@ void JitArm64::bx(UGeckoInstruction inst)
if (!js.isLastInstruction)
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack as the RET instruction was not
// found in the same block. This is a big overhead, but still
// better than calling the dispatcher.
FakeLKExit(js.compilerPC + 4);
}
return;
}
@ -112,7 +119,7 @@ void JitArm64::bx(UGeckoInstruction inst)
return;
}
WriteExit(destination);
WriteExit(destination, inst.LK, js.compilerPC + 4);
}
void JitArm64::bcx(UGeckoInstruction inst)
@ -162,7 +169,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
WriteExit(destination);
WriteExit(destination, inst.LK, js.compilerPC + 4);
SwitchToNearCode();
@ -211,7 +218,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR]));
AND(WA, WA, 30, 29); // Wipe the bottom 2 bits.
WriteExit(WA);
WriteExit(WA, inst.LK_3, js.compilerPC + 4);
}
void JitArm64::bclrx(UGeckoInstruction inst)
@ -264,7 +272,7 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
WriteExit(WA);
WriteBLRExit(WA);
if (conditional)
SwitchToNearCode();

View file

@ -56,6 +56,10 @@ void JitArm64::mtmsr(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);
// Our JIT cache also stores some MSR bits. As they have changed, we either
// have to validate them in the BLR/RET check, or just flush the stack here.
ResetStack();
WriteExceptionExit(js.compilerPC + 4, true);
}

View file

@ -28,6 +28,14 @@ void JitArm64::GenerateAsm()
MOVP2R(PPC_REG, &PowerPC::ppcState);
// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(X0, SP, 0);
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
// Push {nullptr; -1} as invalid destination on the stack.
MOVI2R(X0, 0xFFFFFFFF);
STP(INDEX_PRE, ZR, X0, SP, -16);
// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
// Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B();
@ -119,6 +127,7 @@ void JitArm64::GenerateAsm()
// Call JIT
SetJumpTarget(no_block_available);
ResetStack();
MOV(W0, DISPATCHER_PC);
MOVP2R(X30, reinterpret_cast<void*>(&JitTrampoline));
BLR(X30);
@ -150,6 +159,11 @@ void JitArm64::GenerateAsm()
B(dispatcherNoCheck);
SetJumpTarget(Exit);
// Reset the stack pointer, as the BLR optimization has touched it.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
ADD(SP, X0, 0);
ABI_PopRegisters(regs_to_save);
RET(X30);

View file

@ -58,6 +58,7 @@ struct JitBlock
u8* exitPtrs; // to be able to rewrite the exit jump
u32 exitAddress;
bool linkStatus; // is it already linked?
bool call;
};
std::vector<LinkData> linkData;

View file

@ -116,6 +116,9 @@ struct PowerPCState
// also for power management, but we don't care about that.
u32 spr[1024];
// Storage for the stack pointer of the BLR optimization.
u8* stored_stack_pointer;
std::array<std::array<tlb_entry, TLB_SIZE / TLB_WAYS>, NUM_TLBS> tlb;
u32 pagetable_base;