Merge pull request #4735 from degasus/jitcache

Jit64: Enable branch following.
Scott Mansell 2017-01-28 15:48:01 +13:00 committed by GitHub
commit 5da565a1a1
9 changed files with 125 additions and 48 deletions

@@ -443,6 +443,16 @@ void XEmitter::CALL(const void* fnptr)
Write32(u32(distance));
}
FixupBranch XEmitter::CALL()
{
FixupBranch branch;
branch.type = 1;  // 32-bit displacement, to be patched by SetJumpTarget()
branch.ptr = code + 5;  // points just past the 5-byte CALL instruction
Write8(0xE8);  // CALL rel32 opcode
Write32(0);  // placeholder displacement
return branch;
}
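For illustration only (not part of the diff): the new fixup-style CALL() is used like the existing J()/J_CC() fixups, with SetJumpTarget() back-patching the placeholder once the destination is known; FakeBLCall() below is its first caller, and the names in this sketch are made up.

// Minimal usage sketch inside an emitter/Jit64 member:
FixupBranch call_to_continuation = CALL();  // emits E8 00 00 00 00 and records code + 5
// ... emit the code the call should skip over ...
SetJumpTarget(call_to_continuation);  // back-patches the 32-bit displacement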
FixupBranch XEmitter::J(bool force5bytes)
{
FixupBranch branch;

@@ -467,6 +467,7 @@ public:
#undef CALL
#endif
void CALL(const void* fnptr);
FixupBranch CALL();
void CALLptr(OpArg arg);
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);

@@ -372,6 +372,21 @@ bool Jit64::Cleanup()
return did_something;
}
void Jit64::FakeBLCall(u32 after)
{
if (!m_enable_blr_optimization)
return;
// We may need to fake the BLR stack on inlined CALL instructions.
// Otherwise we can't return to this location anymore.
MOV(32, R(RSCRATCH2), Imm32(after));  // guest return address for the BLR stack
PUSH(RSCRATCH2);
FixupBranch skip_exit = CALL();  // pushes the host return address; compilation continues at skip_exit
POP(RSCRATCH2);  // reached on a predicted BLR return: drop the guest address again
JustWriteExit(after, false, 0);  // then exit to the code following the bl
SetJumpTarget(skip_exit);
}
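To see why the guest return address is pushed even though the call is inlined, here is a minimal stand-alone sketch of the prediction scheme the fake entry feeds (not Dolphin code; the class and its layout are invented for illustration): a later blr only takes the fast return path when the top entry matches LR, and otherwise falls back to the dispatcher.

#include <cstdint>
#include <vector>

// Toy model of the BLR prediction stack, for illustration only.
struct FakeBlrStack
{
  std::vector<uint32_t> guest_return_addresses;

  // bl (including an inlined bl, as in FakeBLCall above): remember the return target.
  void OnBl(uint32_t after) { guest_return_addresses.push_back(after); }

  // blr: fast path only if the prediction matches; otherwise go through the dispatcher.
  bool OnBlr(uint32_t lr)
  {
    if (guest_return_addresses.empty() || guest_return_addresses.back() != lr)
      return false;  // mispredicted
    guest_return_addresses.pop_back();
    return true;
  }
};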
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
{
if (!m_enable_blr_optimization)
@@ -569,6 +584,7 @@ void Jit64::Jit(u32 em_address)
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
}
Trace();
}
@@ -973,6 +989,7 @@ void Jit64::EnableOptimization()
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
}
void Jit64::IntializeSpeculativeConstants()

@@ -85,6 +85,7 @@ public:
// Utilities for use by opcodes
void FakeBLCall(u32 after);
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
void JustWriteExit(u32 destination, bool bl, u32 after);
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);

@@ -74,6 +74,13 @@ void Jit64::bx(UGeckoInstruction inst)
// Because PPCAnalyst::Flatten() merged the blocks.
if (!js.isLastInstruction)
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack, as the RET instruction was not
// found in the same block. This adds a lot of overhead, but is still
// better than calling the dispatcher.
FakeBLCall(js.compilerPC + 4);
}
return;
}
@@ -131,6 +138,22 @@ void Jit64::bcx(UGeckoInstruction inst)
if (inst.LK)
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
// If this is not the last instruction of a block and it is an
// unconditional branch, we skip the rest of the processing here
// because PPCAnalyst::Flatten() merged the blocks.
if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
(inst.BO & BO_DONT_CHECK_CONDITION))
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack, as the RET instruction was not
// found in the same block. This adds a lot of overhead, but is still
// better than calling the dispatcher.
FakeBLCall(js.compilerPC + 4);
}
return;
}
u32 destination;
if (inst.AA)
destination = SignExt16(inst.BD << 2);

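For reference, the BO test used in the bcx hunk above (and again in the PPCAnalyst changes below) can be read as one small predicate. A minimal sketch, assuming Dolphin's u32 typedef and the BO_* constants used above are in scope:

// A bcx behaves like an unconditional branch when it neither decrements CTR
// nor tests a condition bit.
constexpr bool IsUnconditionalBcx(u32 bo)
{
  return (bo & BO_DONT_DECREMENT_FLAG) && (bo & BO_DONT_CHECK_CONDITION);
}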
@@ -55,6 +55,7 @@ void JitArm64::Init()
code_block.m_fpa = &js.fpa;
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
m_supports_cycle_counter = HasCycleCounters();
}

@@ -76,9 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff);
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);
u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
@@ -93,6 +90,14 @@ void JitArm64::bx(UGeckoInstruction inst)
gpr.Unlock(WA);
}
if (!js.isLastInstruction)
{
// The branch was followed by the analyzer, so its target is compiled into
// this block and we simply fall through to it.
return;
}
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);
if (destination == js.compilerPC)
{
// make idle loops go faster

@@ -32,8 +32,9 @@
namespace PPCAnalyst
{
constexpr int CODEBUFFER_SIZE = 32000;
// 0 does not perform block merging
constexpr u32 FUNCTION_FOLLOWING_THRESHOLD = 16;
constexpr u32 BRANCH_FOLLOWING_THRESHOLD = 2;
constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
@@ -651,7 +652,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
CodeOp* code = buffer->codebuffer;
bool found_exit = false;
u32 return_address = 0;
bool found_call = false;
size_t caller = 0;
u32 numFollows = 0;
u32 num_inst = 0;
@@ -686,50 +688,65 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
bool conditional_continue = false;
// Do we inline leaf functions?
if (HasOption(OPTION_LEAF_INLINE))
// TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
// If it is small, performance will suffer.
// If it is big, the generated code will be large and
// cache clearing will happen many times.
if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
{
if (inst.OPCD == 18 && blockSize > 1)
{
// Is bx - should we inline? yes!
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = address + SignExt26(inst.LI << 2);
if (destination != block->m_address)
follow = true;
// Always follow BX instructions.
// TODO: Loop unrolling might bloat the code size too much.
// Enable it carefully.
follow = destination != block->m_address;
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
caller = i;
}
}
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && (inst.BO & (1 << 4)) &&
(inst.BO & (1 << 2)) && return_address != 0)
else if (inst.OPCD == 16 && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
(inst.BO & BO_DONT_CHECK_CONDITION) && blockSize > 1)
{
// Always follow unconditional BCX instructions, but they are very rare.
follow = true;
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
caller = i;
}
}
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
{
// bclrx with an unconditional branch = return.
// Follow it if we can propagate the LR value of the last CALL instruction.
// Though it would be easy to track the upper levels of call/return,
// we can't guarantee the LR value. The PPC ABI forces all functions to push
// the LR value on the stack as there are no spare registers, so we'd need
// to check that no store instruction aliases the stack.
follow = true;
destination = return_address;
return_address = 0;
destination = code[caller].address + 4;
found_call = false;
code[i].skip = true;
if (inst.LK)
return_address = address + 4;
// The RET is skipped, so also don't generate the BLR-optimization stack entry for the matching call.
code[caller].skipLRStack = true;
}
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
{
// mtspr
// mtspr, skip CALL/RET merging as LR is overwritten.
const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
if (index == SPR_LR)
{
// We give up following the return address
// because we would have to track the register usage.
return_address = 0;
found_call = false;
}
}
// TODO: Find the optimal value for FUNCTION_FOLLOWING_THRESHOLD.
// If it is small, performance will suffer.
// If it is big, the generated code will be large and
// cache clearing will happen many times.
// TODO: Investigate why
// "0" is fastest in some games, MP2 for example.
if (numFollows > FUNCTION_FOLLOWING_THRESHOLD)
follow = false;
}
if (HasOption(OPTION_CONDITIONAL_CONTINUE))
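The skip and skipLRStack flags set in the hunk above are consumed by the JIT backends. A rough sketch of the expected emit-loop behaviour, assuming an analyzed block (the real loop lives in Jit64::DoJit and the loop locals here are illustrative):

for (u32 i = 0; i < block->m_num_instructions; i++)
{
  const PPCAnalyst::CodeOp& op = code[i];
  if (op.skip)
    continue;  // the followed blr: nothing is emitted for it
  // For the matching bl, op.skipLRStack tells Jit64::bx()/bcx() that LR must
  // still be written, but that no fake BLR-stack entry (FakeBLCall) is needed,
  // since the return has already been inlined into this block.
  // ... normal per-instruction emission ...
}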
@@ -759,27 +776,28 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
}
}
if (!follow)
if (follow)
{
// Follow the unconditional branch.
numFollows++;
address = destination;
}
else
{
// Just pick the next instruction
address += 4;
if (!conditional_continue && opinfo->flags & FL_ENDBLOCK) // right now we stop early
{
found_exit = true;
break;
}
if (conditional_continue)
{
// If we skip any conditional branch, we can't guarantee to get the matching CALL/RET pair.
// So we stop inlining the RET here and let the BLR optimization handle this case.
found_call = false;
}
}
// XXX: We don't support inlining yet.
#if 0
else
{
numFollows++;
// We don't "code[i].skip = true" here
// because bx may store a certain value to the link register.
// Instead, we skip a part of bx in Jit**::bx().
address = destination;
merged_addresses[size_of_merged_addresses++] = address;
}
#endif
}
block->m_num_instructions = num_inst;

@@ -42,6 +42,7 @@ struct CodeOp // 16B
bool outputFPRF;
bool outputCA;
bool canEndBlock;
bool skipLRStack;  // the matching blr was inlined, so don't fake a BLR-stack entry for this bl
bool skip;  // followed BLs, for example
// which registers are still needed after this instruction in this block
BitSet32 fprInUse;
@@ -189,11 +190,11 @@ public:
// Requires JIT support to be enabled.
OPTION_CONDITIONAL_CONTINUE = (1 << 0),
// If there is an unconditional branch that jumps to a leaf function then inline it.
// Try to inline unconditional branches/calls/returns.
// Also track the LR value to follow unconditional return instructions.
// Might require JIT intervention to support it correctly.
// Requires JITBlock support for inlined code
// XXX: NOT COMPLETE
OPTION_LEAF_INLINE = (1 << 1),
// Especially if the BLR optimization is used.
OPTION_BRANCH_FOLLOW = (1 << 1),
// Complex blocks support jumping backwards onto themselves.
// Happens commonly in loops, pretty complex to support.