JIT: make instruction merging generic

Now it should be easier to merge more than 2-instruction-long sequences.
Also correct some minor inconsistencies in behavior between instruction
merging cases.
This commit is contained in:
Fiora 2015-01-03 22:59:28 -08:00
parent 074f246c69
commit e8cfcd3aeb
12 changed files with 112 additions and 115 deletions

View file

@ -522,6 +522,7 @@ void Jit64::Jit(u32 em_address)
jo.enableBlocklink = false;
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
}
Trace();
@ -603,7 +604,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address);
js.skipnext = false;
js.skipInstructions = 0;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;
@ -651,12 +652,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
if (i == (code_block.m_num_instructions - 1))
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
js.next_inst_bp = false;
if (Profiler::g_ProfileBlocks)
{
// WARNING - cmp->branch merging will screw this up.
PROFILER_VPUSH;
// get end tic
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
@ -664,14 +662,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
PROFILER_UPDATE_TIME(b);
PROFILER_VPOP;
}
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
js.next_op = &ops[i + 1];
js.next_inst_bp = SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging && breakpoints.IsAddressBreakPoint(ops[i + 1].address);
js.isLastInstruction = true;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
@ -856,11 +847,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
//NOTICE_LOG(DYNA_REC, "Unflushed register: %s", ppc_inst.c_str());
}
#endif
if (js.skipnext)
{
js.skipnext = false;
i++; // Skip next instruction
}
i += js.skipInstructions;
js.skipInstructions = 0;
}
u32 function = HLE::GetFunctionIndex(js.blockStart);
@ -919,5 +907,6 @@ void Jit64::EnableOptimization()
{
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
}

View file

@ -115,6 +115,7 @@ public:
void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val);
void GenerateOverflow();
bool MergeAllowedNextInstructions(int count);
void FinalizeCarryOverflow(bool oe, bool inv = false);
void FinalizeCarry(Gen::CCFlags cond);
void FinalizeCarry(bool ca);

View file

@ -346,10 +346,12 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
// Merge neighboring fcmp and cror (the primary use of cror).
UGeckoInstruction next = js.next_inst;
if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
UGeckoInstruction next = js.op[1].inst;
if (analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE) &&
MergeAllowedNextInstructions(1) && next.OPCD == 19 && next.SUBOP10 == 449 &&
(next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
{
js.skipnext = true;
js.skipInstructions = 1;
js.downcountAmount++;
int dst = 3 - (next.CRBD & 3);
output[3 - (next.CRBD & 3)] &= ~(1 << dst);

View file

@ -50,14 +50,30 @@ void Jit64::GenerateOverflow()
SetJumpTarget(exit);
}
// Returns true if the next |count| instructions in the block may be fused
// with the current one and compiled as a single unit.
//
// Merging is disallowed when:
//  - the CPU is single-stepping (each PPC instruction must be observable),
//  - fewer than |count| instructions remain in the block,
//  - debugging is enabled and a breakpoint sits on any of the candidate
//    instructions (a breakpoint kills flags in between instructions),
//  - any candidate instruction is a branch target (control flow could
//    enter the middle of the merged sequence).
bool Jit64::MergeAllowedNextInstructions(int count)
{
	if (PowerPC::GetState() == PowerPC::CPU_STEPPING || js.instructionsLeft < count)
		return false;
	// Loop-invariant: the debugger setting cannot change while we compile a block.
	const bool debugging = SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging;
	// Be careful: a breakpoint kills flags in between instructions
	for (int i = 1; i <= count; i++)
	{
		if (debugging && PowerPC::breakpoints.IsAddressBreakPoint(js.op[i].address))
			return false;
		if (js.op[i].isBranchTarget)
			return false;
	}
	return true;
}
void Jit64::FinalizeCarry(CCFlags cond)
{
js.carryFlagSet = false;
js.carryFlagInverted = false;
if (js.op->wantsCA)
{
// Be careful: a breakpoint kills flags in between instructions
if (!js.isLastInstruction && js.next_op->wantsCAInFlags && !js.next_inst_bp)
// Not actually merging instructions, but the effect is equivalent (we can't have breakpoints/etc in between).
if (MergeAllowedNextInstructions(1) && js.op[1].wantsCAInFlags)
{
if (cond == CC_C || cond == CC_NC)
{
@ -86,7 +102,7 @@ void Jit64::FinalizeCarry(bool ca)
js.carryFlagInverted = false;
if (js.op->wantsCA)
{
if (!js.isLastInstruction && js.next_op->wantsCAInFlags && !js.next_inst_bp)
if (MergeAllowedNextInstructions(1) && js.op[1].wantsCAInFlags)
{
if (ca)
STC();
@ -331,7 +347,10 @@ bool Jit64::CheckMergedBranch(int crf)
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE))
return false;
const UGeckoInstruction& next = js.next_inst;
if (!MergeAllowedNextInstructions(1))
return false;
const UGeckoInstruction& next = js.op[1].inst;
return (((next.OPCD == 16 /* bcx */) ||
((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) ||
((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) &&
@ -343,33 +362,35 @@ bool Jit64::CheckMergedBranch(int crf)
void Jit64::DoMergedBranch()
{
// Code that handles successful PPC branching.
if (js.next_inst.OPCD == 16) // bcx
const UGeckoInstruction& next = js.op[1].inst;
const u32 nextPC = js.op[1].address;
if (next.OPCD == 16) // bcx
{
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
if (next.LK)
MOV(32, M(&LR), Imm32(nextPC + 4));
u32 destination;
if (js.next_inst.AA)
destination = SignExt16(js.next_inst.BD << 2);
if (next.AA)
destination = SignExt16(next.BD << 2);
else
destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2);
WriteExit(destination, js.next_inst.LK, js.next_compilerPC + 4);
destination = nextPC + SignExt16(next.BD << 2);
WriteExit(destination, next.LK, nextPC + 4);
}
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx
else if ((next.OPCD == 19) && (next.SUBOP10 == 528)) // bcctrx
{
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
if (next.LK)
MOV(32, M(&LR), Imm32(nextPC + 4));
MOV(32, R(RSCRATCH), M(&CTR));
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4);
WriteExitDestInRSCRATCH(next.LK, nextPC + 4);
}
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
else if ((next.OPCD == 19) && (next.SUBOP10 == 16)) // bclrx
{
MOV(32, R(RSCRATCH), M(&LR));
if (!m_enable_blr_optimization)
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
if (next.LK)
MOV(32, M(&LR), Imm32(nextPC + 4));
WriteBLRExit();
}
else
@ -381,9 +402,11 @@ void Jit64::DoMergedBranch()
void Jit64::DoMergedBranchCondition()
{
js.downcountAmount++;
js.skipnext = true;
int test_bit = 8 >> (js.next_inst.BI & 3);
bool condition = !!(js.next_inst.BO & BO_BRANCH_IF_TRUE);
js.skipInstructions = 1;
const UGeckoInstruction& next = js.op[1].inst;
int test_bit = 8 >> (next.BI & 3);
bool condition = !!(next.BO & BO_BRANCH_IF_TRUE);
const u32 nextPC = js.op[1].address;
gpr.UnlockAll();
gpr.UnlockAllX();
@ -408,16 +431,18 @@ void Jit64::DoMergedBranchCondition()
{
gpr.Flush();
fpr.Flush();
WriteExit(js.next_compilerPC + 4);
WriteExit(nextPC + 4);
}
}
void Jit64::DoMergedBranchImmediate(s64 val)
{
js.downcountAmount++;
js.skipnext = true;
int test_bit = 8 >> (js.next_inst.BI & 3);
bool condition = !!(js.next_inst.BO & BO_BRANCH_IF_TRUE);
js.skipInstructions = 1;
const UGeckoInstruction& next = js.op[1].inst;
int test_bit = 8 >> (next.BI & 3);
bool condition = !!(next.BO & BO_BRANCH_IF_TRUE);
const u32 nextPC = js.op[1].address;
gpr.UnlockAll();
gpr.UnlockAllX();
@ -441,7 +466,7 @@ void Jit64::DoMergedBranchImmediate(s64 val)
{
gpr.Flush();
fpr.Flush();
WriteExit(js.next_compilerPC + 4);
WriteExit(nextPC + 4);
}
}

View file

@ -95,15 +95,12 @@ void Jit64::lXXx(UGeckoInstruction inst)
}
// PowerPC has no 8-bit sign extended load, but x86 does, so merge extsb with the load if we find it.
if (accessSize == 8 && js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 954 &&
js.next_inst.RS == inst.RD && js.next_inst.RA == inst.RD && !js.next_inst.Rc)
if (MergeAllowedNextInstructions(1) && accessSize == 8 && js.op[1].inst.OPCD == 31 && js.op[1].inst.SUBOP10 == 954 &&
js.op[1].inst.RS == inst.RD && js.op[1].inst.RA == inst.RD && !js.op[1].inst.Rc)
{
if (PowerPC::GetState() != PowerPC::CPU_STEPPING)
{
js.downcountAmount++;
js.skipnext = true;
signExtend = true;
}
js.downcountAmount++;
js.skipInstructions = 1;
signExtend = true;
}
// TODO(ector): Make it dynamically enable/disable idle skipping where appropriate

View file

@ -282,38 +282,38 @@ void Jit64::mfspr(UGeckoInstruction inst)
ADD(64, R(RAX), R(RDX));
MOV(64, PPCSTATE(spr[SPR_TL]), R(RAX));
// Two calls of TU/TL next to each other are extremely common in typical usage, so merge them
// if we can.
u32 nextIndex = (js.next_inst.SPRU << 5) | (js.next_inst.SPRL & 0x1F);
// Be careful; the actual opcode is for mftb (371), not mfspr (339)
int n = js.next_inst.RD;
if (js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL) &&
PowerPC::GetState() != PowerPC::CPU_STEPPING && n != d)
if (MergeAllowedNextInstructions(1))
{
js.downcountAmount++;
js.skipnext = true;
gpr.Lock(d, n);
gpr.BindToRegister(d, false);
gpr.BindToRegister(n, false);
if (iIndex == SPR_TL)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TL)
MOV(32, gpr.R(n), R(RAX));
SHR(64, R(RAX), Imm8(32));
if (iIndex == SPR_TU)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TU)
MOV(32, gpr.R(n), R(RAX));
}
else
{
gpr.Lock(d);
gpr.BindToRegister(d, false);
if (iIndex == SPR_TU)
const UGeckoInstruction& next = js.op[1].inst;
// Two calls of TU/TL next to each other are extremely common in typical usage, so merge them
// if we can.
u32 nextIndex = (next.SPRU << 5) | (next.SPRL & 0x1F);
// Be careful; the actual opcode is for mftb (371), not mfspr (339)
int n = next.RD;
if (next.OPCD == 31 && next.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL) && n != d)
{
js.downcountAmount++;
js.skipInstructions = 1;
gpr.Lock(d, n);
gpr.BindToRegister(d, false);
gpr.BindToRegister(n, false);
if (iIndex == SPR_TL)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TL)
MOV(32, gpr.R(n), R(RAX));
SHR(64, R(RAX), Imm8(32));
MOV(32, gpr.R(d), R(RAX));
if (iIndex == SPR_TU)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TU)
MOV(32, gpr.R(n), R(RAX));
break;
}
}
gpr.UnlockAllX();
gpr.Lock(d);
gpr.BindToRegister(d, false);
if (iIndex == SPR_TU)
SHR(64, R(RAX), Imm8(32));
MOV(32, gpr.R(d), R(RAX));
break;
}
case SPR_XER:
@ -341,6 +341,7 @@ void Jit64::mfspr(UGeckoInstruction inst)
MOV(32, gpr.R(d), PPCSTATE(spr[iIndex]));
break;
}
gpr.UnlockAllX();
gpr.UnlockAll();
}

View file

@ -610,16 +610,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.downcountAmount += opinfo->numCycles;
if (i == (code_block.m_num_instructions - 1))
{
js.isLastInstruction = true;
js.next_inst = 0;
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
u32 function = HLE::GetFunctionIndex(ops[i].address);
if (function != 0)

View file

@ -443,7 +443,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
js.downcountAmount += PatchEngine::GetSpeedhackCycles(em_address);
js.skipnext = false;
js.skipInstructions = 0;
js.compilerPC = nextPC;
// Translate instructions
@ -459,13 +459,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)

View file

@ -232,7 +232,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.downcountAmount = 0;
js.skipnext = false;
js.skipInstructions = 0;
js.curBlock = b;
u32 nextPC = em_address;
@ -281,13 +281,6 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)

View file

@ -65,9 +65,7 @@ protected:
struct JitState
{
u32 compilerPC;
u32 next_compilerPC;
u32 blockStart;
UGeckoInstruction next_inst; // for easy peephole opt.
int instructionNumber;
int instructionsLeft;
int downcountAmount;
@ -88,10 +86,9 @@ protected:
bool firstFPInstructionFound;
bool isLastInstruction;
bool memcheck;
bool skipnext;
int skipInstructions;
bool carryFlagSet;
bool carryFlagInverted;
bool next_inst_bp;
int fifoBytesThisBlock;
@ -99,7 +96,6 @@ protected:
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp* op;
PPCAnalyst::CodeOp* next_op;
u8* rewriteStart;
JitBlock *curBlock;

View file

@ -219,6 +219,11 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
const GekkoOPInfo *b_info = b.opinfo;
int a_flags = a_info->flags;
int b_flags = b_info->flags;
// can't reorder around breakpoints
if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging &&
(PowerPC::breakpoints.IsAddressBreakPoint(a.address) || PowerPC::breakpoints.IsAddressBreakPoint(b.address)))
return false;
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
return false;
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
@ -462,7 +467,8 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
// Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more
// picky about this, but cror seems to almost solely be used for this purpose in real code.
// Additionally, the other boolean ops seem to almost never be used.
ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
if (HasOption(OPTION_CROR_MERGE))
ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
// For carry, bubble instructions *towards* each other; one direction often isn't enough
// to get pairs like addc/adde next to each other.
if (HasOption(OPTION_CARRY_MERGE))

View file

@ -214,6 +214,9 @@ public:
// Reorder carry instructions next to their associated branches and pass
// carry flags in the x86 flags between them, instead of in XER.
OPTION_CARRY_MERGE = (1 << 5),
// Reorder cror instructions next to their associated fcmp.
OPTION_CROR_MERGE = (1 << 6),
};