From 45d84605a9a2a0115976af061d318345985ac422 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 7 Sep 2014 00:37:47 -0700 Subject: [PATCH] JIT64: optimize carry calculations further Keep carry flags in the x86 flags register if used in the next instruction. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 3 + Source/Core/Core/PowerPC/Jit64/Jit.h | 4 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 2 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 129 ++++++++++++------ Source/Core/Core/PowerPC/JitCommon/JitBase.h | 5 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 1 + Source/Core/Core/PowerPC/PPCAnalyst.cpp | 6 + Source/Core/Core/PowerPC/PPCAnalyst.h | 1 + 8 files changed, 109 insertions(+), 42 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 7412489948..619217e4b6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -457,6 +457,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address); js.skipnext = false; + js.carryFlagSet = false; + js.carryFlagInverted = false; js.compilerPC = nextPC; // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) @@ -488,6 +490,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc // help peephole optimizations js.next_inst = ops[i + 1].inst; js.next_compilerPC = ops[i + 1].address; + js.next_op = &ops[i + 1]; } if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 498d833dd7..53a846237f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -100,7 +100,9 @@ public: void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); - void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false); + void FinalizeCarryOverflow(bool oe, bool inv = false); + void FinalizeCarry(Gen::CCFlags cond); + void FinalizeCarry(bool ca); void ComputeRC(const Gen::OpArg & arg); // Use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index f9e0ac97d5..88f686023a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -273,7 +273,7 @@ static GekkoOPTemplate table31[] = {339, &Jit64::mfspr}, //"mfspr", OPTYPE_SPR, FL_OUT_D}}, {467, &Jit64::mtspr}, //"mtspr", OPTYPE_SPR, 0, 2}}, {371, &Jit64::mftb}, //"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER}}, - {512, &Jit64::mcrxr}, //"mcrxr", OPTYPE_SYSTEM, 0}}, + {512, &Jit64::mcrxr}, //"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA}}, {595, &Jit64::FallBackToInterpreter}, //"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 2}}, {659, &Jit64::FallBackToInterpreter}, //"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 2}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 974ae1569d..203a5fae00 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -44,30 +44,76 @@ void Jit64::GenerateOverflow() SetJumpTarget(exit); } +void Jit64::FinalizeCarry(CCFlags cond) +{ + js.carryFlagSet = false; + js.carryFlagInverted = false; + if (js.op->wantsCA) + { + if (js.next_op->wantsCAInFlags) + { + if (cond == CC_C || cond == CC_NC) + { + js.carryFlagInverted = cond == CC_NC; + } + else + { + // convert the condition to a carry flag (is there a better way?) + SETcc(cond, R(RSCRATCH)); + BT(8, R(RSCRATCH), Imm8(0)); + } + js.carryFlagSet = true; + } + else + { + JitSetCAIf(cond); + } + } +} + +// Unconditional version +void Jit64::FinalizeCarry(bool ca) +{ + js.carryFlagSet = false; + js.carryFlagInverted = false; + if (js.op->wantsCA) + { + if (js.next_op->wantsCAInFlags) + { + if (ca) + STC(); + else + CLC(); + js.carryFlagSet = true; + } + else if (ca) + { + JitSetCA(); + } + else + { + JitClearCAOV(true, false); + } + } +} + // Assumes CA,OV are clear -void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv) +void Jit64::FinalizeCarryOverflow(bool oe, bool inv) { // USES_XER if (oe) { - // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both - // sides of the branch. + // Make sure not to lose the carry flags (not a big deal, this path is rare). + PUSHF(); + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~(XER_SO_MASK | XER_OV_MASK))); FixupBranch jno = J_CC(CC_NO); - if (ca) - JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); - FixupBranch exit = J(); SetJumpTarget(jno); - if (ca) - JitSetCAIf(inv ? CC_NC : CC_C); - SetJumpTarget(exit); - } - else if (ca) - { - // Do carry - JitSetCAIf(inv ? CC_NC : CC_C); + POPF(); } + // Do carry + FinalizeCarry(inv ? CC_NC : CC_C); } void Jit64::ComputeRC(const Gen::OpArg & arg) @@ -135,7 +181,6 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void if (a || binary || carry) { carry &= js.op->wantsCA; - JitClearCAOV(carry, false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); @@ -159,7 +204,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void } } if (carry) - JitSetCAIf(CC_C); + FinalizeCarry(CC_C); if (Rc) ComputeRC(gpr.R(d)); } @@ -720,38 +765,31 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCAOV(js.op->wantsCA, false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - if (js.op->wantsCA) - JitSetCAIf(CC_NC); + FinalizeCarry(CC_NC); } else if (imm == -1) { - // CA is always set in this case - if (js.op->wantsCA) - JitSetCA(); NOT(32, gpr.R(d)); + // CA is always set in this case + FinalizeCarry(true); } else { - JitClearCAOV(js.op->wantsCA, false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - if (js.op->wantsCA) - JitSetCAIf(CC_C); + FinalizeCarry(CC_C); } } else { - JitClearCAOV(js.op->wantsCA, false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - if (js.op->wantsCA) - JitSetCAIf(CC_NC); + FinalizeCarry(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -1233,29 +1271,44 @@ void Jit64::arithXex(UGeckoInstruction inst) int a = inst.RA; int b = regsource ? inst.RB : a; int d = inst.RD; + bool same_input_sub = !add && regsource && a == b; gpr.Lock(a, b, d); - gpr.BindToRegister(d, d == a || d == b); - JitGetAndClearCAOV(inst.OE); + gpr.BindToRegister(d, !same_input_sub && (d == a || d == b)); + if (!js.carryFlagSet) + JitGetAndClearCAOV(inst.OE); bool invertedCarry = false; - if (!add && regsource && d == b) + // Special case: subfe A, B, B is a common compiler idiom + if (same_input_sub) { // Convert carry to borrow - CMC(); + if (!js.carryFlagInverted) + CMC(); + SBB(32, gpr.R(d), gpr.R(d)); + invertedCarry = true; + } + else if (!add && regsource && d == b) + { + if (!js.carryFlagInverted) + CMC(); + if (d != b) + MOV(32, gpr.R(d), gpr.R(b)); SBB(32, gpr.R(d), gpr.R(a)); invertedCarry = true; } else { OpArg source = regsource ? gpr.R(d == b ? a : b) : Imm32(mex ? 0xFFFFFFFF : 0); + if (js.carryFlagInverted) + CMC(); if (d != a && d != b) MOV(32, gpr.R(d), gpr.R(a)); if (!add) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), source); } - FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1269,7 +1322,6 @@ void Jit64::arithcx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, d = inst.RD; gpr.Lock(a, b, d); gpr.BindToRegister(d, d == a || d == b, true); - JitClearCAOV(js.op->wantsCA, inst.OE); if (d == a && d != b) { @@ -1295,7 +1347,7 @@ void Jit64::arithcx(UGeckoInstruction inst) SUB(32, gpr.R(d), gpr.R(a)); } - FinalizeCarryOverflow(js.op->wantsCA, inst.OE, !add); + FinalizeCarryOverflow(inst.OE, !add); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1688,7 +1740,6 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.FlushLockX(ECX); gpr.Lock(a, s, b); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCAOV(js.op->wantsCA, false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); @@ -1699,12 +1750,12 @@ void Jit64::srawx(UGeckoInstruction inst) MOV(32, R(RSCRATCH), gpr.R(a)); SHR(64, gpr.R(a), Imm8(32)); TEST(32, gpr.R(a), R(RSCRATCH)); - JitSetCAIf(CC_NZ); } else { SHR(64, gpr.R(a), Imm8(32)); } + FinalizeCarry(CC_NZ); gpr.UnlockAll(); gpr.UnlockAllX(); if (inst.Rc) @@ -1758,14 +1809,14 @@ void Jit64::srawix(UGeckoInstruction inst) SAR(32, gpr.R(a), Imm8(amount)); SHL(32, R(RSCRATCH), Imm8(32 - amount)); TEST(32, R(RSCRATCH), gpr.R(a)); - JitSetCAIf(CC_NZ); + FinalizeCarry(CC_NZ); } } } else { gpr.Lock(a, s); - JitClearCAOV(js.op->wantsCA, false); + FinalizeCarry(false); gpr.BindToRegister(a, a == s, true); if (a != s) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 816bfeae13..c6ff6e4967 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -81,13 +81,16 @@ protected: bool isLastInstruction; bool memcheck; bool skipnext; + bool carryFlagSet; + bool carryFlagInverted; int fifoBytesThisBlock; PPCAnalyst::BlockStats st; PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats fpa; - PPCAnalyst::CodeOp *op; + PPCAnalyst::CodeOp* op; + PPCAnalyst::CodeOp* next_op; u8* rewriteStart; JitBlock *curBlock; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 2b1a0ef7c1..ee7441a607 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -822,6 +822,7 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) SETcc(conditionCode, R(RSCRATCH)); MOVZX(32, 8, RSCRATCH, R(RSCRATCH)); SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1 } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index e7c06a2009..36f2ecd91d 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -460,7 +460,13 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false; code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false; + // We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag! + // If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't + // leave it in flags. + code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512; + // mfspr/mtspr can affect/use XER, so be super careful here + // we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 774129a5d0..aa1a00abeb 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -35,6 +35,7 @@ struct CodeOp //16B bool wantsCR1; bool wantsFPRF; bool wantsCA; + bool wantsCAInFlags; bool outputCR0; bool outputCR1; bool outputFPRF;