JIT: make instruction merging generic

Now it should be easier to merge more than 2-instruction-long sequences.
Also correct some minor inconsistencies in behavior between instruction
merging cases.
This commit is contained in:
Fiora 2015-01-03 22:59:28 -08:00
parent 074f246c69
commit e8cfcd3aeb
12 changed files with 112 additions and 115 deletions

View file

@ -522,6 +522,7 @@ void Jit64::Jit(u32 em_address)
jo.enableBlocklink = false;
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
}
Trace();
@ -603,7 +604,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address);
js.skipnext = false;
js.skipInstructions = 0;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;
@ -651,12 +652,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
if (i == (code_block.m_num_instructions - 1))
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
js.next_inst_bp = false;
if (Profiler::g_ProfileBlocks)
{
// WARNING - cmp->branch merging will screw this up.
PROFILER_VPUSH;
// get end tic
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
@ -664,14 +662,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
PROFILER_UPDATE_TIME(b);
PROFILER_VPOP;
}
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
js.next_op = &ops[i + 1];
js.next_inst_bp = SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging && breakpoints.IsAddressBreakPoint(ops[i + 1].address);
js.isLastInstruction = true;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
@ -856,11 +847,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
//NOTICE_LOG(DYNA_REC, "Unflushed register: %s", ppc_inst.c_str());
}
#endif
if (js.skipnext)
{
js.skipnext = false;
i++; // Skip next instruction
}
i += js.skipInstructions;
js.skipInstructions = 0;
}
u32 function = HLE::GetFunctionIndex(js.blockStart);
@ -919,5 +907,6 @@ void Jit64::EnableOptimization()
{
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
}

View file

@ -115,6 +115,7 @@ public:
void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val);
void GenerateOverflow();
bool MergeAllowedNextInstructions(int count);
void FinalizeCarryOverflow(bool oe, bool inv = false);
void FinalizeCarry(Gen::CCFlags cond);
void FinalizeCarry(bool ca);

View file

@ -346,10 +346,12 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
// Merge neighboring fcmp and cror (the primary use of cror).
UGeckoInstruction next = js.next_inst;
if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
UGeckoInstruction next = js.op[1].inst;
if (analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE) &&
MergeAllowedNextInstructions(1) && next.OPCD == 19 && next.SUBOP10 == 449 &&
(next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
{
js.skipnext = true;
js.skipInstructions = 1;
js.downcountAmount++;
int dst = 3 - (next.CRBD & 3);
output[3 - (next.CRBD & 3)] &= ~(1 << dst);

View file

@ -50,14 +50,30 @@ void Jit64::GenerateOverflow()
SetJumpTarget(exit);
}
// Returns true if the next |count| instructions in the block may be fused
// with the current one and compiled as a single unit.
//
// Merging is disallowed when:
//  - the CPU is single-stepping (each PPC instruction must be observable),
//  - fewer than |count| instructions remain in the block,
//  - debugging is enabled and a breakpoint sits on any of the candidate
//    instructions (a breakpoint kills flags in between instructions),
//  - any candidate instruction is a branch target (control flow could
//    enter the middle of the merged sequence).
bool Jit64::MergeAllowedNextInstructions(int count)
{
	if (PowerPC::GetState() == PowerPC::CPU_STEPPING || js.instructionsLeft < count)
		return false;
	// Loop-invariant: the debugger setting cannot change while we compile a block.
	const bool debugging = SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging;
	// Be careful: a breakpoint kills flags in between instructions
	for (int i = 1; i <= count; i++)
	{
		if (debugging && PowerPC::breakpoints.IsAddressBreakPoint(js.op[i].address))
			return false;
		if (js.op[i].isBranchTarget)
			return false;
	}
	return true;
}
void Jit64::FinalizeCarry(CCFlags cond)
{
js.carryFlagSet = false;
js.carryFlagInverted = false;
if (js.op->wantsCA)
{
// Be careful: a breakpoint kills flags in between instructions
if (!js.isLastInstruction && js.next_op->wantsCAInFlags && !js.next_inst_bp)
// Not actually merging instructions, but the effect is equivalent (we can't have breakpoints/etc in between).
if (MergeAllowedNextInstructions(1) && js.op[1].wantsCAInFlags)
{
if (cond == CC_C || cond == CC_NC)
{
@ -86,7 +102,7 @@ void Jit64::FinalizeCarry(bool ca)
js.carryFlagInverted = false;
if (js.op->wantsCA)
{
if (!js.isLastInstruction && js.next_op->wantsCAInFlags && !js.next_inst_bp)
if (MergeAllowedNextInstructions(1) && js.op[1].wantsCAInFlags)
{
if (ca)
STC();
@ -331,7 +347,10 @@ bool Jit64::CheckMergedBranch(int crf)
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE))
return false;
const UGeckoInstruction& next = js.next_inst;
if (!MergeAllowedNextInstructions(1))
return false;
const UGeckoInstruction& next = js.op[1].inst;
return (((next.OPCD == 16 /* bcx */) ||
((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) ||
((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) &&
@ -343,33 +362,35 @@ bool Jit64::CheckMergedBranch(int crf)
void Jit64::DoMergedBranch()
{
// Code that handles successful PPC branching.
if (js.next_inst.OPCD == 16) // bcx
const UGeckoInstruction& next = js.op[1].inst;
const u32 nextPC = js.op[1].address;
if (next.OPCD == 16) // bcx
{
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
if (next.LK)
MOV(32, M(&LR), Imm32(nextPC + 4));
u32 destination;
if (js.next_inst.AA)
destination = SignExt16(js.next_inst.BD << 2);
if (next.AA)
destination = SignExt16(next.BD << 2);
else
destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2);
WriteExit(destination, js.next_inst.LK, js.next_compilerPC + 4);
destination = nextPC + SignExt16(next.BD << 2);
WriteExit(destination, next.LK, nextPC + 4);
}
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx
else if ((next.OPCD == 19) && (next.SUBOP10 == 528)) // bcctrx
{
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
if (next.LK)
MOV(32, M(&LR), Imm32(nextPC + 4));
MOV(32, R(RSCRATCH), M(&CTR));
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4);
WriteExitDestInRSCRATCH(next.LK, nextPC + 4);
}
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
else if ((next.OPCD == 19) && (next.SUBOP10 == 16)) // bclrx
{
MOV(32, R(RSCRATCH), M(&LR));
if (!m_enable_blr_optimization)
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
if (next.LK)
MOV(32, M(&LR), Imm32(nextPC + 4));
WriteBLRExit();
}
else
@ -381,9 +402,11 @@ void Jit64::DoMergedBranch()
void Jit64::DoMergedBranchCondition()
{
js.downcountAmount++;
js.skipnext = true;
int test_bit = 8 >> (js.next_inst.BI & 3);
bool condition = !!(js.next_inst.BO & BO_BRANCH_IF_TRUE);
js.skipInstructions = 1;
const UGeckoInstruction& next = js.op[1].inst;
int test_bit = 8 >> (next.BI & 3);
bool condition = !!(next.BO & BO_BRANCH_IF_TRUE);
const u32 nextPC = js.op[1].address;
gpr.UnlockAll();
gpr.UnlockAllX();
@ -408,16 +431,18 @@ void Jit64::DoMergedBranchCondition()
{
gpr.Flush();
fpr.Flush();
WriteExit(js.next_compilerPC + 4);
WriteExit(nextPC + 4);
}
}
void Jit64::DoMergedBranchImmediate(s64 val)
{
js.downcountAmount++;
js.skipnext = true;
int test_bit = 8 >> (js.next_inst.BI & 3);
bool condition = !!(js.next_inst.BO & BO_BRANCH_IF_TRUE);
js.skipInstructions = 1;
const UGeckoInstruction& next = js.op[1].inst;
int test_bit = 8 >> (next.BI & 3);
bool condition = !!(next.BO & BO_BRANCH_IF_TRUE);
const u32 nextPC = js.op[1].address;
gpr.UnlockAll();
gpr.UnlockAllX();
@ -441,7 +466,7 @@ void Jit64::DoMergedBranchImmediate(s64 val)
{
gpr.Flush();
fpr.Flush();
WriteExit(js.next_compilerPC + 4);
WriteExit(nextPC + 4);
}
}

View file

@ -95,15 +95,12 @@ void Jit64::lXXx(UGeckoInstruction inst)
}
// PowerPC has no 8-bit sign extended load, but x86 does, so merge extsb with the load if we find it.
if (accessSize == 8 && js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 954 &&
js.next_inst.RS == inst.RD && js.next_inst.RA == inst.RD && !js.next_inst.Rc)
if (MergeAllowedNextInstructions(1) && accessSize == 8 && js.op[1].inst.OPCD == 31 && js.op[1].inst.SUBOP10 == 954 &&
js.op[1].inst.RS == inst.RD && js.op[1].inst.RA == inst.RD && !js.op[1].inst.Rc)
{
if (PowerPC::GetState() != PowerPC::CPU_STEPPING)
{
js.downcountAmount++;
js.skipnext = true;
signExtend = true;
}
js.downcountAmount++;
js.skipInstructions = 1;
signExtend = true;
}
// TODO(ector): Make it dynamically enable/disable idle skipping where appropriate

View file

@ -282,38 +282,38 @@ void Jit64::mfspr(UGeckoInstruction inst)
ADD(64, R(RAX), R(RDX));
MOV(64, PPCSTATE(spr[SPR_TL]), R(RAX));
// Two calls of TU/TL next to each other are extremely common in typical usage, so merge them
// if we can.
u32 nextIndex = (js.next_inst.SPRU << 5) | (js.next_inst.SPRL & 0x1F);
// Be careful; the actual opcode is for mftb (371), not mfspr (339)
int n = js.next_inst.RD;
if (js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL) &&
PowerPC::GetState() != PowerPC::CPU_STEPPING && n != d)
if (MergeAllowedNextInstructions(1))
{
js.downcountAmount++;
js.skipnext = true;
gpr.Lock(d, n);
gpr.BindToRegister(d, false);
gpr.BindToRegister(n, false);
if (iIndex == SPR_TL)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TL)
MOV(32, gpr.R(n), R(RAX));
SHR(64, R(RAX), Imm8(32));
if (iIndex == SPR_TU)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TU)
MOV(32, gpr.R(n), R(RAX));
}
else
{
gpr.Lock(d);
gpr.BindToRegister(d, false);
if (iIndex == SPR_TU)
const UGeckoInstruction& next = js.op[1].inst;
// Two calls of TU/TL next to each other are extremely common in typical usage, so merge them
// if we can.
u32 nextIndex = (next.SPRU << 5) | (next.SPRL & 0x1F);
// Be careful; the actual opcode is for mftb (371), not mfspr (339)
int n = next.RD;
if (next.OPCD == 31 && next.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL) && n != d)
{
js.downcountAmount++;
js.skipInstructions = 1;
gpr.Lock(d, n);
gpr.BindToRegister(d, false);
gpr.BindToRegister(n, false);
if (iIndex == SPR_TL)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TL)
MOV(32, gpr.R(n), R(RAX));
SHR(64, R(RAX), Imm8(32));
MOV(32, gpr.R(d), R(RAX));
if (iIndex == SPR_TU)
MOV(32, gpr.R(d), R(RAX));
if (nextIndex == SPR_TU)
MOV(32, gpr.R(n), R(RAX));
break;
}
}
gpr.UnlockAllX();
gpr.Lock(d);
gpr.BindToRegister(d, false);
if (iIndex == SPR_TU)
SHR(64, R(RAX), Imm8(32));
MOV(32, gpr.R(d), R(RAX));
break;
}
case SPR_XER:
@ -341,6 +341,7 @@ void Jit64::mfspr(UGeckoInstruction inst)
MOV(32, gpr.R(d), PPCSTATE(spr[iIndex]));
break;
}
gpr.UnlockAllX();
gpr.UnlockAll();
}

View file

@ -610,16 +610,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.downcountAmount += opinfo->numCycles;
if (i == (code_block.m_num_instructions - 1))
{
js.isLastInstruction = true;
js.next_inst = 0;
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
u32 function = HLE::GetFunctionIndex(ops[i].address);
if (function != 0)

View file

@ -443,7 +443,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
js.downcountAmount += PatchEngine::GetSpeedhackCycles(em_address);
js.skipnext = false;
js.skipInstructions = 0;
js.compilerPC = nextPC;
// Translate instructions
@ -459,13 +459,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)

View file

@ -232,7 +232,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.downcountAmount = 0;
js.skipnext = false;
js.skipInstructions = 0;
js.curBlock = b;
u32 nextPC = em_address;
@ -281,13 +281,6 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)

View file

@ -65,9 +65,7 @@ protected:
struct JitState
{
u32 compilerPC;
u32 next_compilerPC;
u32 blockStart;
UGeckoInstruction next_inst; // for easy peephole opt.
int instructionNumber;
int instructionsLeft;
int downcountAmount;
@ -88,10 +86,9 @@ protected:
bool firstFPInstructionFound;
bool isLastInstruction;
bool memcheck;
bool skipnext;
int skipInstructions;
bool carryFlagSet;
bool carryFlagInverted;
bool next_inst_bp;
int fifoBytesThisBlock;
@ -99,7 +96,6 @@ protected:
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp* op;
PPCAnalyst::CodeOp* next_op;
u8* rewriteStart;
JitBlock *curBlock;

View file

@ -219,6 +219,11 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
const GekkoOPInfo *b_info = b.opinfo;
int a_flags = a_info->flags;
int b_flags = b_info->flags;
// can't reorder around breakpoints
if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging &&
(PowerPC::breakpoints.IsAddressBreakPoint(a.address) || PowerPC::breakpoints.IsAddressBreakPoint(b.address)))
return false;
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
return false;
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
@ -462,7 +467,8 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
// Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more
// picky about this, but cror seems to almost solely be used for this purpose in real code.
// Additionally, the other boolean ops seem to almost never be used.
ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
if (HasOption(OPTION_CROR_MERGE))
ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
// For carry, bubble instructions *towards* each other; one direction often isn't enough
// to get pairs like addc/adde next to each other.
if (HasOption(OPTION_CARRY_MERGE))

View file

@ -214,6 +214,9 @@ public:
// Reorder carry instructions next to their associated branches and pass
// carry flags in the x86 flags between them, instead of in XER.
OPTION_CARRY_MERGE = (1 << 5),
// Reorder cror instructions next to their associated fcmp.
OPTION_CROR_MERGE = (1 << 6),
};