Merge pull request #4735 from degasus/jitcache

Jit64: Enable branch following.
Scott Mansell 2017-01-28 15:48:01 +13:00 committed by GitHub
commit 5da565a1a1
9 changed files with 125 additions and 48 deletions

@@ -443,6 +443,16 @@ void XEmitter::CALL(const void* fnptr)
Write32(u32(distance));
}
FixupBranch XEmitter::CALL()
{
FixupBranch branch;
branch.type = 1;  // 32-bit displacement, to be patched by SetJumpTarget()
branch.ptr = code + 5;  // points just past the 5-byte CALL instruction
Write8(0xE8);  // CALL rel32 opcode
Write32(0);  // placeholder displacement
return branch;
}
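For illustration only (not part of the diff): the new fixup-style CALL() is used like the existing J()/J_CC() fixups, with SetJumpTarget() back-patching the placeholder once the destination is known; FakeBLCall() below is its first caller, and the names in this sketch are made up.

// Minimal usage sketch inside an emitter/Jit64 member:
FixupBranch call_to_continuation = CALL();  // emits E8 00 00 00 00 and records code + 5
// ... emit the code the call should skip over ...
SetJumpTarget(call_to_continuation);  // back-patches the 32-bit displacement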
FixupBranch XEmitter::J(bool force5bytes)
{
FixupBranch branch;

@@ -467,6 +467,7 @@ public:
#undef CALL
#endif
void CALL(const void* fnptr);
FixupBranch CALL();
void CALLptr(OpArg arg);
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);

@@ -372,6 +372,21 @@ bool Jit64::Cleanup()
return did_something;
}
void Jit64::FakeBLCall(u32 after)
{
if (!m_enable_blr_optimization)
return;
// We may need to fake the BLR stack on inlined CALL instructions.
// Otherwise we can't return to this location anymore.
MOV(32, R(RSCRATCH2), Imm32(after));  // guest return address for the BLR stack
PUSH(RSCRATCH2);
FixupBranch skip_exit = CALL();  // pushes the host return address; compilation continues at skip_exit
POP(RSCRATCH2);  // reached on a predicted BLR return: drop the guest address again
JustWriteExit(after, false, 0);  // then exit to the code following the bl
SetJumpTarget(skip_exit);
}
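To see why the guest return address is pushed even though the call is inlined, here is a minimal stand-alone sketch of the prediction scheme the fake entry feeds (not Dolphin code; the class and its layout are invented for illustration): a later blr only takes the fast return path when the top entry matches LR, and otherwise falls back to the dispatcher.

#include <cstdint>
#include <vector>

// Toy model of the BLR prediction stack, for illustration only.
struct FakeBlrStack
{
  std::vector<uint32_t> guest_return_addresses;

  // bl (including an inlined bl, as in FakeBLCall above): remember the return target.
  void OnBl(uint32_t after) { guest_return_addresses.push_back(after); }

  // blr: fast path only if the prediction matches; otherwise go through the dispatcher.
  bool OnBlr(uint32_t lr)
  {
    if (guest_return_addresses.empty() || guest_return_addresses.back() != lr)
      return false;  // mispredicted
    guest_return_addresses.pop_back();
    return true;
  }
};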
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
{
if (!m_enable_blr_optimization)
@@ -569,6 +584,7 @@ void Jit64::Jit(u32 em_address)
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
}
Trace();
}
@@ -973,6 +989,7 @@ void Jit64::EnableOptimization()
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
}
void Jit64::IntializeSpeculativeConstants()

@@ -85,6 +85,7 @@ public:
// Utilities for use by opcodes
void FakeBLCall(u32 after);
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
void JustWriteExit(u32 destination, bool bl, u32 after);
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);

@@ -74,6 +74,13 @@ void Jit64::bx(UGeckoInstruction inst)
// Because PPCAnalyst::Flatten() merged the blocks.
if (!js.isLastInstruction)
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack, as the RET instruction was not
// found in the same block. This adds a lot of overhead, but is still
// better than calling the dispatcher.
FakeBLCall(js.compilerPC + 4);
}
return;
}
@@ -131,6 +138,22 @@ void Jit64::bcx(UGeckoInstruction inst)
if (inst.LK)
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
// If this is not the last instruction of a block and it is an
// unconditional branch, we skip the rest of the processing here
// because PPCAnalyst::Flatten() merged the blocks.
if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
(inst.BO & BO_DONT_CHECK_CONDITION))
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack, as the RET instruction was not
// found in the same block. This adds a lot of overhead, but is still
// better than calling the dispatcher.
FakeBLCall(js.compilerPC + 4);
}
return;
}
u32 destination;
if (inst.AA)
destination = SignExt16(inst.BD << 2);

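For reference, the BO test used in the bcx hunk above (and again in the PPCAnalyst changes below) can be read as one small predicate. A minimal sketch, assuming Dolphin's u32 typedef and the BO_* constants used above are in scope:

// A bcx behaves like an unconditional branch when it neither decrements CTR
// nor tests a condition bit.
constexpr bool IsUnconditionalBcx(u32 bo)
{
  return (bo & BO_DONT_DECREMENT_FLAG) && (bo & BO_DONT_CHECK_CONDITION);
}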
@@ -55,6 +55,7 @@ void JitArm64::Init()
code_block.m_fpa = &js.fpa;
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
m_supports_cycle_counter = HasCycleCounters();
}

@@ -76,9 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff);
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);
u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
@@ -93,6 +90,14 @@ void JitArm64::bx(UGeckoInstruction inst)
gpr.Unlock(WA);
}
if (!js.isLastInstruction)
{
// The branch was followed by the analyzer, so its target is compiled into
// this block and we simply fall through to it.
return;
}
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);
if (destination == js.compilerPC)
{
// make idle loops go faster

@@ -32,8 +32,9 @@
namespace PPCAnalyst
{
constexpr int CODEBUFFER_SIZE = 32000;
// 0 does not perform block merging
constexpr u32 FUNCTION_FOLLOWING_THRESHOLD = 16;
constexpr u32 BRANCH_FOLLOWING_THRESHOLD = 2;
constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
@@ -651,7 +652,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
CodeOp* code = buffer->codebuffer;
bool found_exit = false;
u32 return_address = 0;
bool found_call = false;
size_t caller = 0;
u32 numFollows = 0;
u32 num_inst = 0;
@@ -686,50 +688,65 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
bool conditional_continue = false;
// Do we inline leaf functions?
if (HasOption(OPTION_LEAF_INLINE))
// TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
// If it is small, performance will suffer.
// If it is big, the generated code will be large and
// cache clearing will happen many times.
if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
{
if (inst.OPCD == 18 && blockSize > 1)
{
// Is bx - should we inline? yes!
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = address + SignExt26(inst.LI << 2);
if (destination != block->m_address)
follow = true;
// Always follow BX instructions.
// TODO: Loop unrolling might bloat the code size too much.
// Enable it carefully.
follow = destination != block->m_address;
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
caller = i;
}
}
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && (inst.BO & (1 << 4)) &&
(inst.BO & (1 << 2)) && return_address != 0)
else if (inst.OPCD == 16 && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
(inst.BO & BO_DONT_CHECK_CONDITION) && blockSize > 1)
{
// Always follow unconditional BCX instructions, but they are very rare.
follow = true;
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
caller = i;
}
}
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
{
// bclrx with an unconditional branch = return.
// Follow it if we can propagate the LR value of the last CALL instruction.
// Though it would be easy to track the upper levels of call/return,
// we can't guarantee the LR value. The PPC ABI forces all functions to push
// the LR value on the stack as there are no spare registers, so we'd need
// to check that no store instruction aliases the stack.
follow = true;
destination = return_address;
return_address = 0;
destination = code[caller].address + 4;
found_call = false;
code[i].skip = true;
if (inst.LK)
return_address = address + 4;
// The RET is skipped, so also don't generate the BLR-optimization stack entry for the matching call.
code[caller].skipLRStack = true;
}
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
{
// mtspr
// mtspr, skip CALL/RET merging as LR is overwritten.
const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
if (index == SPR_LR)
{
// We give up following the return address
// because we would have to track the register usage.
return_address = 0;
found_call = false;
}
}
// TODO: Find the optimal value for FUNCTION_FOLLOWING_THRESHOLD.
// If it is small, performance will suffer.
// If it is big, the generated code will be large and
// cache clearing will happen many times.
// TODO: Investigate why
// "0" is fastest in some games, MP2 for example.
if (numFollows > FUNCTION_FOLLOWING_THRESHOLD)
follow = false;
}
if (HasOption(OPTION_CONDITIONAL_CONTINUE))
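The skip and skipLRStack flags set in the hunk above are consumed by the JIT backends. A rough sketch of the expected emit-loop behaviour, assuming an analyzed block (the real loop lives in Jit64::DoJit and the loop locals here are illustrative):

for (u32 i = 0; i < block->m_num_instructions; i++)
{
  const PPCAnalyst::CodeOp& op = code[i];
  if (op.skip)
    continue;  // the followed blr: nothing is emitted for it
  // For the matching bl, op.skipLRStack tells Jit64::bx()/bcx() that LR must
  // still be written, but that no fake BLR-stack entry (FakeBLCall) is needed,
  // since the return has already been inlined into this block.
  // ... normal per-instruction emission ...
}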
@@ -759,27 +776,28 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
}
}
if (!follow)
if (follow)
{
// Follow the unconditional branch.
numFollows++;
address = destination;
}
else
{
// Just pick the next instruction
address += 4;
if (!conditional_continue && opinfo->flags & FL_ENDBLOCK) // right now we stop early
{
found_exit = true;
break;
}
if (conditional_continue)
{
// If we skip any conditional branch, we can't guarantee to get the matching CALL/RET pair.
// So we stop inlining the RET here and let the BLR optimization handle this case.
found_call = false;
}
}
// XXX: We don't support inlining yet.
#if 0
else
{
numFollows++;
// We don't "code[i].skip = true" here
// because bx may store a certain value to the link register.
// Instead, we skip a part of bx in Jit**::bx().
address = destination;
merged_addresses[size_of_merged_addresses++] = address;
}
#endif
}
block->m_num_instructions = num_inst;

@@ -42,6 +42,7 @@ struct CodeOp // 16B
bool outputFPRF;
bool outputCA;
bool canEndBlock;
bool skipLRStack;  // the matching blr was inlined, so don't fake a BLR-stack entry for this bl
bool skip;  // followed BLs, for example
// which registers are still needed after this instruction in this block
BitSet32 fprInUse;
@@ -189,11 +190,11 @@ public:
// Requires JIT support to be enabled.
OPTION_CONDITIONAL_CONTINUE = (1 << 0),
// If there is an unconditional branch that jumps to a leaf function then inline it.
// Try to inline unconditional branches/calls/returns.
// Also track the LR value to follow unconditional return instructions.
// Might require JIT intervention to support it correctly.
// Requires JITBlock support for inlined code
// XXX: NOT COMPLETE
OPTION_LEAF_INLINE = (1 << 1),
// Especially if the BLR optimization is used.
OPTION_BRANCH_FOLLOW = (1 << 1),
// Complex blocks support jumping backwards onto themselves.
// Happens commonly in loops, pretty complex to support.