diff --git a/Source/Core/Common/MemoryUtil.cpp b/Source/Core/Common/MemoryUtil.cpp
index f7e1d7d902..a741deef4f 100644
--- a/Source/Core/Common/MemoryUtil.cpp
+++ b/Source/Core/Common/MemoryUtil.cpp
@@ -158,6 +158,25 @@ void FreeAlignedMemory(void* ptr)
 	}
 }
 
+void ReadProtectMemory(void* ptr, size_t size)
+{
+	bool error_occurred = false;
+
+#ifdef _WIN32
+	DWORD oldValue;
+	if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue))
+		error_occurred = true;
+#else
+	int retval = mprotect(ptr, size, PROT_NONE);
+
+	if (retval != 0)
+		error_occurred = true;
+#endif
+
+	if (error_occurred)
+		PanicAlert("ReadProtectMemory failed!\n%s", GetLastErrorMsg());
+}
+
 void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
 {
 	bool error_occurred = false;
diff --git a/Source/Core/Common/MemoryUtil.h b/Source/Core/Common/MemoryUtil.h
index 6f437fcda7..5f584f868d 100644
--- a/Source/Core/Common/MemoryUtil.h
+++ b/Source/Core/Common/MemoryUtil.h
@@ -12,8 +12,12 @@ void* AllocateMemoryPages(size_t size);
 void FreeMemoryPages(void* ptr, size_t size);
 void* AllocateAlignedMemory(size_t size,size_t alignment);
 void FreeAlignedMemory(void* ptr);
+void ReadProtectMemory(void* ptr, size_t size);
 void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
 void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false);
 std::string MemUsage();
 
+void GuardMemoryMake(void* ptr, size_t size);
+void GuardMemoryUnmake(void* ptr, size_t size);
+
 inline int GetPageSize() { return 4096; }
diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index fa16cf2b36..75cd418379 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1766,6 +1766,8 @@ void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI
 void XEmitter::LOCK() { Write8(0xF0); }
 void XEmitter::REP() { Write8(0xF3); }
 void XEmitter::REPNE() { Write8(0xF2); }
+void XEmitter::FSOverride() { Write8(0x64); }
+void XEmitter::GSOverride() { Write8(0x65); }
 
 void XEmitter::FWAIT()
 {
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 8f41065668..8b655c2c42 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -467,6 +467,8 @@ public:
 	void LOCK();
 	void REP();
 	void REPNE();
+	void FSOverride();
+	void GSOverride();
 
 	// x87
 	enum x87StatusWordBits {
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index d928d02927..92595f6acd 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -95,6 +95,83 @@ using namespace PowerPC;
   and such, but it's currently limited to integer ops only. This can definitely be made better.
 */
 
+// The BLR optimization is nice, but it means that JITted code can overflow the
+// native stack by repeatedly running BL. (The chance of this happening in any
+// retail game is close to 0, but correctness is correctness...) Also, the
+// overflow might not happen directly in the JITted code but in a C++ function
+// called from it, so we can't just adjust RSP in the case of a fault.
+// Instead, we have to have extra stack space preallocated under the fault
+// point which allows the code to continue, after wiping the JIT cache so we
+// can reset things at a safe point. Once this condition trips, the
+// optimization is permanently disabled, under the assumption this will never
+// happen in practice.
+
+// On Unix, we just mark an appropriate region of the stack as PROT_NONE and
+// handle it the same way as fastmem faults. It's safe to take a fault with a
+// bad RSP, because on Linux we can use sigaltstack and on OS X we're already
+// on a separate thread.
+
+// On Windows, the OS gets upset if RSP doesn't work, and I don't know any
+// equivalent of sigaltstack. Windows supports guard pages which, when
+// accessed, immediately turn into regular pages but cause a trap... but
+// putting them in the path of RSP just leads to something (in the kernel?)
+// thinking a regular stack extension is required. So this protection is not
+// supported on Windows yet... We still use a separate stack for the sake of
+// simplicity.
+
+enum
+{
+	STACK_SIZE = 2 * 1024 * 1024,
+	SAFE_STACK_SIZE = 512 * 1024,
+	GUARD_SIZE = 0x10000, // two guards - bottom (permanent) and middle (see above)
+	GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE,
+};
+
+void Jit64::AllocStack()
+{
+#if defined(_WIN32)
+	m_stack = (u8*)AllocateMemoryPages(STACK_SIZE);
+	ReadProtectMemory(m_stack, GUARD_SIZE);
+	ReadProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
+#endif
+}
+
+void Jit64::FreeStack()
+{
+#if defined(_WIN32)
+	if (m_stack)
+	{
+		FreeMemoryPages(m_stack, STACK_SIZE);
+		m_stack = NULL;
+	}
+#endif
+}
+
+bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx)
+{
+	uintptr_t stack = (uintptr_t)m_stack, diff = access_address - stack;
+	// In the trap region?
+	if (stack && diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE)
+	{
+		WARN_LOG(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");
+		m_enable_blr_optimization = false;
+		UnWriteProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
+		// We're going to need to clear the whole cache to get rid of the bad
+		// CALLs, but we can't yet. Fake the downcount so we're forced to the
+		// dispatcher (no block linking), and clear the cache so we're sent to
+		// Jit. Yeah, it's kind of gross.
+		GetBlockCache()->InvalidateICache(0, 0xffffffff);
+		CoreTiming::ForceExceptionCheck(0);
+		m_clear_cache_asap = true;
+
+		return true;
+	}
+
+	return Jitx86Base::HandleFault(access_address, ctx);
+}
+
+
+
 void Jit64::Init()
 {
 	jo.optimizeStack = true;
@@ -130,8 +207,18 @@ void Jit64::Init()
 	trampolines.Init();
 	AllocCodeSpace(CODE_SIZE);
 
+
+	// BLR optimization has the same consequences as block linking, as well as
+	// depending on the fault handler to be safe in the event of excessive BL.
+	m_enable_blr_optimization = jo.enableBlocklink && SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem;
+	m_clear_cache_asap = false;
+
+	m_stack = nullptr;
+	if (m_enable_blr_optimization)
+		AllocStack();
+
 	blocks.Init();
-	asm_routines.Init();
+	asm_routines.Init(m_stack ? (m_stack + STACK_SIZE) : nullptr);
 
 	// important: do this *after* generating the global asm routines, because we can't use farcode in them.
 	// it'll crash because the farcode functions get cleared on JIT clears.
@@ -155,6 +242,7 @@ void Jit64::ClearCache()
 
 void Jit64::Shutdown()
 {
+	FreeStack();
 	FreeCodeSpace();
 
 	blocks.Shutdown();
@@ -251,11 +339,8 @@ bool Jit64::Cleanup()
 
 void Jit64::WriteExit(u32 destination, bool bl, u32 after)
 {
-	// BLR optimization has similar consequences to block linking.
-	if (!jo.enableBlocklink)
-	{
+	if (!m_enable_blr_optimization)
 		bl = false;
-	}
 
 	Cleanup();
 
@@ -313,17 +398,17 @@ void Jit64::JustWriteExit(u32 destination, bool bl, u32 after)
 
 void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after)
 {
-	if (!jo.enableBlocklink)
-	{
+	if (!m_enable_blr_optimization)
 		bl = false;
-	}
+
+	MOV(32, PPCSTATE(pc), R(RSCRATCH));
+	Cleanup();
+
 	if (bl)
 	{
 		MOV(32, R(RSCRATCH2), Imm32(after));
 		PUSH(RSCRATCH2);
 	}
-	MOV(32, PPCSTATE(pc), R(RSCRATCH));
-	Cleanup();
+
 	SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
 	if (bl)
 	{
@@ -339,7 +424,7 @@ void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after)
 
 void Jit64::WriteBLRExit()
 {
-	if (!jo.enableBlocklink)
+	if (!m_enable_blr_optimization)
 	{
 		WriteExitDestInRSCRATCH();
 		return;
@@ -428,8 +513,11 @@ void Jit64::Trace()
 
 void STACKALIGN Jit64::Jit(u32 em_address)
 {
-	if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() ||
-		SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache)
+	if (GetSpaceLeft() < 0x10000 ||
+	    farcode.GetSpaceLeft() < 0x10000 ||
+	    blocks.IsFull() ||
+	    SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache ||
+	    m_clear_cache_asap)
 	{
 		ClearCache();
 	}
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index cface00cb3..0391d258cc 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -18,6 +18,10 @@
 // ----------
 #pragma once
 
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
 #include "Common/x64ABI.h"
 #include "Common/x64Analyzer.h"
 #include "Common/x64Emitter.h"
@@ -40,6 +44,9 @@
 class Jit64 : public Jitx86Base
 {
 private:
+	void AllocStack();
+	void FreeStack();
+
 	GPRRegCache gpr;
 	FPURegCache fpr;
 
@@ -48,6 +55,10 @@ private:
 	PPCAnalyst::CodeBuffer code_buffer;
 	Jit64AsmRoutineManager asm_routines;
 
+	bool m_enable_blr_optimization;
+	bool m_clear_cache_asap;
+	u8* m_stack;
+
 public:
 	Jit64() : code_buffer(32000) {}
 	~Jit64() {}
@@ -55,6 +66,8 @@ public:
 	void Init() override;
 	void Shutdown() override;
 
+	bool HandleFault(uintptr_t access_address, SContext* ctx) override;
+
 	// Jit!
 	void Jit(u32 em_address) override;
 
diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index dc307540f6..dcfffaa3e9 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -23,8 +23,18 @@
 	// for the shadow region before calls in this function. This call will
 	// waste a bit of space for a second shadow, but whatever.
 	ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, /*frame*/ 16);
+	if (m_stack_top)
+	{
+		// Pivot the stack to our custom one.
+		MOV(64, R(RSCRATCH), R(RSP));
+		MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
+		MOV(64, MDisp(RSP, 0x18), R(RSCRATCH));
+	}
+	else
+	{
+		MOV(64, M(&s_saved_rsp), R(RSP));
+	}
 	// something that can't pass the BLR test
-	MOV(64, M(&s_saved_rsp), R(RSP));
 	MOV(64, MDisp(RSP, 8), Imm32((u32)-1));
 
 	// Two statically allocated registers.
@@ -46,7 +56,10 @@
 	ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
#endif
 
-	MOV(64, R(RSP), M(&s_saved_rsp));
+	if (m_stack_top)
+		MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
+	else
+		MOV(64, R(RSP), M(&s_saved_rsp));
 
 	SUB(32, PPCSTATE(downcount), R(RSCRATCH));
 
@@ -55,6 +68,8 @@
 	// IMPORTANT - We jump on negative, not carry!!!
 	FixupBranch bail = J_CC(CC_BE, true);
 
+	FixupBranch dbg_exit;
+
 	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
 	{
 		TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING));
@@ -63,11 +78,7 @@
 		ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
 		ABI_PopRegistersAndAdjustStack(0, 0);
 		TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
-		FixupBranch noBreakpoint = J_CC(CC_Z);
-		MOV(64, R(RSP), M(&s_saved_rsp));
-		ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
-		RET();
-		SetJumpTarget(noBreakpoint);
+		dbg_exit = J_CC(CC_NZ);
 		SetJumpTarget(notStepping);
 	}
 
@@ -155,7 +166,17 @@
 	J_CC(CC_Z, outerLoop);
 
 	//Landing pad for drec space
-	MOV(64, R(RSP), M(&s_saved_rsp));
+	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
+		SetJumpTarget(dbg_exit);
+	if (m_stack_top)
+	{
+		MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x8));
+		POP(RSP);
+	}
+	else
+	{
+		MOV(64, R(RSP), M(&s_saved_rsp));
+	}
 	ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
 	RET();
 
diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.h b/Source/Core/Core/PowerPC/Jit64/JitAsm.h
index e3cc4371f7..9272f5c8aa 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.h
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.h
@@ -25,10 +25,12 @@ class Jit64AsmRoutineManager : public CommonAsmRoutines
 private:
 	void Generate();
 	void GenerateCommon();
+	u8* m_stack_top;
 
 public:
-	void Init()
+	void Init(u8* stack_top)
 	{
+		m_stack_top = stack_top;
 		AllocCodeSpace(8192);
 		Generate();
 		WriteProtect();
diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp
index 81260249c7..9f9f9cf98c 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp
@@ -272,7 +272,7 @@ void JitIL::Init()
 	trampolines.Init();
 	AllocCodeSpace(CODE_SIZE);
 	blocks.Init();
-	asm_routines.Init();
+	asm_routines.Init(nullptr);
 
 	farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE);
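
Reviewer note (not part of the change above): the guard-region idea at the heart of this patch is easiest to see in isolation. The sketch below is a minimal, hypothetical Unix-only illustration of the same trick the diff applies to the JIT's custom stack: reserve the stack, revoke access to a slice of it so runaway BL recursion faults there, and let the fault handler recognise the overflow by address, as Jit64::HandleFault() does. It is a conceptual illustration, not a copy of either platform path in the diff (the Windows path uses ReadProtectMemory()/VirtualProtect with PAGE_NOACCESS instead of mprotect); all names here (alloc_guarded_stack, fault_hit_guard, kGuardOffset, and the sizes) are illustrative and do not exist in the Dolphin codebase.

// Minimal sketch of a PROT_NONE guard slice inside a preallocated stack.
// Hypothetical names; sizes mirror the enum in Jit.cpp but are illustrative.
#include <sys/mman.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

static constexpr size_t kStackSize   = 2 * 1024 * 1024;
static constexpr size_t kSafeSize    = 512 * 1024;
static constexpr size_t kGuardSize   = 0x10000;
static constexpr size_t kGuardOffset = kStackSize - kSafeSize - kGuardSize;

// Reserve the whole stack, then revoke access to the guard slice so a deep
// call chain faults there instead of silently overrunning adjacent memory.
static uint8_t* alloc_guarded_stack()
{
	void* base = mmap(nullptr, kStackSize, PROT_READ | PROT_WRITE,
	                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return nullptr;
	if (mprotect(static_cast<uint8_t*>(base) + kGuardOffset, kGuardSize, PROT_NONE) != 0)
	{
		munmap(base, kStackSize);
		return nullptr;
	}
	return static_cast<uint8_t*>(base);
}

// The address test a fault handler would run, analogous to Jit64::HandleFault():
// only faults that land inside the guard slice are treated as BL overflow.
static bool fault_hit_guard(const uint8_t* stack, uintptr_t fault_addr)
{
	uintptr_t diff = fault_addr - reinterpret_cast<uintptr_t>(stack);
	return stack && diff >= kGuardOffset && diff < kGuardOffset + kGuardSize;
}

int main()
{
	uint8_t* stack = alloc_guarded_stack();
	if (!stack)
		return 1;
	// A fault one byte into the guard slice would be recognised:
	std::printf("guard hit: %d\n",
	            fault_hit_guard(stack, reinterpret_cast<uintptr_t>(stack) + kGuardOffset + 1));
	munmap(stack, kStackSize);
	return 0;
}

The middle guard is deliberately placed SAFE_STACK_SIZE below the top so that, when it trips, there is still usable stack underneath it for the C++ fault handler and the cache flush to run on before the optimization is disabled.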