From 3e5bbac5a1fad6ba8dfa97d0760d142c9011260a Mon Sep 17 00:00:00 2001 From: Wunk Date: Sun, 1 Sep 2024 03:24:13 -0700 Subject: [PATCH] shader_jit_a64: Compact host executable memory (#230) * common/aarch64: Allow generic code generator types Use the templated `BasicCodeGenerator` type rather than the specialized `CodeGenerator` type. This allows `VectorCodeGenerator` to work with these functions. * common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction` `VectorCodeGenerator` will always do far-calls, since absolute addresses cannot be resolved while the code is still being generated into a vector. * shader_jit_a64: Implement position-independent VectorCodeGenerator Generates position-independent assembly so that code can be emitted into a resizable vector before being copied into executable memory. This allows more compact memory allocation and usage, rather than a statically defined worst-case allocation for every shader. `VectorCodeGenerator` needs to generate position-independent code rather than use absolute addresses for its own code and data. Far function calls made through `VectorCodeGenerator` are assumed to always use absolute addresses, rather than potentially using a relative `BL` branch, because the code is relocated after generation. 
--- src/common/aarch64/oaknut_abi.h | 6 +- src/common/aarch64/oaknut_util.h | 10 +++ .../shader/shader_jit_a64_compiler.cpp | 89 ++++++++++++------- .../shader/shader_jit_a64_compiler.h | 10 +-- 4 files changed, 74 insertions(+), 41 deletions(-) diff --git a/src/common/aarch64/oaknut_abi.h b/src/common/aarch64/oaknut_abi.h index 56bb8b5f3..ec28872aa 100644 --- a/src/common/aarch64/oaknut_abi.h +++ b/src/common/aarch64/oaknut_abi.h @@ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra return ABIFrameInfo{static_cast(total_size), static_cast(fprs_base_subtraction)}; } -inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, +template +inline void ABI_PushRegisters(oaknut::BasicCodeGenerator& code, std::bitset<64> regs, std::size_t frame_size = 0) { using namespace oaknut; using namespace oaknut::util; @@ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, } } -inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, +template +inline void ABI_PopRegisters(oaknut::BasicCodeGenerator& code, std::bitset<64> regs, std::size_t frame_size = 0) { using namespace oaknut; using namespace oaknut::util; diff --git a/src/common/aarch64/oaknut_util.h b/src/common/aarch64/oaknut_util.h index 16cf7dfe3..253d01134 100644 --- a/src/common/aarch64/oaknut_util.h +++ b/src/common/aarch64/oaknut_util.h @@ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) { } } +template +inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) { + static_assert(std::is_pointer_v, "Argument must be a (function) pointer."); + // X16(IP0) and X17(IP1) is the standard veneer register + // LR is also available as an intermediate register + // https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard + code.MOVP2R(oaknut::util::X16, reinterpret_cast(f)); + code.BLR(oaknut::util::X16); +} + } // namespace 
Common::A64 #endif // CITRA_ARCH(arm64) diff --git a/src/video_core/shader/shader_jit_a64_compiler.cpp b/src/video_core/shader/shader_jit_a64_compiler.cpp index 2927a28c5..18793317e 100644 --- a/src/video_core/shader/shader_jit_a64_compiler.cpp +++ b/src/video_core/shader/shader_jit_a64_compiler.cpp @@ -942,7 +942,7 @@ void JitShader::Compile(const std::array* program_ swizzle_data = swizzle_data_; // Reset flow control state - program = xptr(); + const std::uintptr_t program_offset = offset(); program_counter = 0; loop_depth = 0; instruction_labels.fill(Label()); @@ -984,18 +984,28 @@ void JitShader::Compile(const std::array* program_ return_offsets.clear(); return_offsets.shrink_to_fit(); + // Copy to executable memory + const size_t code_size = code_vec.size() * sizeof(u32); + + code_mem = std::make_unique(code_size); + code_mem->unprotect(); + + program = reinterpret_cast(reinterpret_cast(code_mem->ptr()) + + program_offset); + + // Copy to executable memory + std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32)); + // Memory is ready to execute - protect(); - invalidate_all(); + code_mem->protect(); + code_mem->invalidate_all(); - const std::size_t code_size = static_cast(offset()); - - ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); - LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size); + // code_vec is no longer needed + code_vec.clear(); + code_vec.shrink_to_fit(); } -JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) { - unprotect(); +JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) { CompilePrelude(); } @@ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() { // range. Coefficients for the minimax polynomial. // f(x) computes approximately log2(x) / (x - 1). // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)). 
- align(16); - const void* c0 = xptr(); + oaknut::Label c0; + // align(16); + l(c0); dw(0x3d74552f); - align(16); - const void* c14 = xptr(); + // align(16); + oaknut::Label c14; + l(c14); dw(0xbeee7397); dw(0x3fbd96dd); dw(0xc02153f6); dw(0x4038d96c); - align(16); - const void* negative_infinity_vector = xptr(); + // align(16); + oaknut::Label negative_infinity_vector; + l(negative_infinity_vector); dw(0xff800000); dw(0xff800000); dw(0xff800000); @@ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() { Label input_is_nan, input_is_zero, input_out_of_range; - align(16); + // align(16); l(input_out_of_range); B(Cond::EQ, input_is_zero); - MOVP2R(XSCRATCH0, default_qnan_vector); + ADR(XSCRATCH0, default_qnan_vector); LDR(SRC1, XSCRATCH0); RET(); l(input_is_zero); - MOVP2R(XSCRATCH0, negative_infinity_vector); + ADR(XSCRATCH0, negative_infinity_vector); LDR(SRC1, XSCRATCH0); RET(); - align(16); + // align(16); l(subroutine); // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}. @@ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() { UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS()); // VSCRATCH1 now contains the exponent of the input. - MOVP2R(XSCRATCH0, c0); + ADR(XSCRATCH0, c0); LDR(XSCRATCH0.toW(), XSCRATCH0); MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW()); // Complete computation of polynomial // Load C1,C2,C3,C4 into a single scratch register const QReg C14 = SRC2; - MOVP2R(XSCRATCH0, c14); + ADR(XSCRATCH0, c14); LDR(C14, XSCRATCH0); FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS()); FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]); @@ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() { // polynomial which was fit for the function exp2(x) is then evaluated. We then restore the // result into the appropriate range. 
- align(16); - const void* input_max = xptr(); + // align(16); + Label input_max; + l(input_max); dw(0x43010000); - const void* input_min = xptr(); + Label input_min; + l(input_min); dw(0xc2fdffff); - const void* c0 = xptr(); + Label c0; + l(c0); dw(0x3c5dbe69); - const void* half = xptr(); + Label half; + l(half); dw(0x3f000000); - const void* c1 = xptr(); + Label c1; + l(c1); dw(0x3d5509f9); - const void* c2 = xptr(); + Label c2; + l(c2); dw(0x3e773cc5); - const void* c3 = xptr(); + Label c3; + l(c3); dw(0x3f3168b3); - const void* c4 = xptr(); + Label c4; + l(c4); dw(0x3f800016); Label ret_label; - align(16); + // align(16); l(subroutine); // Handle edge cases @@ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() { // VSCRATCH0=2^round(input) // SRC1=input-round(input) [-0.5, 0.5) // Clamp to maximum range since we shift the value directly into the exponent. - MOVP2R(XSCRATCH0, input_max); + ADR(XSCRATCH0, input_max); LDR(VSCRATCH0.toS(), XSCRATCH0); FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS()); - MOVP2R(XSCRATCH0, input_min); + ADR(XSCRATCH0, input_min); LDR(VSCRATCH0.toS(), XSCRATCH0); FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS()); - MOVP2R(XSCRATCH0, half); + ADR(XSCRATCH0, half); LDR(VSCRATCH0.toS(), XSCRATCH0); FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS()); diff --git a/src/video_core/shader/shader_jit_a64_compiler.h b/src/video_core/shader/shader_jit_a64_compiler.h index 9819b6b7e..7accf66ac 100644 --- a/src/video_core/shader/shader_jit_a64_compiler.h +++ b/src/video_core/shader/shader_jit_a64_compiler.h @@ -30,20 +30,17 @@ struct ShaderUnit; namespace Pica::Shader { -/// Memory allocated for each compiled shader -constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256; - /** * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 * code that can be executed on the host machine directly. 
*/ -class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator { +class JitShader : public oaknut::VectorCodeGenerator { public: JitShader(); void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const { program(&setup.uniforms, &state, - reinterpret_cast(oaknut::CodeBlock::ptr()) + + reinterpret_cast(code_mem->ptr()) + instruction_labels[offset].offset()); } @@ -81,6 +78,9 @@ public: void Compile_SETE(Instruction instr); private: + std::vector code_vec; + std::unique_ptr code_mem; + void Compile_Block(u32 end); void Compile_NextInstr();