shader_jit_a64: Compact host executable memory (#230)

* common/aarch64: Allow generic code generator types

Use the templated `BasicCodeGenerator` type rather than the specialized
`CodeGenerator` type.
Allows `VectorCodeGenerator` to work with these functions.

* common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction`

`VectorCodeGenerator` will always do far-calls since we cannot resolve any absolute addresses here.

* shader_jit_a64: Implement position-independent VectorCodeGenerator

Generates position-independent assembly so that code can first be
emitted into a resizable vector and then copied into executable
memory. This allows compact, exactly-sized executable allocations
instead of a statically defined worst-case allocation for every shader.

`VectorCodeGenerator` must generate position-independent code
rather than use absolute addresses. Far function calls emitted through
`VectorCodeGenerator` are assumed to always load the absolute target
address, rather than potentially using a relative `BL` branch, since the
code is relocated when it is copied into executable memory.
This commit is contained in:
Wunk 2024-09-01 03:24:13 -07:00 committed by GitHub
parent 82faf2e557
commit 3e5bbac5a1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 74 additions and 41 deletions

View file

@ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra
return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)}; return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)};
} }
inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, template <typename Policy>
inline void ABI_PushRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
std::size_t frame_size = 0) { std::size_t frame_size = 0) {
using namespace oaknut; using namespace oaknut;
using namespace oaknut::util; using namespace oaknut::util;
@ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
} }
} }
inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, template <typename Policy>
inline void ABI_PopRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
std::size_t frame_size = 0) { std::size_t frame_size = 0) {
using namespace oaknut; using namespace oaknut;
using namespace oaknut::util; using namespace oaknut::util;

View file

@ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) {
} }
} }
/// Emits a call to `f` through `code` by loading the pointer value into X16 with MOVP2R
/// and then branching to it with BLR.
template <typename T>
inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) {
    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");

    // X16 (IP0) and X17 (IP1) are the standard veneer registers;
    // LR is also available as an intermediate register.
    // https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
    const auto& veneer_reg = oaknut::util::X16;
    const void* const target = reinterpret_cast<const void*>(f);

    code.MOVP2R(veneer_reg, target);
    code.BLR(veneer_reg);
}
} // namespace Common::A64 } // namespace Common::A64
#endif // CITRA_ARCH(arm64) #endif // CITRA_ARCH(arm64)

View file

@ -942,7 +942,7 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
swizzle_data = swizzle_data_; swizzle_data = swizzle_data_;
// Reset flow control state // Reset flow control state
program = xptr<CompiledShader*>(); const std::uintptr_t program_offset = offset();
program_counter = 0; program_counter = 0;
loop_depth = 0; loop_depth = 0;
instruction_labels.fill(Label()); instruction_labels.fill(Label());
@ -984,18 +984,28 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
return_offsets.clear(); return_offsets.clear();
return_offsets.shrink_to_fit(); return_offsets.shrink_to_fit();
// Copy to executable memory
const size_t code_size = code_vec.size() * sizeof(u32);
code_mem = std::make_unique<oaknut::CodeBlock>(code_size);
code_mem->unprotect();
program = reinterpret_cast<CompiledShader*>(reinterpret_cast<std::byte*>(code_mem->ptr()) +
program_offset);
// Copy to executable memory
std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32));
// Memory is ready to execute // Memory is ready to execute
protect(); code_mem->protect();
invalidate_all(); code_mem->invalidate_all();
const std::size_t code_size = static_cast<std::size_t>(offset()); // code_vec is no longer needed
code_vec.clear();
ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); code_vec.shrink_to_fit();
LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size);
} }
JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) { JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {
unprotect();
CompilePrelude(); CompilePrelude();
} }
@ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() {
// range. Coefficients for the minimax polynomial. // range. Coefficients for the minimax polynomial.
// f(x) computes approximately log2(x) / (x - 1). // f(x) computes approximately log2(x) / (x - 1).
// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)). // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)).
align(16); oaknut::Label c0;
const void* c0 = xptr<const void*>(); // align(16);
l(c0);
dw(0x3d74552f); dw(0x3d74552f);
align(16); // align(16);
const void* c14 = xptr<const void*>(); oaknut::Label c14;
l(c14);
dw(0xbeee7397); dw(0xbeee7397);
dw(0x3fbd96dd); dw(0x3fbd96dd);
dw(0xc02153f6); dw(0xc02153f6);
dw(0x4038d96c); dw(0x4038d96c);
align(16); // align(16);
const void* negative_infinity_vector = xptr<const void*>(); oaknut::Label negative_infinity_vector;
l(negative_infinity_vector);
dw(0xff800000); dw(0xff800000);
dw(0xff800000); dw(0xff800000);
dw(0xff800000); dw(0xff800000);
@ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() {
Label input_is_nan, input_is_zero, input_out_of_range; Label input_is_nan, input_is_zero, input_out_of_range;
align(16); // align(16);
l(input_out_of_range); l(input_out_of_range);
B(Cond::EQ, input_is_zero); B(Cond::EQ, input_is_zero);
MOVP2R(XSCRATCH0, default_qnan_vector); ADR(XSCRATCH0, default_qnan_vector);
LDR(SRC1, XSCRATCH0); LDR(SRC1, XSCRATCH0);
RET(); RET();
l(input_is_zero); l(input_is_zero);
MOVP2R(XSCRATCH0, negative_infinity_vector); ADR(XSCRATCH0, negative_infinity_vector);
LDR(SRC1, XSCRATCH0); LDR(SRC1, XSCRATCH0);
RET(); RET();
align(16); // align(16);
l(subroutine); l(subroutine);
// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}. // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
@ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() {
UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS()); UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS());
// VSCRATCH1 now contains the exponent of the input. // VSCRATCH1 now contains the exponent of the input.
MOVP2R(XSCRATCH0, c0); ADR(XSCRATCH0, c0);
LDR(XSCRATCH0.toW(), XSCRATCH0); LDR(XSCRATCH0.toW(), XSCRATCH0);
MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW()); MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW());
// Complete computation of polynomial // Complete computation of polynomial
// Load C1,C2,C3,C4 into a single scratch register // Load C1,C2,C3,C4 into a single scratch register
const QReg C14 = SRC2; const QReg C14 = SRC2;
MOVP2R(XSCRATCH0, c14); ADR(XSCRATCH0, c14);
LDR(C14, XSCRATCH0); LDR(C14, XSCRATCH0);
FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS()); FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS());
FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]); FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]);
@ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() {
// polynomial which was fit for the function exp2(x) is then evaluated. We then restore the // polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
// result into the appropriate range. // result into the appropriate range.
align(16); // align(16);
const void* input_max = xptr<const void*>(); Label input_max;
l(input_max);
dw(0x43010000); dw(0x43010000);
const void* input_min = xptr<const void*>(); Label input_min;
l(input_min);
dw(0xc2fdffff); dw(0xc2fdffff);
const void* c0 = xptr<const void*>(); Label c0;
l(c0);
dw(0x3c5dbe69); dw(0x3c5dbe69);
const void* half = xptr<const void*>(); Label half;
l(half);
dw(0x3f000000); dw(0x3f000000);
const void* c1 = xptr<const void*>(); Label c1;
l(c1);
dw(0x3d5509f9); dw(0x3d5509f9);
const void* c2 = xptr<const void*>(); Label c2;
l(c2);
dw(0x3e773cc5); dw(0x3e773cc5);
const void* c3 = xptr<const void*>(); Label c3;
l(c3);
dw(0x3f3168b3); dw(0x3f3168b3);
const void* c4 = xptr<const void*>(); Label c4;
l(c4);
dw(0x3f800016); dw(0x3f800016);
Label ret_label; Label ret_label;
align(16); // align(16);
l(subroutine); l(subroutine);
// Handle edge cases // Handle edge cases
@ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() {
// VSCRATCH0=2^round(input) // VSCRATCH0=2^round(input)
// SRC1=input-round(input) [-0.5, 0.5) // SRC1=input-round(input) [-0.5, 0.5)
// Clamp to maximum range since we shift the value directly into the exponent. // Clamp to maximum range since we shift the value directly into the exponent.
MOVP2R(XSCRATCH0, input_max); ADR(XSCRATCH0, input_max);
LDR(VSCRATCH0.toS(), XSCRATCH0); LDR(VSCRATCH0.toS(), XSCRATCH0);
FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS()); FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
MOVP2R(XSCRATCH0, input_min); ADR(XSCRATCH0, input_min);
LDR(VSCRATCH0.toS(), XSCRATCH0); LDR(VSCRATCH0.toS(), XSCRATCH0);
FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS()); FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
MOVP2R(XSCRATCH0, half); ADR(XSCRATCH0, half);
LDR(VSCRATCH0.toS(), XSCRATCH0); LDR(VSCRATCH0.toS(), XSCRATCH0);
FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS()); FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS());

View file

@ -30,20 +30,17 @@ struct ShaderUnit;
namespace Pica::Shader { namespace Pica::Shader {
/// Memory allocated for each compiled shader
constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256;
/** /**
* This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
* code that can be executed on the host machine directly. * code that can be executed on the host machine directly.
*/ */
class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator { class JitShader : public oaknut::VectorCodeGenerator {
public: public:
JitShader(); JitShader();
void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const { void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const {
program(&setup.uniforms, &state, program(&setup.uniforms, &state,
reinterpret_cast<std::byte*>(oaknut::CodeBlock::ptr()) + reinterpret_cast<const std::byte*>(code_mem->ptr()) +
instruction_labels[offset].offset()); instruction_labels[offset].offset());
} }
@ -81,6 +78,9 @@ public:
void Compile_SETE(Instruction instr); void Compile_SETE(Instruction instr);
private: private:
std::vector<u32> code_vec;
std::unique_ptr<oaknut::CodeBlock> code_mem;
void Compile_Block(u32 end); void Compile_Block(u32 end);
void Compile_NextInstr(); void Compile_NextInstr();