mirror of
https://github.com/PabloMK7/citra
synced 2024-11-15 05:08:23 +00:00
shader_jit_a64: Compact host executable memory (#230)
* common/aarch64: Allow generic code generator types. Use the templated `BasicCodeGenerator` type rather than the specialized `CodeGenerator` type. This allows `VectorCodeGenerator` to work with these functions.
* common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction`. `VectorCodeGenerator` will always do far-calls, since we cannot resolve any absolute addresses here.
* shader_jit_a64: Implement position-independent `VectorCodeGenerator`. Generates more position-independent assembly to allow code to be generated within a resizable vector before being copied into executable memory, allowing for more compact memory allocation and usage rather than a statically defined worst-case allocation for all cases. `VectorCodeGenerator` needs to generate position-independent code rather than use absolute addresses. All far function calls made through `VectorCodeGenerator` are assumed to use absolute addresses, rather than potentially using a relative `BL` branch after memory relocation.
This commit is contained in:
parent
82faf2e557
commit
3e5bbac5a1
4 changed files with 74 additions and 41 deletions
|
@ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra
|
||||||
return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)};
|
return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)};
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
|
template <typename Policy>
|
||||||
|
inline void ABI_PushRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
|
||||||
std::size_t frame_size = 0) {
|
std::size_t frame_size = 0) {
|
||||||
using namespace oaknut;
|
using namespace oaknut;
|
||||||
using namespace oaknut::util;
|
using namespace oaknut::util;
|
||||||
|
@ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
|
template <typename Policy>
|
||||||
|
inline void ABI_PopRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
|
||||||
std::size_t frame_size = 0) {
|
std::size_t frame_size = 0) {
|
||||||
using namespace oaknut;
|
using namespace oaknut;
|
||||||
using namespace oaknut::util;
|
using namespace oaknut::util;
|
||||||
|
|
|
@ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) {
|
||||||
|
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
|
||||||
|
// X16(IP0) and X17(IP1) is the standard veneer register
|
||||||
|
// LR is also available as an intermediate register
|
||||||
|
// https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
|
||||||
|
code.MOVP2R(oaknut::util::X16, reinterpret_cast<const void*>(f));
|
||||||
|
code.BLR(oaknut::util::X16);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Common::A64
|
} // namespace Common::A64
|
||||||
|
|
||||||
#endif // CITRA_ARCH(arm64)
|
#endif // CITRA_ARCH(arm64)
|
||||||
|
|
|
@ -942,7 +942,7 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
|
||||||
swizzle_data = swizzle_data_;
|
swizzle_data = swizzle_data_;
|
||||||
|
|
||||||
// Reset flow control state
|
// Reset flow control state
|
||||||
program = xptr<CompiledShader*>();
|
const std::uintptr_t program_offset = offset();
|
||||||
program_counter = 0;
|
program_counter = 0;
|
||||||
loop_depth = 0;
|
loop_depth = 0;
|
||||||
instruction_labels.fill(Label());
|
instruction_labels.fill(Label());
|
||||||
|
@ -984,18 +984,28 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
|
||||||
return_offsets.clear();
|
return_offsets.clear();
|
||||||
return_offsets.shrink_to_fit();
|
return_offsets.shrink_to_fit();
|
||||||
|
|
||||||
|
// Copy to executable memory
|
||||||
|
const size_t code_size = code_vec.size() * sizeof(u32);
|
||||||
|
|
||||||
|
code_mem = std::make_unique<oaknut::CodeBlock>(code_size);
|
||||||
|
code_mem->unprotect();
|
||||||
|
|
||||||
|
program = reinterpret_cast<CompiledShader*>(reinterpret_cast<std::byte*>(code_mem->ptr()) +
|
||||||
|
program_offset);
|
||||||
|
|
||||||
|
// Copy to executable memory
|
||||||
|
std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32));
|
||||||
|
|
||||||
// Memory is ready to execute
|
// Memory is ready to execute
|
||||||
protect();
|
code_mem->protect();
|
||||||
invalidate_all();
|
code_mem->invalidate_all();
|
||||||
|
|
||||||
const std::size_t code_size = static_cast<std::size_t>(offset());
|
// code_vec is no longer needed
|
||||||
|
code_vec.clear();
|
||||||
ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
|
code_vec.shrink_to_fit();
|
||||||
LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) {
|
JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {
|
||||||
unprotect();
|
|
||||||
CompilePrelude();
|
CompilePrelude();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() {
|
||||||
// range. Coefficients for the minimax polynomial.
|
// range. Coefficients for the minimax polynomial.
|
||||||
// f(x) computes approximately log2(x) / (x - 1).
|
// f(x) computes approximately log2(x) / (x - 1).
|
||||||
// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)).
|
// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)).
|
||||||
align(16);
|
oaknut::Label c0;
|
||||||
const void* c0 = xptr<const void*>();
|
// align(16);
|
||||||
|
l(c0);
|
||||||
dw(0x3d74552f);
|
dw(0x3d74552f);
|
||||||
|
|
||||||
align(16);
|
// align(16);
|
||||||
const void* c14 = xptr<const void*>();
|
oaknut::Label c14;
|
||||||
|
l(c14);
|
||||||
dw(0xbeee7397);
|
dw(0xbeee7397);
|
||||||
dw(0x3fbd96dd);
|
dw(0x3fbd96dd);
|
||||||
dw(0xc02153f6);
|
dw(0xc02153f6);
|
||||||
dw(0x4038d96c);
|
dw(0x4038d96c);
|
||||||
|
|
||||||
align(16);
|
// align(16);
|
||||||
const void* negative_infinity_vector = xptr<const void*>();
|
oaknut::Label negative_infinity_vector;
|
||||||
|
l(negative_infinity_vector);
|
||||||
dw(0xff800000);
|
dw(0xff800000);
|
||||||
dw(0xff800000);
|
dw(0xff800000);
|
||||||
dw(0xff800000);
|
dw(0xff800000);
|
||||||
|
@ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() {
|
||||||
|
|
||||||
Label input_is_nan, input_is_zero, input_out_of_range;
|
Label input_is_nan, input_is_zero, input_out_of_range;
|
||||||
|
|
||||||
align(16);
|
// align(16);
|
||||||
l(input_out_of_range);
|
l(input_out_of_range);
|
||||||
B(Cond::EQ, input_is_zero);
|
B(Cond::EQ, input_is_zero);
|
||||||
MOVP2R(XSCRATCH0, default_qnan_vector);
|
ADR(XSCRATCH0, default_qnan_vector);
|
||||||
LDR(SRC1, XSCRATCH0);
|
LDR(SRC1, XSCRATCH0);
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
l(input_is_zero);
|
l(input_is_zero);
|
||||||
MOVP2R(XSCRATCH0, negative_infinity_vector);
|
ADR(XSCRATCH0, negative_infinity_vector);
|
||||||
LDR(SRC1, XSCRATCH0);
|
LDR(SRC1, XSCRATCH0);
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
align(16);
|
// align(16);
|
||||||
l(subroutine);
|
l(subroutine);
|
||||||
|
|
||||||
// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
|
// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
|
||||||
|
@ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() {
|
||||||
UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS());
|
UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS());
|
||||||
// VSCRATCH1 now contains the exponent of the input.
|
// VSCRATCH1 now contains the exponent of the input.
|
||||||
|
|
||||||
MOVP2R(XSCRATCH0, c0);
|
ADR(XSCRATCH0, c0);
|
||||||
LDR(XSCRATCH0.toW(), XSCRATCH0);
|
LDR(XSCRATCH0.toW(), XSCRATCH0);
|
||||||
MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW());
|
MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW());
|
||||||
|
|
||||||
// Complete computation of polynomial
|
// Complete computation of polynomial
|
||||||
// Load C1,C2,C3,C4 into a single scratch register
|
// Load C1,C2,C3,C4 into a single scratch register
|
||||||
const QReg C14 = SRC2;
|
const QReg C14 = SRC2;
|
||||||
MOVP2R(XSCRATCH0, c14);
|
ADR(XSCRATCH0, c14);
|
||||||
LDR(C14, XSCRATCH0);
|
LDR(C14, XSCRATCH0);
|
||||||
FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS());
|
FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS());
|
||||||
FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]);
|
FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]);
|
||||||
|
@ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() {
|
||||||
// polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
|
// polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
|
||||||
// result into the appropriate range.
|
// result into the appropriate range.
|
||||||
|
|
||||||
align(16);
|
// align(16);
|
||||||
const void* input_max = xptr<const void*>();
|
Label input_max;
|
||||||
|
l(input_max);
|
||||||
dw(0x43010000);
|
dw(0x43010000);
|
||||||
const void* input_min = xptr<const void*>();
|
Label input_min;
|
||||||
|
l(input_min);
|
||||||
dw(0xc2fdffff);
|
dw(0xc2fdffff);
|
||||||
const void* c0 = xptr<const void*>();
|
Label c0;
|
||||||
|
l(c0);
|
||||||
dw(0x3c5dbe69);
|
dw(0x3c5dbe69);
|
||||||
const void* half = xptr<const void*>();
|
Label half;
|
||||||
|
l(half);
|
||||||
dw(0x3f000000);
|
dw(0x3f000000);
|
||||||
const void* c1 = xptr<const void*>();
|
Label c1;
|
||||||
|
l(c1);
|
||||||
dw(0x3d5509f9);
|
dw(0x3d5509f9);
|
||||||
const void* c2 = xptr<const void*>();
|
Label c2;
|
||||||
|
l(c2);
|
||||||
dw(0x3e773cc5);
|
dw(0x3e773cc5);
|
||||||
const void* c3 = xptr<const void*>();
|
Label c3;
|
||||||
|
l(c3);
|
||||||
dw(0x3f3168b3);
|
dw(0x3f3168b3);
|
||||||
const void* c4 = xptr<const void*>();
|
Label c4;
|
||||||
|
l(c4);
|
||||||
dw(0x3f800016);
|
dw(0x3f800016);
|
||||||
|
|
||||||
Label ret_label;
|
Label ret_label;
|
||||||
|
|
||||||
align(16);
|
// align(16);
|
||||||
l(subroutine);
|
l(subroutine);
|
||||||
|
|
||||||
// Handle edge cases
|
// Handle edge cases
|
||||||
|
@ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() {
|
||||||
// VSCRATCH0=2^round(input)
|
// VSCRATCH0=2^round(input)
|
||||||
// SRC1=input-round(input) [-0.5, 0.5)
|
// SRC1=input-round(input) [-0.5, 0.5)
|
||||||
// Clamp to maximum range since we shift the value directly into the exponent.
|
// Clamp to maximum range since we shift the value directly into the exponent.
|
||||||
MOVP2R(XSCRATCH0, input_max);
|
ADR(XSCRATCH0, input_max);
|
||||||
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
||||||
FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
|
FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
|
||||||
|
|
||||||
MOVP2R(XSCRATCH0, input_min);
|
ADR(XSCRATCH0, input_min);
|
||||||
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
||||||
FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
|
FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
|
||||||
|
|
||||||
MOVP2R(XSCRATCH0, half);
|
ADR(XSCRATCH0, half);
|
||||||
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
||||||
FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS());
|
FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS());
|
||||||
|
|
||||||
|
|
|
@ -30,20 +30,17 @@ struct ShaderUnit;
|
||||||
|
|
||||||
namespace Pica::Shader {
|
namespace Pica::Shader {
|
||||||
|
|
||||||
/// Memory allocated for each compiled shader
|
|
||||||
constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class implements the shader JIT compiler. It recompiles a Pica shader program into AArch64
|
* This class implements the shader JIT compiler. It recompiles a Pica shader program into AArch64
|
||||||
* code that can be executed on the host machine directly.
|
* code that can be executed on the host machine directly.
|
||||||
*/
|
*/
|
||||||
class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator {
|
class JitShader : public oaknut::VectorCodeGenerator {
|
||||||
public:
|
public:
|
||||||
JitShader();
|
JitShader();
|
||||||
|
|
||||||
void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const {
|
void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const {
|
||||||
program(&setup.uniforms, &state,
|
program(&setup.uniforms, &state,
|
||||||
reinterpret_cast<std::byte*>(oaknut::CodeBlock::ptr()) +
|
reinterpret_cast<const std::byte*>(code_mem->ptr()) +
|
||||||
instruction_labels[offset].offset());
|
instruction_labels[offset].offset());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,6 +78,9 @@ public:
|
||||||
void Compile_SETE(Instruction instr);
|
void Compile_SETE(Instruction instr);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
std::vector<u32> code_vec;
|
||||||
|
std::unique_ptr<oaknut::CodeBlock> code_mem;
|
||||||
|
|
||||||
void Compile_Block(u32 end);
|
void Compile_Block(u32 end);
|
||||||
void Compile_NextInstr();
|
void Compile_NextInstr();
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue