mirror of
https://github.com/PabloMK7/citra
synced 2024-11-14 20:58:23 +00:00
shader_jit_a64: Compact host executable memory (#230)
* common/aarch64: Allow generic code generator types Use the templated `BasicCodeGenerator` type rather than the specialized `CodeGenerator` type. Allows `VectorCodeGenerator` to work with these functions. * common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction` `VectorCodeGenerator` will always do far-calls since we cannot resolve any absolute addresses here. * shader_jit_a64: Implement position-independent VectorCodeGenerator Generates more position-independent assembly to allow for code to be generated within a resizable vector before copying into executable memory, allowing for more compact memory allocations and usage rather than a statically defined worst-case for all-cases. `VectorCodeGenerator` will need to generate position-independent code rather than use absolute addresses. Assumes all far function calls in the case of `VectorCodeGenerator` to use absolute addresses rather than potentially use a relative `BL` branch after memory relocation.
This commit is contained in:
parent
82faf2e557
commit
3e5bbac5a1
4 changed files with 74 additions and 41 deletions
|
@ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra
|
|||
return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)};
|
||||
}
|
||||
|
||||
inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
|
||||
template <typename Policy>
|
||||
inline void ABI_PushRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
|
||||
std::size_t frame_size = 0) {
|
||||
using namespace oaknut;
|
||||
using namespace oaknut::util;
|
||||
|
@ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
|
|||
}
|
||||
}
|
||||
|
||||
inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
|
||||
template <typename Policy>
|
||||
inline void ABI_PopRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
|
||||
std::size_t frame_size = 0) {
|
||||
using namespace oaknut;
|
||||
using namespace oaknut::util;
|
||||
|
|
|
@ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) {
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) {
|
||||
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
|
||||
// X16(IP0) and X17(IP1) is the standard veneer register
|
||||
// LR is also available as an intermediate register
|
||||
// https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
|
||||
code.MOVP2R(oaknut::util::X16, reinterpret_cast<const void*>(f));
|
||||
code.BLR(oaknut::util::X16);
|
||||
}
|
||||
|
||||
} // namespace Common::A64
|
||||
|
||||
#endif // CITRA_ARCH(arm64)
|
||||
|
|
|
@ -942,7 +942,7 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
|
|||
swizzle_data = swizzle_data_;
|
||||
|
||||
// Reset flow control state
|
||||
program = xptr<CompiledShader*>();
|
||||
const std::uintptr_t program_offset = offset();
|
||||
program_counter = 0;
|
||||
loop_depth = 0;
|
||||
instruction_labels.fill(Label());
|
||||
|
@ -984,18 +984,28 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
|
|||
return_offsets.clear();
|
||||
return_offsets.shrink_to_fit();
|
||||
|
||||
// Copy to executable memory
|
||||
const size_t code_size = code_vec.size() * sizeof(u32);
|
||||
|
||||
code_mem = std::make_unique<oaknut::CodeBlock>(code_size);
|
||||
code_mem->unprotect();
|
||||
|
||||
program = reinterpret_cast<CompiledShader*>(reinterpret_cast<std::byte*>(code_mem->ptr()) +
|
||||
program_offset);
|
||||
|
||||
// Copy to executable memory
|
||||
std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32));
|
||||
|
||||
// Memory is ready to execute
|
||||
protect();
|
||||
invalidate_all();
|
||||
code_mem->protect();
|
||||
code_mem->invalidate_all();
|
||||
|
||||
const std::size_t code_size = static_cast<std::size_t>(offset());
|
||||
|
||||
ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
|
||||
LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size);
|
||||
// code_vec is no longer needed
|
||||
code_vec.clear();
|
||||
code_vec.shrink_to_fit();
|
||||
}
|
||||
|
||||
JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) {
|
||||
unprotect();
|
||||
JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {
|
||||
CompilePrelude();
|
||||
}
|
||||
|
||||
|
@ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() {
|
|||
// range. Coefficients for the minimax polynomial.
|
||||
// f(x) computes approximately log2(x) / (x - 1).
|
||||
// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)).
|
||||
align(16);
|
||||
const void* c0 = xptr<const void*>();
|
||||
oaknut::Label c0;
|
||||
// align(16);
|
||||
l(c0);
|
||||
dw(0x3d74552f);
|
||||
|
||||
align(16);
|
||||
const void* c14 = xptr<const void*>();
|
||||
// align(16);
|
||||
oaknut::Label c14;
|
||||
l(c14);
|
||||
dw(0xbeee7397);
|
||||
dw(0x3fbd96dd);
|
||||
dw(0xc02153f6);
|
||||
dw(0x4038d96c);
|
||||
|
||||
align(16);
|
||||
const void* negative_infinity_vector = xptr<const void*>();
|
||||
// align(16);
|
||||
oaknut::Label negative_infinity_vector;
|
||||
l(negative_infinity_vector);
|
||||
dw(0xff800000);
|
||||
dw(0xff800000);
|
||||
dw(0xff800000);
|
||||
|
@ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() {
|
|||
|
||||
Label input_is_nan, input_is_zero, input_out_of_range;
|
||||
|
||||
align(16);
|
||||
// align(16);
|
||||
l(input_out_of_range);
|
||||
B(Cond::EQ, input_is_zero);
|
||||
MOVP2R(XSCRATCH0, default_qnan_vector);
|
||||
ADR(XSCRATCH0, default_qnan_vector);
|
||||
LDR(SRC1, XSCRATCH0);
|
||||
RET();
|
||||
|
||||
l(input_is_zero);
|
||||
MOVP2R(XSCRATCH0, negative_infinity_vector);
|
||||
ADR(XSCRATCH0, negative_infinity_vector);
|
||||
LDR(SRC1, XSCRATCH0);
|
||||
RET();
|
||||
|
||||
align(16);
|
||||
// align(16);
|
||||
l(subroutine);
|
||||
|
||||
// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
|
||||
|
@ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() {
|
|||
UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS());
|
||||
// VSCRATCH1 now contains the exponent of the input.
|
||||
|
||||
MOVP2R(XSCRATCH0, c0);
|
||||
ADR(XSCRATCH0, c0);
|
||||
LDR(XSCRATCH0.toW(), XSCRATCH0);
|
||||
MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW());
|
||||
|
||||
// Complete computation of polynomial
|
||||
// Load C1,C2,C3,C4 into a single scratch register
|
||||
const QReg C14 = SRC2;
|
||||
MOVP2R(XSCRATCH0, c14);
|
||||
ADR(XSCRATCH0, c14);
|
||||
LDR(C14, XSCRATCH0);
|
||||
FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS());
|
||||
FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]);
|
||||
|
@ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() {
|
|||
// polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
|
||||
// result into the appropriate range.
|
||||
|
||||
align(16);
|
||||
const void* input_max = xptr<const void*>();
|
||||
// align(16);
|
||||
Label input_max;
|
||||
l(input_max);
|
||||
dw(0x43010000);
|
||||
const void* input_min = xptr<const void*>();
|
||||
Label input_min;
|
||||
l(input_min);
|
||||
dw(0xc2fdffff);
|
||||
const void* c0 = xptr<const void*>();
|
||||
Label c0;
|
||||
l(c0);
|
||||
dw(0x3c5dbe69);
|
||||
const void* half = xptr<const void*>();
|
||||
Label half;
|
||||
l(half);
|
||||
dw(0x3f000000);
|
||||
const void* c1 = xptr<const void*>();
|
||||
Label c1;
|
||||
l(c1);
|
||||
dw(0x3d5509f9);
|
||||
const void* c2 = xptr<const void*>();
|
||||
Label c2;
|
||||
l(c2);
|
||||
dw(0x3e773cc5);
|
||||
const void* c3 = xptr<const void*>();
|
||||
Label c3;
|
||||
l(c3);
|
||||
dw(0x3f3168b3);
|
||||
const void* c4 = xptr<const void*>();
|
||||
Label c4;
|
||||
l(c4);
|
||||
dw(0x3f800016);
|
||||
|
||||
Label ret_label;
|
||||
|
||||
align(16);
|
||||
// align(16);
|
||||
l(subroutine);
|
||||
|
||||
// Handle edge cases
|
||||
|
@ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() {
|
|||
// VSCRATCH0=2^round(input)
|
||||
// SRC1=input-round(input) [-0.5, 0.5)
|
||||
// Clamp to maximum range since we shift the value directly into the exponent.
|
||||
MOVP2R(XSCRATCH0, input_max);
|
||||
ADR(XSCRATCH0, input_max);
|
||||
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
||||
FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
|
||||
|
||||
MOVP2R(XSCRATCH0, input_min);
|
||||
ADR(XSCRATCH0, input_min);
|
||||
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
||||
FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());
|
||||
|
||||
MOVP2R(XSCRATCH0, half);
|
||||
ADR(XSCRATCH0, half);
|
||||
LDR(VSCRATCH0.toS(), XSCRATCH0);
|
||||
FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS());
|
||||
|
||||
|
|
|
@ -30,20 +30,17 @@ struct ShaderUnit;
|
|||
|
||||
namespace Pica::Shader {
|
||||
|
||||
/// Memory allocated for each compiled shader
|
||||
constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256;
|
||||
|
||||
/**
|
||||
* This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
|
||||
* code that can be executed on the host machine directly.
|
||||
*/
|
||||
class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator {
|
||||
class JitShader : public oaknut::VectorCodeGenerator {
|
||||
public:
|
||||
JitShader();
|
||||
|
||||
void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const {
|
||||
program(&setup.uniforms, &state,
|
||||
reinterpret_cast<std::byte*>(oaknut::CodeBlock::ptr()) +
|
||||
reinterpret_cast<const std::byte*>(code_mem->ptr()) +
|
||||
instruction_labels[offset].offset());
|
||||
}
|
||||
|
||||
|
@ -81,6 +78,9 @@ public:
|
|||
void Compile_SETE(Instruction instr);
|
||||
|
||||
private:
|
||||
std::vector<u32> code_vec;
|
||||
std::unique_ptr<oaknut::CodeBlock> code_mem;
|
||||
|
||||
void Compile_Block(u32 end);
|
||||
void Compile_NextInstr();
|
||||
|
||||
|
|
Loading…
Reference in a new issue