mirror of
https://github.com/PabloMK7/citra
synced 2024-11-14 20:58:23 +00:00
shader_jit: Fix/optimize conditional evaluation (#234)
* shader_jit: Add conditional unit-tests Tests all permutations of X, Y, AND, OR with each possible input value. * video_core: Fix shader-interpreter conditional-code initialization Rather than preserving the incoming state of the conditional codes, the shader-interpreter was setting them both to false. In pretty much all cases, the initial state of a shader unit can be zero-initialized statically. Just running the interpreter shouldn't necessarily reset the conditional codes though. The JIT loads incoming conditional codes while the shader-interpreter resets them to false. This makes the interpreter match the behavior of the shader-jit. * shader_jit_a64: Fix/optimize conditional evaluation Fix some of the regressions introduced by the previous optimization. EOR does not support a constant of `0` in its immediate. In these cases the COND{0,1} registers can be utilized immediately. * shader_jit_x64: Fix conditional evaluation extended-bit hazard The unit test seems to have identified a bug in the x64 jit too. The x64 jit was doing 32-bit comparisons despite the condition flags being 8-bit values and is sensitive to garbage being in the upper 24 bits of the register. This is fixed by using the proper 8-bit register types rather than the 32-bit ones (`eax`, `ebx` -> `al`, `bl`). * shader_jit_x64: Zero-extend conditional-code bytes `mov` was doing a partial update of bits within the register, allowing garbage to be introduced in the upper bits of the register.
This commit is contained in:
parent
f248fefe06
commit
52f06f757f
6 changed files with 144 additions and 40 deletions
|
@ -674,6 +674,94 @@ TEMPLATE_TEST_CASE("Nested Loop", "[video_core][shader]", ShaderJitTest) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SHADER_TEST_CASE("Conditional", "[video_core][shader]") {
|
||||||
|
const auto sh_input = SourceRegister::MakeInput(0);
|
||||||
|
const auto sh_temp = SourceRegister::MakeTemporary(0);
|
||||||
|
const auto sh_output = DestRegister::MakeOutput(0);
|
||||||
|
|
||||||
|
const std::initializer_list<nihstro::InlineAsm> assembly_template = {
|
||||||
|
// IFC configured later
|
||||||
|
{OpCode::Id::NOP},
|
||||||
|
// True
|
||||||
|
{OpCode::Id::MOV, sh_output, sh_input},
|
||||||
|
{OpCode::Id::END},
|
||||||
|
// False
|
||||||
|
{OpCode::Id::MOV, sh_output, sh_temp},
|
||||||
|
{OpCode::Id::END},
|
||||||
|
};
|
||||||
|
|
||||||
|
const bool ref_x = GENERATE(0, 1);
|
||||||
|
const bool cmp_x = GENERATE(0, 1);
|
||||||
|
const bool result_x = (cmp_x == ref_x);
|
||||||
|
|
||||||
|
const bool ref_y = GENERATE(0, 1);
|
||||||
|
const bool cmp_y = GENERATE(0, 1);
|
||||||
|
const bool result_y = (cmp_y == ref_y);
|
||||||
|
|
||||||
|
nihstro::Instruction IFC = {};
|
||||||
|
IFC.opcode = nihstro::OpCode::Id::IFC;
|
||||||
|
IFC.flow_control.num_instructions = 2;
|
||||||
|
IFC.flow_control.dest_offset = 3;
|
||||||
|
IFC.flow_control.refx = ref_x;
|
||||||
|
IFC.flow_control.refy = ref_y;
|
||||||
|
|
||||||
|
Pica::ShaderUnit shader_unit;
|
||||||
|
shader_unit.conditional_code[0] = cmp_x;
|
||||||
|
shader_unit.conditional_code[1] = cmp_y;
|
||||||
|
|
||||||
|
// JustX
|
||||||
|
{
|
||||||
|
auto shader_setup = CompileShaderSetup(assembly_template);
|
||||||
|
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::JustX;
|
||||||
|
shader_setup->program_code[0] = IFC.hex;
|
||||||
|
const float result = result_x ? 1.0f : 0.0f;
|
||||||
|
|
||||||
|
auto shader_test = TestType(std::move(shader_setup));
|
||||||
|
shader_test.Run(shader_unit, 1.0f);
|
||||||
|
|
||||||
|
REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// JustY
|
||||||
|
{
|
||||||
|
auto shader_setup = CompileShaderSetup(assembly_template);
|
||||||
|
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::JustY;
|
||||||
|
shader_setup->program_code[0] = IFC.hex;
|
||||||
|
const float result = result_y ? 1.0f : 0.0f;
|
||||||
|
|
||||||
|
auto shader_test = TestType(std::move(shader_setup));
|
||||||
|
shader_test.Run(shader_unit, 1.0f);
|
||||||
|
|
||||||
|
REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// OR
|
||||||
|
{
|
||||||
|
auto shader_setup = CompileShaderSetup(assembly_template);
|
||||||
|
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::Or;
|
||||||
|
shader_setup->program_code[0] = IFC.hex;
|
||||||
|
const float result = (result_x || result_y) ? 1.0f : 0.0f;
|
||||||
|
|
||||||
|
auto shader_test = TestType(std::move(shader_setup));
|
||||||
|
shader_test.Run(shader_unit, 1.0f);
|
||||||
|
|
||||||
|
REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// AND
|
||||||
|
{
|
||||||
|
auto shader_setup = CompileShaderSetup(assembly_template);
|
||||||
|
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::And;
|
||||||
|
shader_setup->program_code[0] = IFC.hex;
|
||||||
|
const float result = (result_x && result_y) ? 1.0f : 0.0f;
|
||||||
|
|
||||||
|
auto shader_test = TestType(std::move(shader_setup));
|
||||||
|
shader_test.Run(shader_unit, 1.0f);
|
||||||
|
|
||||||
|
REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
SHADER_TEST_CASE("Source Swizzle", "[video_core][shader]") {
|
SHADER_TEST_CASE("Source Swizzle", "[video_core][shader]") {
|
||||||
const auto sh_input = SourceRegister::MakeInput(0);
|
const auto sh_input = SourceRegister::MakeInput(0);
|
||||||
const auto sh_output = DestRegister::MakeOutput(0);
|
const auto sh_output = DestRegister::MakeOutput(0);
|
||||||
|
|
|
@ -9,10 +9,7 @@
|
||||||
|
|
||||||
namespace Pica {
|
namespace Pica {
|
||||||
|
|
||||||
ShaderUnit::ShaderUnit(GeometryEmitter* emitter) : emitter_ptr{emitter} {
|
ShaderUnit::ShaderUnit(GeometryEmitter* emitter) : emitter_ptr{emitter} {}
|
||||||
const Common::Vec4<f24> temp_vec{f24::Zero(), f24::Zero(), f24::Zero(), f24::One()};
|
|
||||||
temporary.fill(temp_vec);
|
|
||||||
}
|
|
||||||
|
|
||||||
ShaderUnit::~ShaderUnit() = default;
|
ShaderUnit::~ShaderUnit() = default;
|
||||||
|
|
||||||
|
|
|
@ -46,11 +46,11 @@ struct ShaderUnit {
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
s32 address_registers[3];
|
s32 address_registers[3] = {};
|
||||||
bool conditional_code[2];
|
bool conditional_code[2] = {};
|
||||||
alignas(16) std::array<Common::Vec4<f24>, 16> input;
|
alignas(16) std::array<Common::Vec4<f24>, 16> input = {};
|
||||||
alignas(16) std::array<Common::Vec4<f24>, 16> temporary;
|
alignas(16) std::array<Common::Vec4<f24>, 16> temporary = {};
|
||||||
alignas(16) std::array<Common::Vec4<f24>, 16> output;
|
alignas(16) std::array<Common::Vec4<f24>, 16> output = {};
|
||||||
GeometryEmitter* emitter_ptr;
|
GeometryEmitter* emitter_ptr;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -52,9 +52,6 @@ static void RunInterpreter(const ShaderSetup& setup, ShaderUnit& state,
|
||||||
boost::circular_buffer<LoopStackElement> loop_stack(4);
|
boost::circular_buffer<LoopStackElement> loop_stack(4);
|
||||||
u32 program_counter = entry_point;
|
u32 program_counter = entry_point;
|
||||||
|
|
||||||
state.conditional_code[0] = false;
|
|
||||||
state.conditional_code[1] = false;
|
|
||||||
|
|
||||||
const auto do_if = [&](Instruction instr, bool condition) {
|
const auto do_if = [&](Instruction instr, bool condition) {
|
||||||
if (condition) {
|
if (condition) {
|
||||||
if_stack.push_back({
|
if_stack.push_back({
|
||||||
|
|
|
@ -386,28 +386,50 @@ void JitShader::Compile_SanitizedMul(QReg src1, QReg src2, QReg scratch0) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitShader::Compile_EvaluateCondition(Instruction instr) {
|
void JitShader::Compile_EvaluateCondition(Instruction instr) {
|
||||||
const u8 refx = instr.flow_control.refx.Value();
|
const bool refx = instr.flow_control.refx.Value();
|
||||||
const u8 refy = instr.flow_control.refy.Value();
|
const bool refy = instr.flow_control.refy.Value();
|
||||||
|
|
||||||
switch (instr.flow_control.op) {
|
switch (instr.flow_control.op) {
|
||||||
// Note: NXOR is used below to check for equality
|
// Note: NXOR is used below to check for equality
|
||||||
case Instruction::FlowControlType::Or:
|
case Instruction::FlowControlType::Or: {
|
||||||
EOR(XSCRATCH0, COND0, refx ^ 1);
|
XReg OpX = XSCRATCH0;
|
||||||
EOR(XSCRATCH1, COND1, refy ^ 1);
|
if (!refx) {
|
||||||
ORR(XSCRATCH0, XSCRATCH0, XSCRATCH1);
|
EOR(OpX, COND0, u8(refx) ^ 1);
|
||||||
|
} else {
|
||||||
|
OpX = COND0;
|
||||||
|
}
|
||||||
|
XReg OpY = XSCRATCH1;
|
||||||
|
if (!refy) {
|
||||||
|
EOR(OpY, COND1, u8(refy) ^ 1);
|
||||||
|
} else {
|
||||||
|
OpY = COND1;
|
||||||
|
}
|
||||||
|
ORR(XSCRATCH0, OpX, OpY);
|
||||||
CMP(XSCRATCH0, 0);
|
CMP(XSCRATCH0, 0);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
// Note: TST will AND two registers and set the EQ/NE flags on the result
|
// Note: TST will AND two registers and set the EQ/NE flags on the result
|
||||||
case Instruction::FlowControlType::And:
|
case Instruction::FlowControlType::And: {
|
||||||
EOR(XSCRATCH0, COND0, refx ^ 1);
|
XReg OpX = XSCRATCH0;
|
||||||
EOR(XSCRATCH1, COND1, refy ^ 1);
|
if (!refx) {
|
||||||
TST(XSCRATCH0, XSCRATCH1);
|
EOR(OpX, COND0, u8(refx) ^ 1);
|
||||||
|
} else {
|
||||||
|
OpX = COND0;
|
||||||
|
}
|
||||||
|
XReg OpY = XSCRATCH1;
|
||||||
|
if (!refy) {
|
||||||
|
EOR(OpY, COND1, u8(refy) ^ 1);
|
||||||
|
} else {
|
||||||
|
OpY = COND1;
|
||||||
|
}
|
||||||
|
TST(OpX, OpY);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case Instruction::FlowControlType::JustX:
|
case Instruction::FlowControlType::JustX:
|
||||||
CMP(COND0, refx);
|
CMP(COND0, u8(refx) ^ 1);
|
||||||
break;
|
break;
|
||||||
case Instruction::FlowControlType::JustY:
|
case Instruction::FlowControlType::JustY:
|
||||||
CMP(COND1, refy);
|
CMP(COND1, u8(refy) ^ 1);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
|
|
|
@ -401,29 +401,29 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
|
||||||
// Note: NXOR is used below to check for equality
|
// Note: NXOR is used below to check for equality
|
||||||
switch (instr.flow_control.op) {
|
switch (instr.flow_control.op) {
|
||||||
case Instruction::FlowControlType::Or:
|
case Instruction::FlowControlType::Or:
|
||||||
mov(eax, COND0);
|
mov(al, COND0.cvt8());
|
||||||
mov(ebx, COND1);
|
mov(bl, COND1.cvt8());
|
||||||
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
|
xor_(al, (instr.flow_control.refx.Value() ^ 1));
|
||||||
xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
|
xor_(bl, (instr.flow_control.refy.Value() ^ 1));
|
||||||
or_(eax, ebx);
|
or_(al, bl);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Instruction::FlowControlType::And:
|
case Instruction::FlowControlType::And:
|
||||||
mov(eax, COND0);
|
mov(al, COND0);
|
||||||
mov(ebx, COND1);
|
mov(bl, COND1);
|
||||||
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
|
xor_(al, (instr.flow_control.refx.Value() ^ 1));
|
||||||
xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
|
xor_(bl, (instr.flow_control.refy.Value() ^ 1));
|
||||||
and_(eax, ebx);
|
and_(al, bl);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Instruction::FlowControlType::JustX:
|
case Instruction::FlowControlType::JustX:
|
||||||
mov(eax, COND0);
|
mov(al, COND0);
|
||||||
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
|
xor_(al, (instr.flow_control.refx.Value() ^ 1));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Instruction::FlowControlType::JustY:
|
case Instruction::FlowControlType::JustY:
|
||||||
mov(eax, COND1);
|
mov(al, COND1);
|
||||||
xor_(eax, (instr.flow_control.refy.Value() ^ 1));
|
xor_(al, (instr.flow_control.refy.Value() ^ 1));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1002,8 +1002,8 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
|
||||||
mov(LOOPCOUNT_REG, dword[STATE + offsetof(ShaderUnit, address_registers[2])]);
|
mov(LOOPCOUNT_REG, dword[STATE + offsetof(ShaderUnit, address_registers[2])]);
|
||||||
|
|
||||||
// Load conditional code
|
// Load conditional code
|
||||||
mov(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]);
|
movzx(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]);
|
||||||
mov(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]);
|
movzx(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]);
|
||||||
|
|
||||||
// Used to set a register to one
|
// Used to set a register to one
|
||||||
static const __m128 one = {1.f, 1.f, 1.f, 1.f};
|
static const __m128 one = {1.f, 1.f, 1.f, 1.f};
|
||||||
|
|
Loading…
Reference in a new issue