shader: Fix address register offset behavior in x64 Jit (#6942)
* shader: Fix address register offset behavior in x64 Jit * shader: Remove redundant jump * tests: Add address register tests * shader: Remove additional pre-multiplications by 16 * tests: Add catch-stringifier for vec4f * tests: Format
This commit is contained in:
parent
1caf569f16
commit
ef43776c7b
2 changed files with 94 additions and 27 deletions
|
@ -28,6 +28,15 @@ static constexpr Common::Vec4f vec4_nan = Common::Vec4f::AssignToAll(NAN);
|
||||||
static constexpr Common::Vec4f vec4_one = Common::Vec4f::AssignToAll(1.0f);
|
static constexpr Common::Vec4f vec4_one = Common::Vec4f::AssignToAll(1.0f);
|
||||||
static constexpr Common::Vec4f vec4_zero = Common::Vec4f::AssignToAll(0.0f);
|
static constexpr Common::Vec4f vec4_zero = Common::Vec4f::AssignToAll(0.0f);
|
||||||
|
|
||||||
|
namespace Catch {
|
||||||
|
template <>
|
||||||
|
struct StringMaker<Common::Vec4f> {
|
||||||
|
static std::string convert(Common::Vec4f value) {
|
||||||
|
return fmt::format("({}, {}, {}, {})", value.r(), value.g(), value.b(), value.a());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} // namespace Catch
|
||||||
|
|
||||||
static std::unique_ptr<Pica::Shader::ShaderSetup> CompileShaderSetup(
|
static std::unique_ptr<Pica::Shader::ShaderSetup> CompileShaderSetup(
|
||||||
std::initializer_list<nihstro::InlineAsm> code) {
|
std::initializer_list<nihstro::InlineAsm> code) {
|
||||||
const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code);
|
const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code);
|
||||||
|
@ -385,6 +394,56 @@ TEST_CASE("RSQ", "[video_core][shader][shader_jit]") {
|
||||||
REQUIRE(shader.Run({0.0625f}).x == Catch::Approx(4.0f).margin(0.004f));
|
REQUIRE(shader.Run({0.0625f}).x == Catch::Approx(4.0f).margin(0.004f));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("Address Register Offset", "[video_core][shader][shader_jit]") {
|
||||||
|
const auto sh_input = SourceRegister::MakeInput(0);
|
||||||
|
const auto sh_c40 = SourceRegister::MakeFloat(40);
|
||||||
|
const auto sh_output = DestRegister::MakeOutput(0);
|
||||||
|
|
||||||
|
auto shader = ShaderTest({
|
||||||
|
// mova a0.x, sh_input.x
|
||||||
|
{OpCode::Id::MOVA, DestRegister{}, "x", sh_input, "x", SourceRegister{}, "",
|
||||||
|
nihstro::InlineAsm::RelativeAddress::A1},
|
||||||
|
// mov sh_output.xyzw, c40[a0.x].xyzw
|
||||||
|
{OpCode::Id::MOV, sh_output, "xyzw", sh_c40, "xyzw", SourceRegister{}, "",
|
||||||
|
nihstro::InlineAsm::RelativeAddress::A1},
|
||||||
|
{OpCode::Id::END},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Prepare shader uniforms
|
||||||
|
const bool inverted = true;
|
||||||
|
std::array<Common::Vec4f, 96> f_uniforms;
|
||||||
|
for (u32 i = 0; i < 0x80; i++) {
|
||||||
|
if (i >= 0x00 && i < 0x60) {
|
||||||
|
const u32 base = inverted ? (0x60 - i) : i;
|
||||||
|
const auto color = (base * 2.f) / 255.0f;
|
||||||
|
const auto color_f24 = Pica::f24::FromFloat32(color);
|
||||||
|
shader.shader_setup->uniforms.f[i] = {color_f24, color_f24, color_f24,
|
||||||
|
Pica::f24::One()};
|
||||||
|
f_uniforms[i] = {color, color, color, 1.f};
|
||||||
|
} else if (i >= 0x60 && i < 0x70) {
|
||||||
|
const u8 color = static_cast<u8>((i - 0x60) * 0x10);
|
||||||
|
shader.shader_setup->uniforms.i[i - 0x60] = {color, color, color, 255};
|
||||||
|
} else if (i >= 0x70 && i < 0x80) {
|
||||||
|
shader.shader_setup->uniforms.b[i - 0x70] = i >= 0x78;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
REQUIRE(shader.Run(0.f) == f_uniforms[40]);
|
||||||
|
REQUIRE(shader.Run(13.f) == f_uniforms[53]);
|
||||||
|
REQUIRE(shader.Run(50.f) == f_uniforms[90]);
|
||||||
|
REQUIRE(shader.Run(60.f) == vec4_one);
|
||||||
|
REQUIRE(shader.Run(74.f) == vec4_one);
|
||||||
|
REQUIRE(shader.Run(87.f) == vec4_one);
|
||||||
|
REQUIRE(shader.Run(88.f) == f_uniforms[0]);
|
||||||
|
REQUIRE(shader.Run(128.f) == f_uniforms[40]);
|
||||||
|
REQUIRE(shader.Run(-40.f) == f_uniforms[0]);
|
||||||
|
REQUIRE(shader.Run(-42.f) == vec4_one);
|
||||||
|
REQUIRE(shader.Run(-70.f) == vec4_one);
|
||||||
|
REQUIRE(shader.Run(-73.f) == f_uniforms[95]);
|
||||||
|
REQUIRE(shader.Run(-127.f) == f_uniforms[41]);
|
||||||
|
REQUIRE(shader.Run(-129.f) == f_uniforms[40]);
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Requires fix from https://github.com/neobrain/nihstro/issues/68
|
// TODO: Requires fix from https://github.com/neobrain/nihstro/issues/68
|
||||||
// TEST_CASE("MAD", "[video_core][shader][shader_jit]") {
|
// TEST_CASE("MAD", "[video_core][shader][shader_jit]") {
|
||||||
// const auto sh_input1 = SourceRegister::MakeInput(0);
|
// const auto sh_input1 = SourceRegister::MakeInput(0);
|
||||||
|
|
|
@ -232,21 +232,45 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe
|
||||||
address_register_index = instr.common.address_register_index;
|
address_register_index = instr.common.address_register_index;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src_num == offset_src && address_register_index != 0) {
|
if (src_reg.GetRegisterType() == RegisterType::FloatUniform && src_num == offset_src &&
|
||||||
|
address_register_index != 0) {
|
||||||
|
Xbyak::Reg64 address_reg;
|
||||||
switch (address_register_index) {
|
switch (address_register_index) {
|
||||||
case 1: // address offset 1
|
case 1:
|
||||||
movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
|
address_reg = ADDROFFS_REG_0;
|
||||||
break;
|
break;
|
||||||
case 2: // address offset 2
|
case 2:
|
||||||
movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
|
address_reg = ADDROFFS_REG_1;
|
||||||
break;
|
break;
|
||||||
case 3: // address offset 3
|
case 3:
|
||||||
movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
|
address_reg = LOOPCOUNT_REG.cvt64();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
// s32 offset = address_reg >= -128 && address_reg <= 127 ? address_reg : 0;
|
||||||
|
// u32 index = (src_reg.GetIndex() + offset) & 0x7f;
|
||||||
|
|
||||||
|
// First we add 128 to address_reg so the first comparison is turned to
|
||||||
|
// address_reg >= 0 && address_reg < 256 which can be performed with
|
||||||
|
// a single unsigned comparison (cmovb)
|
||||||
|
lea(eax, ptr[address_reg + 128]);
|
||||||
|
mov(ebx, src_reg.GetIndex());
|
||||||
|
mov(ecx, address_reg.cvt32());
|
||||||
|
add(ecx, ebx);
|
||||||
|
cmp(eax, 256);
|
||||||
|
cmovb(ebx, ecx);
|
||||||
|
and_(ebx, 0x7f);
|
||||||
|
|
||||||
|
// index > 95 ? vec4(1.0) : uniforms.f[index];
|
||||||
|
movaps(dest, ONE);
|
||||||
|
cmp(ebx, 95);
|
||||||
|
Label load_end;
|
||||||
|
jg(load_end);
|
||||||
|
shl(rbx, 4);
|
||||||
|
movaps(dest, xword[src_ptr + rbx]);
|
||||||
|
L(load_end);
|
||||||
} else {
|
} else {
|
||||||
// Load the source
|
// Load the source
|
||||||
movaps(dest, xword[src_ptr + src_offset_disp]);
|
movaps(dest, xword[src_ptr + src_offset_disp]);
|
||||||
|
@ -590,24 +614,14 @@ void JitShader::Compile_MOVA(Instruction instr) {
|
||||||
// Move and sign-extend high 32 bits
|
// Move and sign-extend high 32 bits
|
||||||
shr(rax, 32);
|
shr(rax, 32);
|
||||||
movsxd(ADDROFFS_REG_1, eax);
|
movsxd(ADDROFFS_REG_1, eax);
|
||||||
|
|
||||||
// Multiply by 16 to be used as an offset later
|
|
||||||
shl(ADDROFFS_REG_0, 4);
|
|
||||||
shl(ADDROFFS_REG_1, 4);
|
|
||||||
} else {
|
} else {
|
||||||
if (swiz.DestComponentEnabled(0)) {
|
if (swiz.DestComponentEnabled(0)) {
|
||||||
// Move and sign-extend low 32 bits
|
// Move and sign-extend low 32 bits
|
||||||
movsxd(ADDROFFS_REG_0, eax);
|
movsxd(ADDROFFS_REG_0, eax);
|
||||||
|
|
||||||
// Multiply by 16 to be used as an offset later
|
|
||||||
shl(ADDROFFS_REG_0, 4);
|
|
||||||
} else if (swiz.DestComponentEnabled(1)) {
|
} else if (swiz.DestComponentEnabled(1)) {
|
||||||
// Move and sign-extend high 32 bits
|
// Move and sign-extend high 32 bits
|
||||||
shr(rax, 32);
|
shr(rax, 32);
|
||||||
movsxd(ADDROFFS_REG_1, eax);
|
movsxd(ADDROFFS_REG_1, eax);
|
||||||
|
|
||||||
// Multiply by 16 to be used as an offset later
|
|
||||||
shl(ADDROFFS_REG_1, 4);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -659,9 +673,6 @@ void JitShader::Compile_END(Instruction instr) {
|
||||||
mov(byte[STATE + offsetof(UnitState, conditional_code[1])], COND1.cvt8());
|
mov(byte[STATE + offsetof(UnitState, conditional_code[1])], COND1.cvt8());
|
||||||
|
|
||||||
// Save address/loop registers
|
// Save address/loop registers
|
||||||
sar(ADDROFFS_REG_0, 4);
|
|
||||||
sar(ADDROFFS_REG_1, 4);
|
|
||||||
sar(LOOPCOUNT_REG, 4);
|
|
||||||
mov(dword[STATE + offsetof(UnitState, address_registers[0])], ADDROFFS_REG_0.cvt32());
|
mov(dword[STATE + offsetof(UnitState, address_registers[0])], ADDROFFS_REG_0.cvt32());
|
||||||
mov(dword[STATE + offsetof(UnitState, address_registers[1])], ADDROFFS_REG_1.cvt32());
|
mov(dword[STATE + offsetof(UnitState, address_registers[1])], ADDROFFS_REG_1.cvt32());
|
||||||
mov(dword[STATE + offsetof(UnitState, address_registers[2])], LOOPCOUNT_REG);
|
mov(dword[STATE + offsetof(UnitState, address_registers[2])], LOOPCOUNT_REG);
|
||||||
|
@ -813,11 +824,11 @@ void JitShader::Compile_LOOP(Instruction instr) {
|
||||||
std::size_t offset = Uniforms::GetIntUniformOffset(instr.flow_control.int_uniform_id);
|
std::size_t offset = Uniforms::GetIntUniformOffset(instr.flow_control.int_uniform_id);
|
||||||
mov(LOOPCOUNT, dword[UNIFORMS + offset]);
|
mov(LOOPCOUNT, dword[UNIFORMS + offset]);
|
||||||
mov(LOOPCOUNT_REG, LOOPCOUNT);
|
mov(LOOPCOUNT_REG, LOOPCOUNT);
|
||||||
shr(LOOPCOUNT_REG, 4);
|
shr(LOOPCOUNT_REG, 8);
|
||||||
and_(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
|
and_(LOOPCOUNT_REG, 0xFF); // Y-component is the start
|
||||||
mov(LOOPINC, LOOPCOUNT);
|
mov(LOOPINC, LOOPCOUNT);
|
||||||
shr(LOOPINC, 12);
|
shr(LOOPINC, 16);
|
||||||
and_(LOOPINC, 0xFF0); // Z-component is the incrementer
|
and_(LOOPINC, 0xFF); // Z-component is the incrementer
|
||||||
movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
|
movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
|
||||||
add(LOOPCOUNT, 1); // Iteration count is X-component + 1
|
add(LOOPCOUNT, 1); // Iteration count is X-component + 1
|
||||||
|
|
||||||
|
@ -993,9 +1004,6 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
|
||||||
movsxd(ADDROFFS_REG_0, dword[STATE + offsetof(UnitState, address_registers[0])]);
|
movsxd(ADDROFFS_REG_0, dword[STATE + offsetof(UnitState, address_registers[0])]);
|
||||||
movsxd(ADDROFFS_REG_1, dword[STATE + offsetof(UnitState, address_registers[1])]);
|
movsxd(ADDROFFS_REG_1, dword[STATE + offsetof(UnitState, address_registers[1])]);
|
||||||
mov(LOOPCOUNT_REG, dword[STATE + offsetof(UnitState, address_registers[2])]);
|
mov(LOOPCOUNT_REG, dword[STATE + offsetof(UnitState, address_registers[2])]);
|
||||||
shl(ADDROFFS_REG_0, 4);
|
|
||||||
shl(ADDROFFS_REG_1, 4);
|
|
||||||
shl(LOOPCOUNT_REG, 4);
|
|
||||||
|
|
||||||
// Load conditional code
|
// Load conditional code
|
||||||
mov(COND0, byte[STATE + offsetof(UnitState, conditional_code[0])]);
|
mov(COND0, byte[STATE + offsetof(UnitState, conditional_code[0])]);
|
||||||
|
|
Loading…
Reference in a new issue