IR: Implement FPVector{To,From}Half32

Implement ASIMD VCVT (half) in terms of this instruction.
Correct handling of ASIMDStandardValue.
This commit is contained in:
MerryMage 2021-06-05 03:39:48 +01:00
parent 9a23c09c3b
commit 828959caed
7 changed files with 124 additions and 28 deletions

View file

@ -5,8 +5,11 @@
#pragma once #pragma once
#include <optional>
#include "dynarmic/common/bit_util.h" #include "dynarmic/common/bit_util.h"
#include "dynarmic/common/common_types.h" #include "dynarmic/common/common_types.h"
#include "dynarmic/common/fp/rounding_mode.h"
namespace Dynarmic::Backend::X64 { namespace Dynarmic::Backend::X64 {
@ -84,4 +87,19 @@ constexpr u32 FixupLUT(FpFixup src_qnan = FpFixup::A,
return fixup_lut; return fixup_lut;
} }
constexpr std::optional<int> ConvertRoundingModeToX64Immediate(FP::RoundingMode rounding_mode) {
switch (rounding_mode) {
case FP::RoundingMode::ToNearest_TieEven:
return 0b00;
case FP::RoundingMode::TowardsPlusInfinity:
return 0b10;
case FP::RoundingMode::TowardsMinusInfinity:
return 0b01;
case FP::RoundingMode::TowardsZero:
return 0b11;
default:
return std::nullopt;
}
}
} // namespace Dynarmic::Backend::X64 } // namespace Dynarmic::Backend::X64

View file

@ -80,21 +80,6 @@ constexpr u64 f64_max_u64_lim = 0x43f0000000000000u; // 2^64 as a double (actua
} \ } \
} }
std::optional<int> ConvertRoundingModeToX64Immediate(FP::RoundingMode rounding_mode) {
switch (rounding_mode) {
case FP::RoundingMode::ToNearest_TieEven:
return 0b00;
case FP::RoundingMode::TowardsPlusInfinity:
return 0b10;
case FP::RoundingMode::TowardsMinusInfinity:
return 0b01;
case FP::RoundingMode::TowardsZero:
return 0b11;
default:
return std::nullopt;
}
}
template<size_t fsize> template<size_t fsize>
void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) { void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
if (ctx.FPCR().FZ()) { if (ctx.FPCR().FZ()) {

View file

@ -642,6 +642,49 @@ void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, a); ctx.reg_alloc.DefineValue(inst, a);
} }
void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
const bool fpcr_controlled = inst->GetArg(2).GetU1();
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
code.vcvtph2ps(result, value);
ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
ctx.reg_alloc.DefineValue(inst, result);
return;
}
using rounding_list = mp::list<
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
mp::lift_value<FP::RoundingMode::TowardsZero>,
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
static const auto lut = Common::GenerateLookupTableFromList(
[](auto arg) {
return std::pair{
mp::lower_to_tuple_v<decltype(arg)>,
Common::FptrCast(
[](VectorArray<u32>& output, const VectorArray<u16>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
constexpr auto t = mp::lower_to_tuple_v<decltype(arg)>;
constexpr FP::RoundingMode rounding_mode = std::get<0>(t);
for (size_t i = 0; i < output.size(); ++i) {
output[i] = FP::FPConvert<u32, u16>(input[i], fpcr, rounding_mode, fpsr);
}
})};
},
mp::cartesian_product<rounding_list>{});
EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
}
void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1607,6 +1650,53 @@ void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd); EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
} }
void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
const bool fpcr_controlled = inst->GetArg(2).GetU1();
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
code.vcvtps2ph(result, result, static_cast<u8>(*round_imm));
ctx.reg_alloc.DefineValue(inst, result);
return;
}
using rounding_list = mp::list<
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
mp::lift_value<FP::RoundingMode::TowardsZero>,
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
static const auto lut = Common::GenerateLookupTableFromList(
[](auto arg) {
return std::pair{
mp::lower_to_tuple_v<decltype(arg)>,
Common::FptrCast(
[](VectorArray<u16>& output, const VectorArray<u32>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
constexpr auto t = mp::lower_to_tuple_v<decltype(arg)>;
constexpr FP::RoundingMode rounding_mode = std::get<0>(t);
for (size_t i = 0; i < output.size(); ++i) {
if (i < input.size()) {
output[i] = FP::FPConvert<u16, u32>(input[i], fpcr, rounding_mode, fpsr);
} else {
output[i] = 0;
}
}
})};
},
mp::cartesian_product<rounding_list>{});
EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
}
template<size_t fsize, bool unsigned_> template<size_t fsize, bool unsigned_>
void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>; using FPT = mp::unsigned_integer_of_size<fsize>;

View file

@ -620,24 +620,13 @@ bool TranslatorVisitor::asimd_VCVT_half(bool D, size_t sz, size_t Vd, bool half_
} }
const size_t esize = 8U << sz; const size_t esize = 8U << sz;
const size_t num_elements = 4;
const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven; // StandardFPSCRValue().RMode const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven; // StandardFPSCRValue().RMode
const auto d = ToVector(half_to_single, Vd, D); const auto d = ToVector(half_to_single, Vd, D);
const auto m = ToVector(!half_to_single, Vm, M); const auto m = ToVector(!half_to_single, Vm, M);
const auto operand = ir.GetVector(m); const auto operand = ir.GetVector(m);
IR::U128 result = ir.ZeroVector(); const IR::U128 result = half_to_single ? ir.FPVectorFromHalf(esize * 2, operand, rounding_mode, false)
for (size_t i = 0; i < num_elements; i++) { : ir.FPVectorToHalf(esize * 2, operand, rounding_mode, false);
if (half_to_single) {
const IR::U16 old_element = ir.VectorGetElement(esize, operand, i);
const IR::U32 new_element = ir.FPHalfToSingle(old_element, rounding_mode);
result = ir.VectorSetElement(esize * 2, result, i, new_element);
} else {
const IR::U32 old_element = ir.VectorGetElement(esize * 2, operand, i);
const IR::U16 new_element = ir.FPSingleToHalf(old_element, rounding_mode);
result = ir.VectorSetElement(esize, result, i, new_element);
}
}
ir.SetVector(d, result); ir.SetVector(d, result);
return true; return true;
} }

View file

@ -2404,6 +2404,11 @@ U128 IREmitter::FPVectorEqual(size_t esize, const U128& a, const U128& b, bool f
UNREACHABLE(); UNREACHABLE();
} }
U128 IREmitter::FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) {
ASSERT(esize == 32);
return Inst<U128>(Opcode::FPVectorFromHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
}
U128 IREmitter::FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) { U128 IREmitter::FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
ASSERT(fbits <= esize); ASSERT(fbits <= esize);
switch (esize) { switch (esize) {
@ -2613,6 +2618,11 @@ U128 IREmitter::FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpc
UNREACHABLE(); UNREACHABLE();
} }
U128 IREmitter::FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) {
ASSERT(esize == 32);
return Inst<U128>(Opcode::FPVectorToHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
}
U128 IREmitter::FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) { U128 IREmitter::FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
ASSERT(fbits <= esize); ASSERT(fbits <= esize);

View file

@ -370,6 +370,7 @@ public:
U128 FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true);
U128 FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); U128 FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
@ -389,6 +390,7 @@ public:
U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled = true); U128 FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled = true);
U128 FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true);
U128 FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); U128 FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
U128 FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); U128 FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);

View file

@ -613,6 +613,7 @@ OPCODE(FPVectorDiv64, U128, U128
OPCODE(FPVectorEqual16, U128, U128, U128, U1 ) OPCODE(FPVectorEqual16, U128, U128, U128, U1 )
OPCODE(FPVectorEqual32, U128, U128, U128, U1 ) OPCODE(FPVectorEqual32, U128, U128, U128, U1 )
OPCODE(FPVectorEqual64, U128, U128, U128, U1 ) OPCODE(FPVectorEqual64, U128, U128, U128, U1 )
OPCODE(FPVectorFromHalf32, U128, U128, U8, U1 )
OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8, U1 ) OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8, U1 )
OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8, U1 ) OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8, U1 )
OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8, U1 ) OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8, U1 )
@ -658,6 +659,7 @@ OPCODE(FPVectorSqrt32, U128, U128
OPCODE(FPVectorSqrt64, U128, U128, U1 ) OPCODE(FPVectorSqrt64, U128, U128, U1 )
OPCODE(FPVectorSub32, U128, U128, U128, U1 ) OPCODE(FPVectorSub32, U128, U128, U128, U1 )
OPCODE(FPVectorSub64, U128, U128, U128, U1 ) OPCODE(FPVectorSub64, U128, U128, U128, U1 )
OPCODE(FPVectorToHalf32, U128, U128, U8, U1 )
OPCODE(FPVectorToSignedFixed16, U128, U128, U8, U8, U1 ) OPCODE(FPVectorToSignedFixed16, U128, U128, U8, U8, U1 )
OPCODE(FPVectorToSignedFixed32, U128, U128, U8, U8, U1 ) OPCODE(FPVectorToSignedFixed32, U128, U128, U8, U8, U1 )
OPCODE(FPVectorToSignedFixed64, U128, U128, U8, U8, U1 ) OPCODE(FPVectorToSignedFixed64, U128, U128, U8, U8, U1 )