Implement UADD8
This commit is contained in:
parent
7cad6949e7
commit
1a1646d962
5 changed files with 77 additions and 10 deletions
|
@ -1242,22 +1242,51 @@ void EmitX64::EmitByteReverseDual(IR::Block&, IR::Inst* inst) {
|
||||||
code->bswap(result);
|
code->bswap(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
|
void EmitX64::EmitPackedAddU8(IR::Block& block, IR::Inst* inst) {
|
||||||
|
auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
|
||||||
|
|
||||||
IR::Value a = inst->GetArg(0);
|
IR::Value a = inst->GetArg(0);
|
||||||
IR::Value b = inst->GetArg(1);
|
IR::Value b = inst->GetArg(1);
|
||||||
|
|
||||||
Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
|
Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(a).cvt32();
|
||||||
Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32();
|
Xbyak::Reg32 reg_b = reg_alloc.UseScratchGpr(b).cvt32();
|
||||||
|
Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
|
||||||
|
Xbyak::Reg32 reg_ge, tmp;
|
||||||
|
|
||||||
Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm();
|
if (ge_inst) {
|
||||||
Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm();
|
EraseInstruction(block, ge_inst);
|
||||||
|
inst->DecrementRemainingUses();
|
||||||
|
|
||||||
code->movd(xmm_scratch_a, result);
|
reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
|
||||||
code->movd(xmm_scratch_b, arg);
|
tmp = reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
(code->*fn)(xmm_scratch_a, xmm_scratch_b);
|
code->mov(reg_ge, reg_a);
|
||||||
|
code->and_(reg_ge, reg_b);
|
||||||
|
}
|
||||||
|
|
||||||
code->movd(result, xmm_scratch_a);
|
// SWAR Arithmetic
|
||||||
|
code->mov(result, reg_a);
|
||||||
|
code->xor_(result, reg_b);
|
||||||
|
code->and_(result, 0x80808080);
|
||||||
|
code->and_(reg_a, 0x7F7F7F7F);
|
||||||
|
code->and_(reg_b, 0x7F7F7F7F);
|
||||||
|
code->add(reg_a, reg_b);
|
||||||
|
if (ge_inst) {
|
||||||
|
code->mov(tmp, result);
|
||||||
|
code->and_(tmp, reg_a);
|
||||||
|
code->or_(reg_ge, tmp);
|
||||||
|
}
|
||||||
|
code->xor_(result, reg_a);
|
||||||
|
if (ge_inst) {
|
||||||
|
if (cpu_info.has(Xbyak::util::Cpu::tBMI2)) {
|
||||||
|
code->mov(tmp, 0x80808080);
|
||||||
|
code->pext(reg_ge, reg_ge, tmp);
|
||||||
|
} else {
|
||||||
|
code->and_(reg_ge, 0x80808080);
|
||||||
|
code->imul(reg_ge, reg_ge, 0x0204081);
|
||||||
|
code->shr(reg_ge, 28);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitPackedHalvingAddU8(IR::Block& block, IR::Inst* inst) {
|
void EmitX64::EmitPackedHalvingAddU8(IR::Block& block, IR::Inst* inst) {
|
||||||
|
@ -1461,6 +1490,24 @@ void EmitX64::EmitPackedHalvingSubU16(IR::Block& block, IR::Inst* inst) {
|
||||||
// minuend now contains the desired result.
|
// minuend now contains the desired result.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
|
||||||
|
IR::Value a = inst->GetArg(0);
|
||||||
|
IR::Value b = inst->GetArg(1);
|
||||||
|
|
||||||
|
Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
|
||||||
|
Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32();
|
||||||
|
|
||||||
|
Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm();
|
||||||
|
Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
code->movd(xmm_scratch_a, result);
|
||||||
|
code->movd(xmm_scratch_b, arg);
|
||||||
|
|
||||||
|
(code->*fn)(xmm_scratch_a, xmm_scratch_b);
|
||||||
|
|
||||||
|
code->movd(result, xmm_scratch_a);
|
||||||
|
}
|
||||||
|
|
||||||
void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) {
|
void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) {
|
||||||
EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::paddusb);
|
EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::paddusb);
|
||||||
}
|
}
|
||||||
|
|
|
@ -324,6 +324,12 @@ Value IREmitter::ByteReverseDual(const Value& a) {
|
||||||
return Inst(Opcode::ByteReverseDual, {a});
|
return Inst(Opcode::ByteReverseDual, {a});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
IREmitter::ResultAndGE IREmitter::PackedAddU8(const Value& a, const Value& b) {
|
||||||
|
auto result = Inst(Opcode::PackedAddU8, {a, b});
|
||||||
|
auto ge = Inst(Opcode::GetGEFromOp, {result});
|
||||||
|
return {result, ge};
|
||||||
|
}
|
||||||
|
|
||||||
Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
|
Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
|
||||||
return Inst(Opcode::PackedHalvingAddU8, {a, b});
|
return Inst(Opcode::PackedHalvingAddU8, {a, b});
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,6 +49,11 @@ public:
|
||||||
Value overflow;
|
Value overflow;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ResultAndGE {
|
||||||
|
Value result;
|
||||||
|
Value ge;
|
||||||
|
};
|
||||||
|
|
||||||
void Unimplemented();
|
void Unimplemented();
|
||||||
u32 PC();
|
u32 PC();
|
||||||
u32 AlignPC(size_t alignment);
|
u32 AlignPC(size_t alignment);
|
||||||
|
@ -122,6 +127,7 @@ public:
|
||||||
Value ByteReverseWord(const Value& a);
|
Value ByteReverseWord(const Value& a);
|
||||||
Value ByteReverseHalf(const Value& a);
|
Value ByteReverseHalf(const Value& a);
|
||||||
Value ByteReverseDual(const Value& a);
|
Value ByteReverseDual(const Value& a);
|
||||||
|
ResultAndGE PackedAddU8(const Value& a, const Value& b);
|
||||||
Value PackedHalvingAddU8(const Value& a, const Value& b);
|
Value PackedHalvingAddU8(const Value& a, const Value& b);
|
||||||
Value PackedHalvingAddS8(const Value& a, const Value& b);
|
Value PackedHalvingAddS8(const Value& a, const Value& b);
|
||||||
Value PackedHalvingSubU8(const Value& a, const Value& b);
|
Value PackedHalvingSubU8(const Value& a, const Value& b);
|
||||||
|
|
|
@ -72,6 +72,7 @@ OPCODE(ZeroExtendByteToWord, T::U32, T::U8
|
||||||
OPCODE(ByteReverseWord, T::U32, T::U32 )
|
OPCODE(ByteReverseWord, T::U32, T::U32 )
|
||||||
OPCODE(ByteReverseHalf, T::U16, T::U16 )
|
OPCODE(ByteReverseHalf, T::U16, T::U16 )
|
||||||
OPCODE(ByteReverseDual, T::U64, T::U64 )
|
OPCODE(ByteReverseDual, T::U64, T::U64 )
|
||||||
|
OPCODE(PackedAddU8, T::U32, T::U32, T::U32 )
|
||||||
OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 )
|
OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 )
|
||||||
OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 )
|
OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 )
|
||||||
OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 )
|
OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 )
|
||||||
|
|
|
@ -35,7 +35,14 @@ bool ArmTranslatorVisitor::arm_SSUB16(Cond cond, Reg n, Reg d, Reg m) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ArmTranslatorVisitor::arm_UADD8(Cond cond, Reg n, Reg d, Reg m) {
|
bool ArmTranslatorVisitor::arm_UADD8(Cond cond, Reg n, Reg d, Reg m) {
|
||||||
return InterpretThisInstruction();
|
if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
|
||||||
|
return UnpredictableInstruction();
|
||||||
|
if (ConditionPassed(cond)) {
|
||||||
|
auto result = ir.PackedAddU8(ir.GetRegister(n), ir.GetRegister(m));
|
||||||
|
ir.SetRegister(d, result.result);
|
||||||
|
ir.SetGEFlags(result.ge);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ArmTranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
|
bool ArmTranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
|
||||||
|
|
Loading…
Reference in a new issue