diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index 56149d43..6fb70618 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -118,85 +118,96 @@ void EmitX64::EmitBreakpoint(RegAlloc&, IR::Block&, IR::Inst*) {
 }
 
 void EmitX64::EmitIdentity(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    if (!inst->GetArg(0).IsImmediate()) {
-        reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (!args[0].IsImmediate()) {
+        reg_alloc.DefineValue(inst, args[0]);
     }
 }
 
 void EmitX64::EmitGetRegister(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     Arm::Reg reg = inst->GetArg(0).GetRegRef();
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateReg(reg));
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitGetExtendedRegister32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
     ASSERT(Arm::IsSingleExtReg(reg));
 
-    Xbyak::Xmm result = reg_alloc.DefXmm(inst);
+    Xbyak::Xmm result = reg_alloc.ScratchXmm();
     code->movss(result, MJitStateExtReg(reg));
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitGetExtendedRegister64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
     ASSERT(Arm::IsDoubleExtReg(reg));
-    Xbyak::Xmm result = reg_alloc.DefXmm(inst);
+
+    Xbyak::Xmm result = reg_alloc.ScratchXmm();
     code->movsd(result, MJitStateExtReg(reg));
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetRegister(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     Arm::Reg reg = inst->GetArg(0).GetRegRef();
-    IR::Value arg = inst->GetArg(1);
-    if (arg.IsImmediate()) {
-        code->mov(MJitStateReg(reg), arg.GetU32());
+    if (args[1].IsImmediate()) {
+        code->mov(MJitStateReg(reg), args[1].GetImmediateU32());
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseGpr(args[1]).cvt32();
         code->mov(MJitStateReg(reg), to_store);
     }
 }
 
 void EmitX64::EmitSetExtendedRegister32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
     ASSERT(Arm::IsSingleExtReg(reg));
-    Xbyak::Xmm source = reg_alloc.UseXmm(inst->GetArg(1));
+    Xbyak::Xmm source = reg_alloc.UseXmm(args[1]);
     code->movss(MJitStateExtReg(reg), source);
 }
 
 void EmitX64::EmitSetExtendedRegister64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
     ASSERT(Arm::IsDoubleExtReg(reg));
-    Xbyak::Xmm source = reg_alloc.UseXmm(inst->GetArg(1));
+    Xbyak::Xmm source = reg_alloc.UseXmm(args[1]);
     code->movsd(MJitStateExtReg(reg), source);
 }
 
 void EmitX64::EmitGetCpsr(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateCpsr());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetCpsr(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 arg = reg_alloc.UseGpr(inst->GetArg(0)).cvt32();
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg32 arg = reg_alloc.UseGpr(args[0]).cvt32();
     code->mov(MJitStateCpsr(), arg);
 }
 
 void EmitX64::EmitGetNFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateCpsr());
     code->shr(result, 31);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetNFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     constexpr size_t flag_bit = 31;
     constexpr u32 flag_mask = 1u << flag_bit;
-    IR::Value arg = inst->GetArg(0);
-    if (arg.IsImmediate()) {
-        if (arg.GetU1()) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        if (args[0].GetImmediateU1()) {
             code->or_(MJitStateCpsr(), flag_mask);
         } else {
             code->and_(MJitStateCpsr(), ~flag_mask);
         }
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
         code->shl(to_store, flag_bit);
         code->and_(MJitStateCpsr(), ~flag_mask);
@@ -205,24 +216,25 @@ void EmitX64::EmitSetNFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitGetZFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateCpsr());
     code->shr(result, 30);
     code->and_(result, 1);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetZFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     constexpr size_t flag_bit = 30;
     constexpr u32 flag_mask = 1u << flag_bit;
-    IR::Value arg = inst->GetArg(0);
-    if (arg.IsImmediate()) {
-        if (arg.GetU1()) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        if (args[0].GetImmediateU1()) {
            code->or_(MJitStateCpsr(), flag_mask);
         } else {
             code->and_(MJitStateCpsr(), ~flag_mask);
         }
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
         code->shl(to_store, flag_bit);
         code->and_(MJitStateCpsr(), ~flag_mask);
@@ -231,24 +243,25 @@ void EmitX64::EmitSetZFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitGetCFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateCpsr());
     code->shr(result, 29);
     code->and_(result, 1);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetCFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     constexpr size_t flag_bit = 29;
     constexpr u32 flag_mask = 1u << flag_bit;
-    IR::Value arg = inst->GetArg(0);
-    if (arg.IsImmediate()) {
-        if (arg.GetU1()) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        if (args[0].GetImmediateU1()) {
             code->or_(MJitStateCpsr(), flag_mask);
         } else {
             code->and_(MJitStateCpsr(), ~flag_mask);
         }
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
         code->shl(to_store, flag_bit);
         code->and_(MJitStateCpsr(), ~flag_mask);
@@ -257,24 +270,25 @@ void EmitX64::EmitSetCFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitGetVFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateCpsr());
     code->shr(result, 28);
     code->and_(result, 1);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetVFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     constexpr size_t flag_bit = 28;
     constexpr u32 flag_mask = 1u << flag_bit;
-    IR::Value arg = inst->GetArg(0);
-    if (arg.IsImmediate()) {
-        if (arg.GetU1()) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        if (args[0].GetImmediateU1()) {
             code->or_(MJitStateCpsr(), flag_mask);
         } else {
             code->and_(MJitStateCpsr(), ~flag_mask);
         }
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
         code->shl(to_store, flag_bit);
         code->and_(MJitStateCpsr(), ~flag_mask);
@@ -285,12 +299,12 @@ void EmitX64::EmitSetVFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 void EmitX64::EmitOrQFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     constexpr size_t flag_bit = 27;
     constexpr u32 flag_mask = 1u << flag_bit;
-    IR::Value arg = inst->GetArg(0);
-    if (arg.IsImmediate()) {
-        if (arg.GetU1())
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        if (args[0].GetImmediateU1())
             code->or_(MJitStateCpsr(), flag_mask);
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
         code->shl(to_store, flag_bit);
         code->or_(MJitStateCpsr(), to_store);
@@ -298,22 +312,23 @@ void EmitX64::EmitOrQFlag(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitGetGEFlags(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, MJitStateCpsr());
     code->shr(result, 16);
     code->and_(result, 0xF);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetGEFlags(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     constexpr size_t flag_bit = 16;
     constexpr u32 flag_mask = 0xFu << flag_bit;
-    IR::Value arg = inst->GetArg(0);
-    if (arg.IsImmediate()) {
-        u32 imm = (arg.GetU32() << flag_bit) & flag_mask;
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        u32 imm = (args[0].GetImmediateU32() << flag_bit) & flag_mask;
         code->and_(MJitStateCpsr(), ~flag_mask);
         code->or_(MJitStateCpsr(), imm);
     } else {
-        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(arg).cvt32();
+        Xbyak::Reg32 to_store = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
         code->shl(to_store, flag_bit);
         code->and_(to_store, flag_mask);
@@ -323,8 +338,10 @@ void EmitX64::EmitSetGEFlags(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitBXWritePC(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& arg = args[0];
+
     const u32 T_bit = 1 << 5;
-    auto arg = inst->GetArg(0);
 
     // Pseudocode:
     // if (new_pc & 1) {
@@ -336,7 +353,7 @@ void EmitX64::EmitBXWritePC(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     // }
 
     if (arg.IsImmediate()) {
-        u32 new_pc = arg.GetU32();
+        u32 new_pc = arg.GetImmediateU32();
         if (Common::Bit<0>(new_pc)) {
             new_pc &= 0xFFFFFFFE;
             code->mov(MJitStateReg(Arm::Reg::PC), new_pc);
@@ -368,9 +385,8 @@ void EmitX64::EmitBXWritePC(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitCallSupervisor(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    auto imm32 = inst->GetArg(0);
-
-    reg_alloc.HostCall(nullptr, imm32);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    reg_alloc.HostCall(nullptr, args[0]);
 
     code->SwitchMxcsrOnExit();
     code->CallFunction(cb.CallSVC);
@@ -395,9 +411,8 @@ static void SetFpscrImpl(u32 value, JitState* jit_state) {
 }
 
 void EmitX64::EmitSetFpscr(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    auto a = inst->GetArg(0);
-
-    reg_alloc.HostCall(nullptr, a);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    reg_alloc.HostCall(nullptr, args[0]);
 
     code->mov(code->ABI_PARAM2, code->r15);
     code->SwitchMxcsrOnExit();
@@ -408,15 +423,16 @@ void EmitX64::EmitSetFpscr(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
 void EmitX64::EmitGetFpscrNZCV(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     using namespace Xbyak::util;
 
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
-
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
     code->mov(result, dword[r15 + offsetof(JitState, FPSCR_nzcv)]);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSetFpscrNZCV(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     using namespace Xbyak::util;
 
-    Xbyak::Reg32 value = reg_alloc.UseGpr(inst->GetArg(0)).cvt32();
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg32 value = reg_alloc.UseGpr(args[0]).cvt32();
 
     code->mov(dword[r15 + offsetof(JitState, FPSCR_nzcv)], value);
 }
@@ -424,8 +440,9 @@ void EmitX64::EmitSetFpscrNZCV(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst)
 void EmitX64::EmitPushRSB(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     using namespace Xbyak::util;
 
-    ASSERT(inst->GetArg(0).IsImmediate());
-    u64 unique_hash_of_target = inst->GetArg(0).GetU64();
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    ASSERT(args[0].IsImmediate());
+    u64 unique_hash_of_target = args[0].GetImmediateU64();
 
     auto iter = block_descriptors.find(unique_hash_of_target);
     CodePtr target_code_ptr = iter != block_descriptors.end()
@@ -470,121 +487,119 @@ void EmitX64::EmitGetGEFromOp(RegAlloc&, IR::Block&, IR::Inst*) {
 }
 
 void EmitX64::EmitPack2x32To1x64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg lo;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        lo = result.cvt32();
-    } else {
-        std::tie(lo, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-    lo.setBit(32);
-    Xbyak::Reg64 hi = reg_alloc.UseScratchGpr(inst->GetArg(1));
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 lo = reg_alloc.UseScratchGpr(args[0]);
+    Xbyak::Reg64 hi = reg_alloc.UseScratchGpr(args[1]);
 
     code->shl(hi, 32);
-    code->mov(result.cvt32(), *lo); // Zero extend to 64-bits
-    code->or_(result, hi);
+    code->mov(lo.cvt32(), lo.cvt32()); // Zero extend to 64-bits
+    code->or_(lo, hi);
+
+    reg_alloc.DefineValue(inst, lo);
 }
 
 void EmitX64::EmitLeastSignificantWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    reg_alloc.DefineValue(inst, args[0]);
 }
 
 void EmitX64::EmitMostSignificantWord(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
-    auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
     code->shr(result, 32);
+    reg_alloc.DefineValue(inst, result);
 
+    auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
     if (carry_inst) {
         EraseInstruction(block, carry_inst);
-
-        Xbyak::Reg64 carry = reg_alloc.DefGpr(carry_inst);
-
+        Xbyak::Reg64 carry = reg_alloc.ScratchGpr();
         code->setc(carry.cvt8());
+        reg_alloc.DefineValue(carry_inst, carry);
     }
 }
 
 void EmitX64::EmitLeastSignificantHalf(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    reg_alloc.DefineValue(inst, args[0]);
 }
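An aside on EmitPack2x32To1x64 above: the seemingly redundant mov(lo.cvt32(), lo.cvt32()) works because x64 zero-extends any write to a 32-bit register into the full 64-bit register. A minimal scalar sketch of the packing being emitted (plain C++, illustrative only, not part of the patch):

    #include <cstdint>

    // Pack two 32-bit values into one 64-bit value, mirroring the emitted
    // shl/mov/or sequence: hi lands in the top half, lo is zero-extended.
    uint64_t Pack2x32To1x64(uint32_t lo, uint32_t hi) {
        return (uint64_t(hi) << 32) | uint64_t(lo);
    }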
 
 void EmitX64::EmitLeastSignificantByte(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    reg_alloc.DefineValue(inst, args[0]);
 }
 
 void EmitX64::EmitMostSignificantBit(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
     // TODO: Flag optimization
-
     code->shr(result, 31);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitIsZero(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
    // TODO: Flag optimization
-
     code->test(result, result);
     code->sete(result.cvt8());
     code->movzx(result, result.cvt8());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitIsZero64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
     // TODO: Flag optimization
-
     code->test(result, result);
     code->sete(result.cvt8());
     code->movzx(result, result.cvt8());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitLogicalShiftLeft(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
 
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& operand_arg = args[0];
+    auto& shift_arg = args[1];
+    auto& carry_arg = args[2];
+
     // TODO: Consider using BMI2 instructions like SHLX when arm-in-host flags is implemented.
 
     if (!carry_inst) {
-        if (!inst->GetArg(2).IsImmediate()) {
-            inst->GetArg(2).GetInst()->DecrementRemainingUses();
-        }
-
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            u8 shift = shift_arg.GetU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            u8 shift = shift_arg.GetImmediateU8();
 
             if (shift <= 31) {
                 code->shl(result, shift);
             } else {
                 code->xor_(result, result);
             }
+
+            reg_alloc.DefineValue(inst, result);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
+            reg_alloc.Use(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
             Xbyak::Reg32 zero = reg_alloc.ScratchGpr().cvt32();
 
             // The 32-bit x64 SHL instruction masks the shift count by 0x1F before performing the shift.
             // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
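To make the comment above concrete: ARM uses the full bottom byte of the count register, so counts of 32 to 255 must yield zero, while x64's 32-bit SHL silently masks the count with 0x1F; the cmp/cmovnb pair below patches up that difference. A scalar sketch of the target semantics (plain C++, illustrative only, not part of the patch):

    #include <cstdint>

    // ARM LSL by a register-specified count, no carry: counts above 31
    // produce 0 instead of wrapping the way x64 SHL would.
    uint32_t ArmLSL(uint32_t value, uint8_t count) {
        return count <= 31 ? value << count : 0;
    }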
-            code->shl(result, shift);
+            code->shl(result, code->cl);
             code->xor_(zero, zero);
-            code->cmp(shift, 32);
+            code->cmp(code->cl, 32);
             code->cmovnb(result, zero);
+
+            reg_alloc.DefineValue(inst, result);
         }
     } else {
         EraseInstruction(block, carry_inst);
 
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            u8 shift = shift_arg.GetU8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg32 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt32();
+            u8 shift = shift_arg.GetImmediateU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg32 carry = reg_alloc.UseScratchGpr(carry_arg).cvt32();
 
             if (shift == 0) {
                 // There is nothing more to do.
@@ -600,21 +615,24 @@ void EmitX64::EmitLogicalShiftLeft(RegAlloc& reg_alloc, IR::Block& block, IR::In
                 code->xor_(result, result);
                 code->and_(carry, 1);
             }
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg32 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt32();
+            reg_alloc.Use(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg32 carry = reg_alloc.UseScratchGpr(carry_arg).cvt32();
 
             // TODO: Optimize this.
 
             code->inLocalLabel();
 
-            code->cmp(shift, 32);
+            code->cmp(code->cl, 32);
             code->ja(".Rs_gt32");
             code->je(".Rs_eq32");
             // if (Rs & 0xFF < 32) {
             code->bt(carry.cvt32(), 0); // Set the carry flag for correct behaviour in the case when Rs & 0xFF == 0
-            code->shl(result, shift);
+            code->shl(result, code->cl);
             code->setc(carry.cvt8());
             code->jmp(".end");
             // } else if (Rs & 0xFF > 32) {
@@ -631,6 +649,9 @@ void EmitX64::EmitLogicalShiftLeft(RegAlloc& reg_alloc, IR::Block& block, IR::In
 
             code->L(".end");
             code->outLocalLabel();
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         }
     }
 }
@@ -638,44 +659,45 @@ void EmitX64::EmitLogicalShiftLeft(RegAlloc& reg_alloc, IR::Block& block, IR::In
 void EmitX64::EmitLogicalShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
 
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& operand_arg = args[0];
+    auto& shift_arg = args[1];
+    auto& carry_arg = args[2];
+
     if (!carry_inst) {
-        if (!inst->GetArg(2).IsImmediate()) {
-            inst->GetArg(2).GetInst()->DecrementRemainingUses();
-        }
-
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            u8 shift = shift_arg.GetU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            u8 shift = shift_arg.GetImmediateU8();
 
             if (shift <= 31) {
                 code->shr(result, shift);
             } else {
                 code->xor_(result, result);
             }
+
+            reg_alloc.DefineValue(inst, result);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
+            reg_alloc.Use(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
             Xbyak::Reg32 zero = reg_alloc.ScratchGpr().cvt32();
 
             // The 32-bit x64 SHR instruction masks the shift count by 0x1F before performing the shift.
             // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
-            code->shr(result, shift);
+            code->shr(result, code->cl);
             code->xor_(zero, zero);
-            code->cmp(shift, 32);
+            code->cmp(code->cl, 32);
             code->cmovnb(result, zero);
+
+            reg_alloc.DefineValue(inst, result);
         }
     } else {
         EraseInstruction(block, carry_inst);
 
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            u8 shift = shift_arg.GetU8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg32 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt32();
+            u8 shift = shift_arg.GetImmediateU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg32 carry = reg_alloc.UseScratchGpr(carry_arg).cvt32();
 
             if (shift == 0) {
                 // There is nothing more to do.
@@ -690,23 +712,26 @@ void EmitX64::EmitLogicalShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR::I
                 code->xor_(result, result);
                 code->xor_(carry, carry);
             }
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg32 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt32();
+            reg_alloc.Use(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg32 carry = reg_alloc.UseScratchGpr(carry_arg).cvt32();
 
             // TODO: Optimize this.
 
             code->inLocalLabel();
 
-            code->cmp(shift, 32);
+            code->cmp(code->cl, 32);
             code->ja(".Rs_gt32");
             code->je(".Rs_eq32");
             // if (Rs & 0xFF == 0) goto end;
-            code->test(shift, shift);
+            code->test(code->cl, code->cl);
             code->jz(".end");
             // if (Rs & 0xFF < 32) {
-            code->shr(result, shift);
+            code->shr(result, code->cl);
             code->setc(carry.cvt8());
             code->jmp(".end");
             // } else if (Rs & 0xFF > 32) {
@@ -723,39 +748,48 @@ void EmitX64::EmitLogicalShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR::I
 
             code->L(".end");
             code->outLocalLabel();
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         }
     }
 }
 
 void EmitX64::EmitLogicalShiftRight64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& operand_arg = args[0];
+    auto& shift_arg = args[1];
 
-    auto shift_arg = inst->GetArg(1);
     ASSERT_MSG(shift_arg.IsImmediate(), "variable 64 bit shifts are not implemented");
-    u8 shift = shift_arg.GetU8();
-    ASSERT_MSG(shift < 64, "shift width clamping is not implemented");
+    ASSERT_MSG(shift_arg.GetImmediateU8() < 64, "shift width clamping is not implemented");
+
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(operand_arg);
+    u8 shift = shift_arg.GetImmediateU8();
 
     code->shr(result.cvt64(), shift);
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitArithmeticShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
 
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& operand_arg = args[0];
+    auto& shift_arg = args[1];
+    auto& carry_arg = args[2];
+
     if (!carry_inst) {
-        if (!inst->GetArg(2).IsImmediate()) {
-            inst->GetArg(2).GetInst()->DecrementRemainingUses();
-        }
-
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            u8 shift = shift_arg.GetU8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
+            u8 shift = shift_arg.GetImmediateU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
 
             code->sar(result, u8(shift < 31 ? shift : 31));
+
+            reg_alloc.DefineValue(inst, result);
         } else {
-            Xbyak::Reg32 shift = reg_alloc.UseScratchGpr(shift_arg, {HostLoc::RCX}).cvt32();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
+            reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
             Xbyak::Reg32 const31 = reg_alloc.ScratchGpr().cvt32();
 
             // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift.
@@ -763,20 +797,20 @@ void EmitX64::EmitArithmeticShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR
             // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
 
             code->mov(const31, 31);
-            code->movzx(shift, shift.cvt8());
-            code->cmp(shift, u32(31));
-            code->cmovg(shift, const31);
-            code->sar(result, shift.cvt8());
+            code->movzx(code->ecx, code->cl);
+            code->cmp(code->ecx, u32(31));
+            code->cmovg(code->ecx, const31);
+            code->sar(result, code->cl);
+
+            reg_alloc.DefineValue(inst, result);
         }
     } else {
         EraseInstruction(block, carry_inst);
 
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            u8 shift = shift_arg.GetU8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg8 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt8();
+            u8 shift = shift_arg.GetImmediateU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg8 carry = reg_alloc.UseScratchGpr(carry_arg).cvt8();
 
             if (shift == 0) {
                 // There is nothing more to do.
@@ -788,22 +822,25 @@ void EmitX64::EmitArithmeticShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR
                 code->bt(result, 31);
                 code->setc(carry);
             }
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg8 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt8();
+            reg_alloc.Use(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg8 carry = reg_alloc.UseScratchGpr(carry_arg).cvt8();
 
             // TODO: Optimize this.
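Before the labelled sequence below, it may help to state the ARM semantics it implements: a count of zero leaves value and carry untouched; counts of 1 to 31 shift arithmetically with the carry receiving the last bit shifted out; counts of 32 and above behave like 31, filling the result with the sign bit. A scalar sketch (plain C++, illustrative only, not part of the patch):

    #include <cstdint>

    // ARM ASR by a register-specified count, with carry-out.
    uint32_t ArmASR(uint32_t value, uint8_t count, bool& carry_out) {
        if (count == 0)
            return value;                            // carry_out unchanged
        if (count >= 32) {
            carry_out = (value >> 31) != 0;          // sign bit shifted out
            return (value >> 31) ? 0xFFFFFFFF : 0;   // sign-fill
        }
        carry_out = ((value >> (count - 1)) & 1) != 0;
        return uint32_t(int32_t(value) >> count);    // arithmetic shift
    }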
 
             code->inLocalLabel();
 
-            code->cmp(shift, u32(31));
+            code->cmp(code->cl, u32(31));
             code->ja(".Rs_gt31");
             // if (Rs & 0xFF == 0) goto end;
-            code->test(shift, shift);
+            code->test(code->cl, code->cl);
             code->jz(".end");
             // if (Rs & 0xFF <= 31) {
-            code->sar(result, shift);
+            code->sar(result, code->cl);
             code->setc(carry);
             code->jmp(".end");
             // } else if (Rs & 0xFF > 31) {
@@ -815,6 +852,9 @@ void EmitX64::EmitArithmeticShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR
 
             code->L(".end");
             code->outLocalLabel();
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         }
     }
 }
@@ -822,34 +862,35 @@ void EmitX64::EmitArithmeticShiftRight(RegAlloc& reg_alloc, IR::Block& block, IR
 void EmitX64::EmitRotateRight(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
 
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& operand_arg = args[0];
+    auto& shift_arg = args[1];
+    auto& carry_arg = args[2];
+
     if (!carry_inst) {
-        if (!inst->GetArg(2).IsImmediate()) {
-            inst->GetArg(2).GetInst()->DecrementRemainingUses();
-        }
-
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            u8 shift = shift_arg.GetU8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
+            u8 shift = shift_arg.GetImmediateU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
 
             code->ror(result, u8(shift & 0x1F));
+
+            reg_alloc.DefineValue(inst, result);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
+            reg_alloc.Use(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
 
             // x64 ROR instruction does (shift & 0x1F) for us.
-            code->ror(result, shift);
+            code->ror(result, code->cl);
+
+            reg_alloc.DefineValue(inst, result);
         }
     } else {
         EraseInstruction(block, carry_inst);
 
-        auto shift_arg = inst->GetArg(1);
-
         if (shift_arg.IsImmediate()) {
-            u8 shift = shift_arg.GetU8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg8 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt8();
+            u8 shift = shift_arg.GetImmediateU8();
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg8 carry = reg_alloc.UseScratchGpr(carry_arg).cvt8();
 
             if (shift == 0) {
                 // There is nothing more to do.
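Between these hunks, the register-count path of EmitRotateRight handles three cases from the ARM spec: a count of zero changes nothing, a nonzero multiple of 32 copies bit 31 into the carry, and anything else rotates by count mod 32 with the carry taking the new bit 31. A scalar sketch (plain C++, illustrative only, not part of the patch):

    #include <cstdint>

    // ARM ROR by a register-specified count, with carry-out.
    uint32_t ArmROR(uint32_t value, uint8_t count, bool& carry_out) {
        if (count == 0)
            return value;                            // carry_out unchanged
        uint8_t amount = count & 0x1F;
        uint32_t result = amount == 0
                        ? value
                        : (value >> amount) | (value << (32 - amount));
        carry_out = (result >> 31) != 0;             // new bit 31
        return result;
    }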
@@ -860,23 +901,26 @@ void EmitX64::EmitRotateRight(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
                 code->ror(result, shift);
                 code->setc(carry);
             }
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         } else {
-            Xbyak::Reg8 shift = reg_alloc.UseScratchGpr(shift_arg, {HostLoc::RCX}).cvt8();
-            Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-            Xbyak::Reg8 carry = reg_alloc.UseDefGpr(inst->GetArg(2), carry_inst).cvt8();
+            reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+            Xbyak::Reg32 result = reg_alloc.UseScratchGpr(operand_arg).cvt32();
+            Xbyak::Reg8 carry = reg_alloc.UseScratchGpr(carry_arg).cvt8();
 
             // TODO: Optimize
 
             code->inLocalLabel();
 
             // if (Rs & 0xFF == 0) goto end;
-            code->test(shift, shift);
+            code->test(code->cl, code->cl);
             code->jz(".end");
 
-            code->and_(shift.cvt32(), u32(0x1F));
+            code->and_(code->ecx, u32(0x1F));
             code->jz(".zero_1F");
             // if (Rs & 0x1F != 0) {
-            code->ror(result, shift);
+            code->ror(result, code->cl);
             code->setc(carry);
             code->jmp(".end");
             // } else {
@@ -887,6 +931,9 @@ void EmitX64::EmitRotateRight(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
 
             code->L(".end");
             code->outLocalLabel();
+
+            reg_alloc.DefineValue(inst, result);
+            reg_alloc.DefineValue(carry_inst, carry);
         }
     }
 }
@@ -894,28 +941,32 @@ void EmitX64::EmitRotateRight(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* i
 void EmitX64::EmitRotateRightExtended(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-    Xbyak::Reg8 carry = carry_inst
-                        ? reg_alloc.UseDefGpr(inst->GetArg(1), carry_inst).cvt8()
-                        : reg_alloc.UseGpr(inst->GetArg(1)).cvt8();
+    auto args = reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
+    Xbyak::Reg8 carry = reg_alloc.UseScratchGpr(args[1]).cvt8();
 
     code->bt(carry.cvt32(), 0);
     code->rcr(result, 1);
 
+    reg_alloc.DefineValue(inst, result);
+
     if (carry_inst) {
         EraseInstruction(block, carry_inst);
 
         code->setc(carry);
+
+        reg_alloc.DefineValue(carry_inst, carry);
     }
 }
 
 const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
 
-static Xbyak::Reg8 DoCarry(RegAlloc& reg_alloc, const IR::Value& carry_in, IR::Inst* carry_out) {
+static Xbyak::Reg8 DoCarry(RegAlloc& reg_alloc, Argument& carry_in, IR::Inst* carry_out) {
     if (carry_in.IsImmediate()) {
-        return carry_out ? reg_alloc.DefGpr(carry_out).cvt8() : INVALID_REG.cvt8();
+        return carry_out ? reg_alloc.ScratchGpr().cvt8() : INVALID_REG.cvt8();
     } else {
-        return carry_out ? reg_alloc.UseDefGpr(carry_in, carry_out).cvt8() : reg_alloc.UseGpr(carry_in).cvt8();
+        return carry_out ? reg_alloc.UseScratchGpr(carry_in).cvt8() : reg_alloc.UseGpr(carry_in).cvt8();
     }
 }
 
@@ -923,20 +974,19 @@ void EmitX64::EmitAddWithCarry(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
     auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
-    IR::Value carry_in = inst->GetArg(2);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& carry_in = args[2];
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
     Xbyak::Reg8 carry = DoCarry(reg_alloc, carry_in, carry_inst);
-    Xbyak::Reg8 overflow = overflow_inst ? reg_alloc.DefGpr(overflow_inst).cvt8() : INVALID_REG.cvt8();
+    Xbyak::Reg8 overflow = overflow_inst ? reg_alloc.ScratchGpr().cvt8() : INVALID_REG.cvt8();
 
     // TODO: Consider using LEA.
 
-    if (b.IsImmediate()) {
-        u32 op_arg = b.GetU32();
+    if (args[1].IsImmediate()) {
+        u32 op_arg = args[1].GetImmediateU32();
         if (carry_in.IsImmediate()) {
-            if (carry_in.GetU1()) {
+            if (carry_in.GetImmediateU1()) {
                 code->stc();
                 code->adc(result, op_arg);
             } else {
@@ -947,10 +997,10 @@ void EmitX64::EmitAddWithCarry(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
             code->adc(result, op_arg);
         }
     } else {
-        OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+        OpArg op_arg = reg_alloc.UseOpArg(args[1]);
         op_arg.setBit(32);
         if (carry_in.IsImmediate()) {
-            if (carry_in.GetU1()) {
+            if (carry_in.GetImmediateU1()) {
                 code->stc();
                 code->adc(result, *op_arg);
             } else {
@@ -962,48 +1012,50 @@ void EmitX64::EmitAddWithCarry(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
         }
     }
 
+    reg_alloc.DefineValue(inst, result);
+
     if (carry_inst) {
         EraseInstruction(block, carry_inst);
-
         code->setc(carry);
+        reg_alloc.DefineValue(carry_inst, carry);
     }
     if (overflow_inst) {
         EraseInstruction(block, overflow_inst);
-
         code->seto(overflow);
+        reg_alloc.DefineValue(overflow_inst, overflow);
     }
 }
 
 void EmitX64::EmitAdd64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(a, inst);
-    Xbyak::Reg64 op_arg = reg_alloc.UseGpr(b);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    Xbyak::Reg64 op_arg = reg_alloc.UseGpr(args[1]);
 
     code->add(result, op_arg);
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSubWithCarry(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
     auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
-    IR::Value carry_in = inst->GetArg(2);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    auto& carry_in = args[2];
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
     Xbyak::Reg8 carry = DoCarry(reg_alloc, carry_in, carry_inst);
-    Xbyak::Reg8 overflow = overflow_inst ? reg_alloc.DefGpr(overflow_inst).cvt8() : INVALID_REG.cvt8();
+    Xbyak::Reg8 overflow = overflow_inst ? reg_alloc.ScratchGpr().cvt8() : INVALID_REG.cvt8();
 
     // TODO: Consider using LEA.
     // TODO: Optimize CMP case.
     // Note that x64 CF is inverse of what the ARM carry flag is here.
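Expanding on the comment above: ARM's carry after a subtraction means "no borrow occurred", the exact inverse of x64's CF after sub/sbb, which is why the code below uses setnc rather than setc. A scalar sketch of SubWithCarry (plain C++, illustrative only, not part of the patch):

    #include <cstdint>

    // ARM-style subtract with carry: result = a - b - (1 - carry_in),
    // computed as a + ~b + carry_in; the carry-out means "no borrow".
    uint32_t ArmSubWithCarry(uint32_t a, uint32_t b, bool carry_in, bool& carry_out) {
        uint64_t wide = uint64_t(a) + uint64_t(uint32_t(~b)) + (carry_in ? 1 : 0);
        carry_out = (wide >> 32) != 0;
        return uint32_t(wide);
    }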
 
-    if (b.IsImmediate()) {
-        u32 op_arg = b.GetU32();
+    if (args[1].IsImmediate()) {
+        u32 op_arg = args[1].GetImmediateU32();
         if (carry_in.IsImmediate()) {
-            if (carry_in.GetU1()) {
+            if (carry_in.GetImmediateU1()) {
                 code->sub(result, op_arg);
             } else {
                 code->stc();
@@ -1015,10 +1067,10 @@ void EmitX64::EmitSubWithCarry(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
             code->sbb(result, op_arg);
         }
     } else {
-        OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+        OpArg op_arg = reg_alloc.UseOpArg(args[1]);
         op_arg.setBit(32);
         if (carry_in.IsImmediate()) {
-            if (carry_in.GetU1()) {
+            if (carry_in.GetImmediateU1()) {
                 code->sub(result, *op_arg);
             } else {
                 code->stc();
@@ -1031,242 +1083,203 @@ void EmitX64::EmitSubWithCarry(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
         }
     }
 
+    reg_alloc.DefineValue(inst, result);
+
     if (carry_inst) {
         EraseInstruction(block, carry_inst);
-
         code->setnc(carry);
+        reg_alloc.DefineValue(carry_inst, carry);
     }
     if (overflow_inst) {
         EraseInstruction(block, overflow_inst);
-
         code->seto(overflow);
+        reg_alloc.DefineValue(overflow_inst, overflow);
     }
 }
 
 void EmitX64::EmitSub64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(a, inst);
-    Xbyak::Reg64 op_arg = reg_alloc.UseGpr(b);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    Xbyak::Reg64 op_arg = reg_alloc.UseGpr(args[1]);
 
     code->sub(result, op_arg);
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitMul(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
-    if (a.IsImmediate())
-        std::swap(a, b);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
-    if (b.IsImmediate()) {
-        code->imul(result, result, b.GetU32());
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
+    if (args[1].IsImmediate()) {
+        code->imul(result, result, args[1].GetImmediateU32());
     } else {
-        OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+        OpArg op_arg = reg_alloc.UseOpArg(args[1]);
         op_arg.setBit(32);
 
         code->imul(result, *op_arg);
     }
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitMul64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(a, inst);
-    OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    OpArg op_arg = reg_alloc.UseOpArg(args[1]);
 
     code->imul(result, *op_arg);
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitAnd(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-    if (b.IsImmediate()) {
-        u32 op_arg = b.GetU32();
+    if (args[1].IsImmediate()) {
+        u32 op_arg = args[1].GetImmediateU32();
 
         code->and_(result, op_arg);
     } else {
-        OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+        OpArg op_arg = reg_alloc.UseOpArg(args[1]);
         op_arg.setBit(32);
 
         code->and_(result, *op_arg);
     }
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitEor(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-    if (b.IsImmediate()) {
-        u32 op_arg = b.GetU32();
+    if (args[1].IsImmediate()) {
+        u32 op_arg = args[1].GetImmediateU32();
 
         code->xor_(result, op_arg);
     } else {
-        OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+        OpArg op_arg = reg_alloc.UseOpArg(args[1]);
         op_arg.setBit(32);
 
         code->xor_(result, *op_arg);
     }
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitOr(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-    if (b.IsImmediate()) {
-        u32 op_arg = b.GetU32();
+    if (args[1].IsImmediate()) {
+        u32 op_arg = args[1].GetImmediateU32();
 
         code->or_(result, op_arg);
     } else {
-        OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr);
+        OpArg op_arg = reg_alloc.UseOpArg(args[1]);
         op_arg.setBit(32);
 
         code->or_(result, *op_arg);
     }
+
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitNot(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    if (a.IsImmediate()) {
-        Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
-
-        code->mov(result, u32(~a.GetU32()));
+    Xbyak::Reg32 result;
+    if (args[0].IsImmediate()) {
+        result = reg_alloc.ScratchGpr().cvt32();
+        code->mov(result, u32(~args[0].GetImmediateU32()));
     } else {
-        Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
-
+        result = reg_alloc.UseScratchGpr(args[0]).cvt32();
         code->not_(result);
     }
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSignExtendWordToLong(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg source;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        source = result;
-    } else {
-        std::tie(source, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-
-    source.setBit(32);
-    code->movsxd(result.cvt64(), *source);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    code->movsxd(result.cvt64(), result.cvt32());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSignExtendHalfToWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg source;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        source = result;
-    } else {
-        std::tie(source, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-
-    source.setBit(16);
-    code->movsx(result.cvt32(), *source);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    code->movsx(result.cvt32(), result.cvt16());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitSignExtendByteToWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg source;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        source = result;
-    } else {
-        std::tie(source, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-
-    source.setBit(8);
-    code->movsx(result.cvt32(), *source);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    code->movsx(result.cvt32(), result.cvt8());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitZeroExtendWordToLong(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg source;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        source = result;
-    } else {
-        std::tie(source, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-
-    source.setBit(32);
-    code->mov(result.cvt32(), *source); // x64 zeros upper 32 bits on a 32-bit move
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    code->mov(result.cvt32(), result.cvt32()); // x64 zeros upper 32 bits on a 32-bit move
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitZeroExtendHalfToWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg source;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        source = result;
-    } else {
-        std::tie(source, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-
-    source.setBit(16);
-    code->movzx(result.cvt32(), *source);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    code->movzx(result.cvt32(), result.cvt16());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitZeroExtendByteToWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    OpArg source;
-    Xbyak::Reg64 result;
-    if (inst->GetArg(0).IsImmediate()) {
-        // TODO: Optimize
-        result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-        source = result;
-    } else {
-        std::tie(source, result) = reg_alloc.UseDefOpArgGpr(inst->GetArg(0), inst);
-    }
-
-    source.setBit(8);
-    code->movzx(result.cvt32(), *source);
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
+    code->movzx(result.cvt32(), result.cvt8());
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitByteReverseWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt32();
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
     code->bswap(result);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitByteReverseHalf(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg16 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst).cvt16();
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg16 result = reg_alloc.UseScratchGpr(args[0]).cvt16();
    code->rol(result, 8);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitByteReverseDual(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    Xbyak::Reg64 result = reg_alloc.UseDefGpr(inst->GetArg(0), inst);
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 result = reg_alloc.UseScratchGpr(args[0]);
     code->bswap(result);
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitCountLeadingZeros(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
-    IR::Value a = inst->GetArg(0);
-
+    auto args = reg_alloc.GetArgumentInfo(inst);
     if (cpu_info.has(Xbyak::util::Cpu::tLZCNT)) {
-        Xbyak::Reg32 source = reg_alloc.UseGpr(a).cvt32();
-        Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+        Xbyak::Reg32 source = reg_alloc.UseGpr(args[0]).cvt32();
+        Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
 
         code->lzcnt(result, source);
+
+        reg_alloc.DefineValue(inst, result);
     } else {
-        Xbyak::Reg32 source = reg_alloc.UseScratchGpr(a).cvt32();
-        Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+        Xbyak::Reg32 source = reg_alloc.UseScratchGpr(args[0]).cvt32();
+        Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
 
         // The result of a bsr of zero is undefined, but zf is set after it.
         code->bsr(result, source);
@@ -1274,18 +1287,19 @@ void EmitX64::EmitCountLeadingZeros(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i
         code->cmovz(result, source);
         code->neg(result);
         code->add(result, 31);
+
+        reg_alloc.DefineValue(inst, result);
     }
 }
 
 void EmitX64::EmitSignedSaturatedAdd(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
-    Xbyak::Reg32 addend = reg_alloc.UseGpr(b).cvt32();
-    Xbyak::Reg32 overflow = overflow_inst ? reg_alloc.DefGpr(overflow_inst).cvt32() : reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
+    Xbyak::Reg32 addend = reg_alloc.UseGpr(args[1]).cvt32();
+    Xbyak::Reg32 overflow = reg_alloc.ScratchGpr().cvt32();
 
     code->mov(overflow, result);
     code->shr(overflow, 31);
@@ -1294,22 +1308,25 @@ void EmitX64::EmitSignedSaturatedAdd(RegAlloc& reg_alloc, IR::Block& block, IR::
     code->add(result, addend);
     code->cmovo(result, overflow);
 
+    reg_alloc.DefineValue(inst, result);
+
     if (overflow_inst) {
         EraseInstruction(block, overflow_inst);
 
         code->seto(overflow.cvt8());
+
+        reg_alloc.DefineValue(overflow_inst, overflow);
     }
 }
 
 void EmitX64::EmitSignedSaturatedSub(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    auto args = reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
-    Xbyak::Reg32 subend = reg_alloc.UseGpr(b).cvt32();
-    Xbyak::Reg32 overflow = overflow_inst ? reg_alloc.DefGpr(overflow_inst).cvt32() : reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();
+    Xbyak::Reg32 subend = reg_alloc.UseGpr(args[1]).cvt32();
+    Xbyak::Reg32 overflow = reg_alloc.ScratchGpr().cvt32();
 
     code->mov(overflow, result);
     code->shr(overflow, 31);
@@ -1318,25 +1335,29 @@ void EmitX64::EmitSignedSaturatedSub(RegAlloc& reg_alloc, IR::Block& block, IR::
     code->sub(result, subend);
     code->cmovo(result, overflow);
 
+    reg_alloc.DefineValue(inst, result);
+
     if (overflow_inst) {
         EraseInstruction(block, overflow_inst);
 
         code->seto(overflow.cvt8());
+
+        reg_alloc.DefineValue(overflow_inst, overflow);
     }
 }
 
 void EmitX64::EmitUnsignedSaturation(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    size_t N = inst->GetArg(1).GetU8();
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    size_t N = args[1].GetImmediateU8();
     ASSERT(N <= 31);
 
     u32 saturated_value = (1u << N) - 1;
 
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
-    Xbyak::Reg32 reg_a = reg_alloc.UseGpr(a).cvt32();
-    Xbyak::Reg32 overflow = overflow_inst ? reg_alloc.DefGpr(overflow_inst).cvt32() : reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 reg_a = reg_alloc.UseGpr(args[0]).cvt32();
+    Xbyak::Reg32 overflow = reg_alloc.ScratchGpr().cvt32();
 
     // Pseudocode: result = clamp(reg_a, 0, saturated_value);
 
     code->xor_(overflow, overflow);
@@ -1345,22 +1366,26 @@ void EmitX64::EmitUnsignedSaturation(RegAlloc& reg_alloc, IR::Block& block, IR::
     code->cmovle(result, overflow);
     code->cmovbe(result, reg_a);
 
+    reg_alloc.DefineValue(inst, result);
+
     if (overflow_inst) {
         EraseInstruction(block, overflow_inst);
 
         code->seta(overflow.cvt8());
+
+        reg_alloc.DefineValue(overflow_inst, overflow);
     }
 }
 
 void EmitX64::EmitSignedSaturation(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
     auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    size_t N = inst->GetArg(1).GetU8();
+    auto args = reg_alloc.GetArgumentInfo(inst);
+    size_t N = args[1].GetImmediateU8();
     ASSERT(N >= 1 && N <= 32);
 
     if (N == 32) {
-        reg_alloc.RegisterAddDef(inst, a);
+        reg_alloc.DefineValue(inst, args[0]);
         if (overflow_inst) {
             auto no_overflow = IR::Value(false);
             overflow_inst->ReplaceUsesWith(no_overflow);
@@ -1373,9 +1398,9 @@ void EmitX64::EmitSignedSaturation(RegAlloc& reg_alloc, IR::Block& block, IR::In
     u32 negative_saturated_value = 1u << (N - 1);
     u32 sext_negative_satured_value = Common::SignExtend(N, negative_saturated_value);
 
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
-    Xbyak::Reg32 reg_a = reg_alloc.UseGpr(a).cvt32();
-    Xbyak::Reg32 overflow = overflow_inst ? reg_alloc.DefGpr(overflow_inst).cvt32() : reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 result = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 reg_a = reg_alloc.UseGpr(args[0]).cvt32();
+    Xbyak::Reg32 overflow = reg_alloc.ScratchGpr().cvt32();
     Xbyak::Reg32 tmp = reg_alloc.ScratchGpr().cvt32();
 
     // overflow now contains a value between 0 and mask if it was originally between {negative,positive}_saturated_value.
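Between these hunks the emitted code performs the signed-saturation clamp: values outside [-2^(N-1), 2^(N-1) - 1] are pinned to the nearest bound and the overflow output records that clamping happened. A scalar sketch (plain C++, illustrative only, not part of the patch):

    #include <cstdint>

    // Signed saturation to N bits (1 <= N <= 31), with an overflow flag.
    uint32_t SignedSaturate(int32_t value, unsigned N, bool& overflow) {
        const int32_t positive_limit = (1 << (N - 1)) - 1;
        const int32_t negative_limit = -(1 << (N - 1));
        overflow = value > positive_limit || value < negative_limit;
        if (value > positive_limit) return uint32_t(positive_limit);
        if (value < negative_limit) return uint32_t(negative_limit);
        return uint32_t(value);
    }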
@@ -1391,10 +1416,14 @@ void EmitX64::EmitSignedSaturation(RegAlloc& reg_alloc, IR::Block& block, IR::In
     code->cmp(overflow, mask);
     code->cmovbe(result, reg_a);
 
+    reg_alloc.DefineValue(inst, result);
+
     if (overflow_inst) {
         EraseInstruction(block, overflow_inst);
 
         code->seta(overflow.cvt8());
+
+        reg_alloc.DefineValue(overflow_inst, overflow);
     }
 }
 
@@ -1435,330 +1464,269 @@ static void ExtractAndDuplicateMostSignificantBitFromPackedWords(BlockOfCode* co
 }
 
 void EmitX64::EmitPackedAddU8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
 
-    Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(a).cvt32();
-    Xbyak::Reg32 reg_b = reg_alloc.UseScratchGpr(b).cvt32();
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
-    Xbyak::Reg32 reg_ge, tmp;
+    code->paddb(xmm_a, xmm_b);
 
     if (ge_inst) {
         EraseInstruction(block, ge_inst);
 
-        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
-        tmp = reg_alloc.ScratchGpr().cvt32();
+        Xbyak::Reg32 reg_ge = reg_alloc.ScratchGpr().cvt32();
+        Xbyak::Xmm tmp = reg_alloc.ScratchXmm();
 
-        code->mov(reg_ge, reg_a);
-        code->and_(reg_ge, reg_b);
+        code->movdqa(tmp, xmm_a);
+        code->pminub(tmp, xmm_b);
+        code->pcmpeqb(tmp, xmm_b);
+        code->movd(reg_ge, tmp);
+        code->not_(reg_ge);
+
+        ExtractMostSignificantBitFromPackedBytes(cpu_info, code, reg_alloc, reg_ge);
+        reg_alloc.DefineValue(ge_inst, reg_ge);
     }
 
-    // SWAR Arithmetic
-    code->mov(result, reg_a);
-    code->xor_(result, reg_b);
-    code->and_(result, 0x80808080);
-    code->and_(reg_a, 0x7F7F7F7F);
-    code->and_(reg_b, 0x7F7F7F7F);
-    code->add(reg_a, reg_b);
-    if (ge_inst) {
-        code->mov(tmp, result);
-        code->and_(tmp, reg_a);
-        code->or_(reg_ge, tmp);
-    }
-    code->xor_(result, reg_a);
-    if (ge_inst) {
-        ExtractMostSignificantBitFromPackedBytes(cpu_info, code, reg_alloc, reg_ge, tmp);
-    }
+    reg_alloc.DefineValue(inst, xmm_a);
 }
 
 void EmitX64::EmitPackedAddS8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
-
-    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
-    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
     Xbyak::Reg32 reg_ge;
 
-    Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm();
-    Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm();
+    Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
 
     if (ge_inst) {
         EraseInstruction(block, ge_inst);
-        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
-    }
-
-    code->movd(xmm_a, reg_a);
-    code->movd(xmm_b, reg_b);
-    if (ge_inst) {
         Xbyak::Xmm saturated_sum = reg_alloc.ScratchXmm();
+        reg_ge = reg_alloc.ScratchGpr().cvt32();
+
         code->movdqa(saturated_sum, xmm_a);
         code->paddsb(saturated_sum, xmm_b);
         code->movd(reg_ge, saturated_sum);
     }
+
     code->paddb(xmm_a, xmm_b);
-    code->movd(reg_a, xmm_a);
+
     if (ge_inst) {
         code->not_(reg_ge);
         ExtractMostSignificantBitFromPackedBytes(cpu_info, code, reg_alloc, reg_ge);
+        reg_alloc.DefineValue(ge_inst, reg_ge);
     }
+
+    reg_alloc.DefineValue(inst, xmm_a);
 }
 
 void EmitX64::EmitPackedAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
+    Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
 
-    Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(a).cvt32();
-    Xbyak::Reg32 reg_b = reg_alloc.UseScratchGpr(b).cvt32();
-    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
-    Xbyak::Reg32 reg_ge, tmp;
+    code->paddw(xmm_a, xmm_b);
 
     if (ge_inst) {
         EraseInstruction(block, ge_inst);
-        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
+
+        Xbyak::Reg32 reg_ge = reg_alloc.ScratchGpr().cvt32();
+        Xbyak::Xmm tmp = reg_alloc.ScratchXmm();
 
-        code->mov(reg_ge, reg_a);
-        code->and_(reg_ge, reg_b);
+        code->movdqa(tmp, xmm_a);
+        code->pminuw(tmp, xmm_b);
+        code->pcmpeqw(tmp, xmm_b);
+        code->movd(reg_ge, tmp);
+        code->not_(reg_ge);
+
+        ExtractMostSignificantBitFromPackedBytes(cpu_info, code, reg_alloc, reg_ge);
+        reg_alloc.DefineValue(ge_inst, reg_ge);
     }
 
-    // SWAR Arithmetic
-    code->mov(result, reg_a);
-    code->xor_(result, reg_b);
-    code->and_(result, 0x80008000);
-    code->and_(reg_a, 0x7FFF7FFF);
-    code->and_(reg_b, 0x7FFF7FFF);
-    code->add(reg_a, reg_b);
-    if (ge_inst) {
-        tmp = reg_alloc.ScratchGpr().cvt32();
-        code->mov(tmp, result);
-        code->and_(tmp, reg_a);
-        code->or_(reg_ge, tmp);
-    }
-    code->xor_(result, reg_a);
-    if (ge_inst) {
-        ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge);
-    }
+    reg_alloc.DefineValue(inst, xmm_a);
 }
 
 void EmitX64::EmitPackedAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
-
-    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
-    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
     Xbyak::Reg32 reg_ge;
 
-    Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm();
-    Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm();
-
     if (ge_inst) {
         EraseInstruction(block, ge_inst);
-        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
-    }
-
-    code->movd(xmm_a, reg_a);
-    code->movd(xmm_b, reg_b);
-    if (ge_inst) {
+        reg_ge = reg_alloc.ScratchGpr().cvt32();
         Xbyak::Xmm saturated_sum = reg_alloc.ScratchXmm();
+
         code->movdqa(saturated_sum, xmm_a);
         code->paddsw(saturated_sum, xmm_b);
         code->movd(reg_ge, saturated_sum);
     }
+
     code->paddw(xmm_a, xmm_b);
-    code->movd(reg_a, xmm_a);
+
     if (ge_inst) {
         code->not_(reg_ge);
         ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge);
+        reg_alloc.DefineValue(ge_inst, reg_ge);
     }
+
+    reg_alloc.DefineValue(inst, xmm_a);
 }
 
 void EmitX64::EmitPackedSubU8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) {
+    auto args = reg_alloc.GetArgumentInfo(inst);
     auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
 
-    IR::Value a = inst->GetArg(0);
-    IR::Value b = inst->GetArg(1);
-
-    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
-    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
     Xbyak::Reg32 reg_ge;
 
-    Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm();
-    Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm();
-    Xbyak::Xmm xmm_ge;
-
     if (ge_inst) {
         EraseInstruction(block, ge_inst);
-        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
-        xmm_ge = reg_alloc.ScratchXmm();
-    }
+        Xbyak::Xmm xmm_ge = reg_alloc.ScratchXmm();
+        reg_ge = reg_alloc.ScratchGpr().cvt32();
 
-    code->movd(xmm_a, reg_a);
-    code->movd(xmm_b, reg_b);
- if (ge_inst) { code->movdqa(xmm_ge, xmm_a); code->pmaxub(xmm_ge, xmm_b); code->pcmpeqb(xmm_ge, xmm_a); code->movd(reg_ge, xmm_ge); } + code->psubb(xmm_a, xmm_b); - code->movd(reg_a, xmm_a); if (ge_inst) { ExtractMostSignificantBitFromPackedBytes(cpu_info, code, reg_alloc, reg_ge); + reg_alloc.DefineValue(ge_inst, reg_ge); } + + reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubS8(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]); Xbyak::Reg32 reg_ge; - Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm(); - Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm(); - if (ge_inst) { EraseInstruction(block, ge_inst); - reg_ge = reg_alloc.DefGpr(ge_inst).cvt32(); - } - code->movd(xmm_b, reg_b); - code->movd(xmm_a, reg_a); - if (ge_inst) { Xbyak::Xmm xmm_ge = reg_alloc.ScratchXmm(); + reg_ge = reg_alloc.ScratchGpr().cvt32(); + code->movdqa(xmm_ge, xmm_a); code->psubsb(xmm_ge, xmm_b); code->movd(reg_ge, xmm_ge); } + code->psubb(xmm_a, xmm_b); - code->movd(reg_a, xmm_a); + if (ge_inst) { code->not_(reg_ge); ExtractMostSignificantBitFromPackedBytes(cpu_info, code, reg_alloc, reg_ge); + reg_alloc.DefineValue(ge_inst, reg_ge); } + + reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]); Xbyak::Reg32 reg_ge; - Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm(); - Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm(); - Xbyak::Xmm xmm_ge; - if (ge_inst) { EraseInstruction(block, ge_inst); - reg_ge = reg_alloc.DefGpr(ge_inst).cvt32(); - xmm_ge = reg_alloc.ScratchXmm(); - } + reg_ge = reg_alloc.ScratchGpr().cvt32(); + Xbyak::Xmm xmm_ge = reg_alloc.ScratchXmm(); - code->movd(xmm_a, reg_a); - code->movd(xmm_b, reg_b); - if (ge_inst) { code->movdqa(xmm_ge, xmm_a); code->pmaxuw(xmm_ge, xmm_b); code->pcmpeqw(xmm_ge, xmm_a); code->movd(reg_ge, xmm_ge); } + code->psubw(xmm_a, xmm_b); - code->movd(reg_a, xmm_a); + if (ge_inst) { ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge); + reg_alloc.DefineValue(ge_inst, reg_ge); } + + reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]); Xbyak::Reg32 reg_ge; - Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm(); - Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm(); - if (ge_inst) { EraseInstruction(block, ge_inst); - reg_ge = 
reg_alloc.DefGpr(ge_inst).cvt32(); - } - - code->movd(xmm_b, reg_b); - code->movd(xmm_a, reg_a); - if (ge_inst) { Xbyak::Xmm xmm_ge = reg_alloc.ScratchXmm(); + reg_ge = reg_alloc.ScratchGpr().cvt32(); + code->movdqa(xmm_ge, xmm_a); code->psubsw(xmm_ge, xmm_b); code->movd(reg_ge, xmm_ge); } + code->psubw(xmm_a, xmm_b); - code->movd(reg_a, xmm_a); + if (ge_inst) { code->not_(reg_ge); ExtractAndDuplicateMostSignificantBitFromPackedWords(code, reg_ge); + reg_alloc.DefineValue(ge_inst, reg_ge); } + + reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); // This code path requires SSSE3 because of the PSHUFB instruction. // A fallback implementation is provided below. if (cpu_info.has(Xbyak::util::Cpu::tSSSE3)) { - Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32(); - - // Load the operands into Xmm registers - Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm(); - Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm(); + Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm xmm_b = reg_alloc.UseScratchXmm(args[1]); Xbyak::Xmm xmm_mask = reg_alloc.ScratchXmm(); Xbyak::Reg64 mask = reg_alloc.ScratchGpr(); - code->movd(xmm_scratch_a, result); - code->movd(xmm_scratch_b, arg); - // Set the mask to expand the values // 0xAABBCCDD becomes 0x00AA00BB00CC00DD code->mov(mask, 0x8003800280018000); code->movq(xmm_mask, mask); // Expand each 8-bit value to 16-bit - code->pshufb(xmm_scratch_a, xmm_mask); - code->pshufb(xmm_scratch_b, xmm_mask); + code->pshufb(xmm_a, xmm_mask); + code->pshufb(xmm_b, xmm_mask); // Add the individual 16-bit values - code->paddw(xmm_scratch_a, xmm_scratch_b); + code->paddw(xmm_a, xmm_b); // Shift the 16-bit values to the right to halve them - code->psrlw(xmm_scratch_a, 1); + code->psrlw(xmm_a, 1); // Set the mask to pack the values again // 0x00AA00BB00CC00DD becomes 0xAABBCCDD @@ -1766,33 +1734,33 @@ void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->movq(xmm_mask, mask); // Shuffle them back to 8-bit values - code->pshufb(xmm_scratch_a, xmm_mask); + code->pshufb(xmm_a, xmm_mask); - code->movd(result, xmm_scratch_a); - return; + reg_alloc.DefineValue(inst, xmm_a); + } else { + // Fallback implementation in case the CPU doesn't support SSSE3 + Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32(); + Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32(); + Xbyak::Reg32 and_a_b = reg_a; + Xbyak::Reg32 result = reg_a; + + code->mov(xor_a_b, reg_a); + code->and(and_a_b, reg_b); + code->xor(xor_a_b, reg_b); + code->shr(xor_a_b, 1); + code->and(xor_a_b, 0x7F7F7F7F); + code->add(result, xor_a_b); + + reg_alloc.DefineValue(inst, result); } - - // Fallback implementation in case the CPU doesn't support SSSE3 - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); - Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32(); - Xbyak::Reg32 and_a_b = reg_a; - Xbyak::Reg32 result = reg_a; - - code->mov(xor_a_b, reg_a); - code->and(and_a_b, reg_b); - code->xor(xor_a_b, reg_b); - code->shr(xor_a_b, 1); - code->and(xor_a_b, 0x7F7F7F7F); - code->add(result, xor_a_b); } void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value 
b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 and_a_b = reg_a; Xbyak::Reg32 result = reg_a; @@ -1808,14 +1776,15 @@ void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->shr(xor_a_b, 1); code->and(xor_a_b, 0x7FFF7FFF); code->add(result, xor_a_b); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitPackedHalvingAddS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 and_a_b = reg_a; Xbyak::Reg32 result = reg_a; @@ -1836,14 +1805,15 @@ void EmitX64::EmitPackedHalvingAddS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->and(xor_a_b, 0x7F7F7F7F); code->add(result, xor_a_b); code->xor(result, carry); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitPackedHalvingAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 and_a_b = reg_a; Xbyak::Reg32 result = reg_a; @@ -1864,14 +1834,15 @@ void EmitX64::EmitPackedHalvingAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->and(xor_a_b, 0x7FFF7FFF); code->add(result, xor_a_b); code->xor(result, carry); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitPackedHalvingSubU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 minuend = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(b).cvt32(); + Xbyak::Reg32 minuend = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(args[1]).cvt32(); // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). // Note that x^y always contains the LSB of the result. @@ -1894,14 +1865,14 @@ void EmitX64::EmitPackedHalvingSubU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->xor(minuend, 0x80808080); // minuend now contains the desired result. 
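Both halving fallback families reduce to lane-safe SWAR identities: the adds use a + b == 2*(a & b) + (a ^ b), and the subtracts use the x - y == (x^y) - (((x^y)&y) << 1) identity quoted in the comment above. Minimal scalar renderings of the 8-bit-lane cases; the 16-bit variants only swap the masks for 0x7FFF7FFF/0x80008000, and the emitted code reuses the minuend/subtrahend registers for these temporaries:

static u32 HalvingAddU8(u32 a, u32 b) {
    u32 half_xor = ((a ^ b) >> 1) & 0x7F7F7F7F; // mask stops bits leaking across lanes
    return (a & b) + half_xor;                  // per-lane (a + b) / 2; no lane overflows
}

static u32 HalvingSubU8(u32 a, u32 b) {
    u32 xor_a_b = a ^ b;                        // holds the LSB of each lane's difference
    u32 borrow = xor_a_b & b;                   // the bit positions that borrow
    u32 lhs = ((xor_a_b >> 1) & 0x7F7F7F7F) | 0x80808080; // bias each lane's MSB...
    return (lhs - borrow) ^ 0x80808080;         // ...so no lane borrows into its neighbour
}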
+ reg_alloc.DefineValue(inst, minuend); } void EmitX64::EmitPackedHalvingSubS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 minuend = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(b).cvt32(); + Xbyak::Reg32 minuend = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(args[1]).cvt32(); Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32(); @@ -1929,14 +1900,15 @@ void EmitX64::EmitPackedHalvingSubS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->sub(minuend, subtrahend); code->xor(minuend, 0x80808080); code->xor(minuend, carry); + + reg_alloc.DefineValue(inst, minuend); } void EmitX64::EmitPackedHalvingSubU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 minuend = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(b).cvt32(); + Xbyak::Reg32 minuend = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(args[1]).cvt32(); // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). // Note that x^y always contains the LSB of the result. @@ -1957,14 +1929,15 @@ void EmitX64::EmitPackedHalvingSubU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->or(minuend, 0x80008000); code->sub(minuend, subtrahend); code->xor(minuend, 0x80008000); + + reg_alloc.DefineValue(inst, minuend); } void EmitX64::EmitPackedHalvingSubS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 minuend = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(b).cvt32(); + Xbyak::Reg32 minuend = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 subtrahend = reg_alloc.UseScratchGpr(args[1]).cvt32(); Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32(); @@ -1992,106 +1965,76 @@ void EmitX64::EmitPackedHalvingSubS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* code->sub(minuend, subtrahend); code->xor(minuend, 0x80008000); code->xor(minuend, carry); + + reg_alloc.DefineValue(inst, minuend); +} + +void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, bool is_signed) { + auto args = reg_alloc.GetArgumentInfo(inst); + + Xbyak::Reg32 reg_a_hi = reg_alloc.UseScratchGpr(args[0]).cvt32(); + Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(args[1]).cvt32(); + Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32(); + Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32(); + + // If asx is true, the high word contains the sum and the low word the difference. + // If false, the high word contains the difference and the low word the sum. + bool asx = args[2].GetImmediateU1(); + + if (is_signed) { + code->movsx(reg_a_lo, reg_a_hi.cvt16()); + code->movsx(reg_b_lo, reg_b_hi.cvt16()); + code->sar(reg_a_hi, 16); + code->sar(reg_b_hi, 16); + } else { + code->movzx(reg_a_lo, reg_a_hi.cvt16()); + code->movzx(reg_b_lo, reg_b_hi.cvt16()); + code->shr(reg_a_hi, 16); + code->shr(reg_b_hi, 16); + } + + if (asx) { + // Calculate diff such that reg_a_lo<31:16> contains diff<16:1>. + code->sub(reg_a_lo, reg_b_hi); + code->shl(reg_a_lo, 15); + + // Calculate sum such that reg_a_hi<15:0> contains sum<16:1>. 
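// (Staging note, for the merge at the end of this helper: the shl-by-15
// above parks the first halved result in reg_a_lo<31:16>, and the shr-by-1
// below leaves the second one in reg_a_hi<15:0>; the closing shld then
// concatenates the two 16-bit fields.)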
+ code->add(reg_a_hi, reg_b_lo); + code->shr(reg_a_hi, 1); + } else { + // Calculate sum such that reg_a_lo<31:16> contains sum<16:1>. + code->add(reg_a_lo, reg_b_hi); + code->shl(reg_a_lo, 15); + + // Calculate diff such that reg_a_hi<15:0> contains diff<16:1>. + code->sub(reg_a_hi, reg_b_lo); + code->shr(reg_a_hi, 1); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. + code->shld(reg_a_hi, reg_a_lo, 16); + + reg_alloc.DefineValue(inst, reg_a_hi); } void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - - // If asx is true, the high word contains the sum and the low word the difference. - // If false, the high word contains the difference and the low word the sum. - bool asx = inst->GetArg(2).GetU1(); - - Xbyak::Reg32 reg_a_hi = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(b).cvt32(); - Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32(); - Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32(); - - code->movzx(reg_a_lo, reg_a_hi.cvt16()); - code->movzx(reg_b_lo, reg_b_hi.cvt16()); - code->shr(reg_a_hi, 16); - code->shr(reg_b_hi, 16); - - if (asx) { - // Calculate diff such that reg_a_lo<31:16> contains diff<16:1>. - code->sub(reg_a_lo, reg_b_hi); - code->shl(reg_a_lo, 15); - - // Calculate sum such that reg_a_hi<15:0> contains sum<16:1>. - code->add(reg_a_hi, reg_b_lo); - code->shr(reg_a_hi, 1); - } else { - // Calculate sum such that reg_a_lo<31:16> contains sum<16:1>. - code->add(reg_a_lo, reg_b_hi); - code->shl(reg_a_lo, 15); - - // Calculate diff such that reg_a_hi<15:0> contains diff<16:1>. - code->sub(reg_a_hi, reg_b_lo); - code->shr(reg_a_hi, 1); - } - - // reg_a_lo now contains the low word and reg_a_hi now contains the high word. - // Merge them. - code->shld(reg_a_hi, reg_a_lo, 16); + EmitPackedHalvingSubAdd(code, reg_alloc, inst, false); } void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - - // If asx is true, the high word contains the sum and the low word the difference. - // If false, the high word contains the difference and the low word the sum. - bool asx = inst->GetArg(2).GetU1(); - - Xbyak::Reg32 reg_a_hi = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(b).cvt32(); - Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32(); - Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32(); - - code->movsx(reg_a_lo, reg_a_hi.cvt16()); - code->movsx(reg_b_lo, reg_b_hi.cvt16()); - code->sar(reg_a_hi, 16); - code->sar(reg_b_hi, 16); - - if (asx) { - // Calculate diff such that reg_a_lo<31:16> contains diff<16:1>. - code->sub(reg_a_lo, reg_b_hi); - code->shl(reg_a_lo, 15); - - // Calculate sum such that reg_a_hi<15:0> contains sum<16:1>. - code->add(reg_a_hi, reg_b_lo); - code->shr(reg_a_hi, 1); - } else { - // Calculate sum such that reg_a_lo<31:16> contains sum<16:1>. - code->add(reg_a_lo, reg_b_hi); - code->shl(reg_a_lo, 15); - - // Calculate diff such that reg_a_hi<15:0> contains diff<16:1>. - code->sub(reg_a_hi, reg_b_lo); - code->shr(reg_a_hi, 1); - } - - // reg_a_lo now contains the low word and reg_a_hi now contains the high word. - // Merge them. 
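For reference, the scalar behaviour the new shared helper implements — the ARM ASX/SAX packed halving forms being deduplicated out of the two functions below. A sketch of the unsigned case (the signed case differs only in sign-extending the halves first, which changes bit 15 of each halved result); the function name is illustrative:

static u32 HalvingSubAddU16(u32 a, u32 b, bool asx) {
    u32 a_lo = a & 0xFFFF, a_hi = a >> 16;
    u32 b_lo = b & 0xFFFF, b_hi = b >> 16;
    u32 lo, hi;
    if (asx) {
        lo = (a_lo - b_hi) >> 1; // difference lands in the low half
        hi = (a_hi + b_lo) >> 1; // sum lands in the high half
    } else {
        lo = (a_lo + b_hi) >> 1; // sum lands in the low half
        hi = (a_hi - b_lo) >> 1; // difference lands in the high half
    }
    return (hi << 16) | (lo & 0xFFFF); // what shld(reg_a_hi, reg_a_lo, 16) builds
}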
- code->shld(reg_a_hi, reg_a_lo, 16); + EmitPackedHalvingSubAdd(code, reg_alloc, inst, true); } static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32(); - Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32(); + Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]); - Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm(); - Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm(); + (code->*fn)(xmm_a, xmm_b); - code->movd(xmm_scratch_a, result); - code->movd(xmm_scratch_b, arg); - - (code->*fn)(xmm_scratch_a, xmm_scratch_b); - - code->movd(result, xmm_scratch_a); + reg_alloc.DefineValue(inst, xmm_a); } void EmitX64::EmitPackedSaturatedAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { @@ -2224,11 +2167,10 @@ static void ZeroIfNaN64(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_ } static void FPThreeOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); - Xbyak::Xmm operand = reg_alloc.UseXmm(b); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm operand = reg_alloc.UseXmm(args[1]); Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); if (block.Location().FPSCR().FTZ()) { @@ -2242,14 +2184,15 @@ static void FPThreeOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block if (block.Location().FPSCR().DN()) { DefaultNaN32(code, result); } + + reg_alloc.DefineValue(inst, result); } static void FPThreeOp64(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); - Xbyak::Xmm operand = reg_alloc.UseXmm(b); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Xmm operand = reg_alloc.UseXmm(args[1]); Xbyak::Reg64 gpr_scratch = reg_alloc.ScratchGpr(); if (block.Location().FPSCR().FTZ()) { @@ -2263,12 +2206,14 @@ static void FPThreeOp64(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block if (block.Location().FPSCR().DN()) { DefaultNaN64(code, result); } + + reg_alloc.DefineValue(inst, result); } static void FPTwoOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) { - IR::Value a = inst->GetArg(0); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); if (block.Location().FPSCR().FTZ()) { @@ -2282,12 +2227,14 @@ static void FPTwoOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, if (block.Location().FPSCR().DN()) { DefaultNaN32(code, result); } + + reg_alloc.DefineValue(inst, result); } static void FPTwoOp64(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const 
Xbyak::Operand&)) { - IR::Value a = inst->GetArg(0); + auto args = reg_alloc.GetArgumentInfo(inst); - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); Xbyak::Reg64 gpr_scratch = reg_alloc.ScratchGpr(); if (block.Location().FPSCR().FTZ()) { @@ -2301,76 +2248,76 @@ static void FPTwoOp64(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, if (block.Location().FPSCR().DN()) { DefaultNaN64(code, result); } + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitTransferFromFP32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32(); - Xbyak::Xmm source = reg_alloc.UseXmm(inst->GetArg(0)); - // TODO: Eliminate this. - code->movd(result, source); + auto args = reg_alloc.GetArgumentInfo(inst); + reg_alloc.DefineValue(inst, args[0]); } void EmitX64::EmitTransferFromFP64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - Xbyak::Reg64 result = reg_alloc.DefGpr(inst); - Xbyak::Xmm source = reg_alloc.UseXmm(inst->GetArg(0)); - // TODO: Eliminate this. - code->movq(result, source); + auto args = reg_alloc.GetArgumentInfo(inst); + reg_alloc.DefineValue(inst, args[0]); } void EmitX64::EmitTransferToFP32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - if (inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetU32() == 0) { - Xbyak::Xmm result = reg_alloc.DefXmm(inst); + auto args = reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate() && args[0].GetImmediateU32() == 0) { + Xbyak::Xmm result = reg_alloc.ScratchXmm(); code->xorps(result, result); + reg_alloc.DefineValue(inst, result); } else { - Xbyak::Xmm result = reg_alloc.DefXmm(inst); - Xbyak::Reg32 source = reg_alloc.UseGpr(inst->GetArg(0)).cvt32(); - // TODO: Eliminate this. - code->movd(result, source); + reg_alloc.DefineValue(inst, args[0]); } } void EmitX64::EmitTransferToFP64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - if (inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetU64() == 0) { - Xbyak::Xmm result = reg_alloc.DefXmm(inst); - code->xorpd(result, result); + auto args = reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate() && args[0].GetImmediateU64() == 0) { + Xbyak::Xmm result = reg_alloc.ScratchXmm(); + code->xorps(result, result); + reg_alloc.DefineValue(inst, result); } else { - Xbyak::Xmm result = reg_alloc.DefXmm(inst); - Xbyak::Reg64 source = reg_alloc.UseGpr(inst->GetArg(0)); - // TODO: Eliminate this. 
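Aside: this is where the TODOs above pay off. The new allocator tracks every value a host location currently holds, and EmitMove (added in reg_alloc.cpp further down) can shuttle between GPRs, XMMs and spill slots, so the TransferFromFP*/TransferToFP* opcodes no longer emit an eager movd/movq at all — DefineValue(inst, args[0]) simply records the result as an alias of its source:

// Before: every FP<->core transfer cost an instruction, e.g.
//   movd eax, xmm0    ; TODO: Eliminate this.
// After: nothing is emitted here; the value stays where it is, and if a
// later use really does need the other register class, EmitMove's
// GPR<->XMM movq case produces the transfer at that point instead.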
- code->movq(result, source); + reg_alloc.DefineValue(inst, args[0]); } } void EmitX64::EmitFPAbs32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); code->pand(result, code->MFloatNonSignMask32()); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitFPAbs64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); code->pand(result, code->MFloatNonSignMask64()); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitFPNeg32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); code->pxor(result, code->MFloatNegativeZero32()); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitFPNeg64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); code->pxor(result, code->MFloatNegativeZero64()); + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitFPAdd32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { @@ -2437,12 +2384,10 @@ static void SetFpscrNzcvFromFlags(BlockOfCode* code, RegAlloc& reg_alloc) { } void EmitX64::EmitFPCompare32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - bool quiet = inst->GetArg(2).GetU1(); - - Xbyak::Xmm reg_a = reg_alloc.UseXmm(a); - Xbyak::Xmm reg_b = reg_alloc.UseXmm(b); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm reg_a = reg_alloc.UseXmm(args[0]); + Xbyak::Xmm reg_b = reg_alloc.UseXmm(args[1]); + bool quiet = args[2].GetImmediateU1(); if (quiet) { code->ucomiss(reg_a, reg_b); @@ -2454,12 +2399,10 @@ void EmitX64::EmitFPCompare32(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { } void EmitX64::EmitFPCompare64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - IR::Value b = inst->GetArg(1); - bool quiet = inst->GetArg(2).GetU1(); - - Xbyak::Xmm reg_a = reg_alloc.UseXmm(a); - Xbyak::Xmm reg_b = reg_alloc.UseXmm(b); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm reg_a = reg_alloc.UseXmm(args[0]); + Xbyak::Xmm reg_b = reg_alloc.UseXmm(args[1]); + bool quiet = args[2].GetImmediateU1(); if (quiet) { code->ucomisd(reg_a, reg_b); @@ -2471,9 +2414,8 @@ void EmitX64::EmitFPCompare64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { } void EmitX64::EmitFPSingleToDouble(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); Xbyak::Reg64 gpr_scratch = reg_alloc.ScratchGpr(); if (block.Location().FPSCR().FTZ()) { @@ -2486,12 +2428,13 @@ void EmitX64::EmitFPSingleToDouble(RegAlloc& reg_alloc, IR::Block& block, IR::In if (block.Location().FPSCR().DN()) { DefaultNaN64(code, result); } + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitFPDoubleToSingle(RegAlloc& reg_alloc, 
IR::Block& block, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - - Xbyak::Xmm result = reg_alloc.UseDefXmm(a, inst); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm result = reg_alloc.UseScratchXmm(args[0]); Xbyak::Reg64 gpr_scratch = reg_alloc.ScratchGpr(); if (block.Location().FPSCR().FTZ()) { @@ -2504,29 +2447,29 @@ void EmitX64::EmitFPDoubleToSingle(RegAlloc& reg_alloc, IR::Block& block, IR::In if (block.Location().FPSCR().DN()) { DefaultNaN32(code, result); } + + reg_alloc.DefineValue(inst, result); } void EmitX64::EmitFPSingleToS32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_towards_zero = inst->GetArg(1).GetU1(); - - Xbyak::Xmm from = reg_alloc.UseScratchXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm from = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Reg32 to = reg_alloc.ScratchGpr().cvt32(); Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm(); + bool round_towards_zero = args[1].GetImmediateU1(); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. // Conversion to double is lossless, and allows for clamping. if (block.Location().FPSCR().FTZ()) { - DenormalsAreZero32(code, from, gpr_scratch); + DenormalsAreZero32(code, from, to); } code->cvtss2sd(from, from); // First time is to set flags if (round_towards_zero) { - code->cvttsd2si(gpr_scratch, from); // 32 bit gpr + code->cvttsd2si(to, from); // 32 bit gpr } else { - code->cvtsd2si(gpr_scratch, from); // 32 bit gpr + code->cvtsd2si(to, from); // 32 bit gpr } // Clamp to output range ZeroIfNaN64(code, from, xmm_scratch); @@ -2534,21 +2477,20 @@ void EmitX64::EmitFPSingleToS32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* code->maxsd(from, code->MFloatMinS32()); // Second time is for real if (round_towards_zero) { - code->cvttsd2si(gpr_scratch, from); // 32 bit gpr + code->cvttsd2si(to, from); // 32 bit gpr } else { - code->cvtsd2si(gpr_scratch, from); // 32 bit gpr + code->cvtsd2si(to, from); // 32 bit gpr } - code->movd(to, gpr_scratch); + + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPSingleToU32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_towards_zero = inst->GetArg(1).GetU1(); - - Xbyak::Xmm from = reg_alloc.UseScratchXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm from = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Reg32 to = reg_alloc.ScratchGpr().cvt32(); Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm(); + bool round_towards_zero = args[1].GetImmediateU1(); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. // Conversion to double is lossless, and allows for accurate clamping. 
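The clamp sequence just emitted has a straightforward scalar reading. A sketch of the signed conversion, ignoring the FTZ handling and assuming MFloatMaxS32/MFloatMinS32 hold 2^31-1 and -2^31 as doubles (uses <algorithm>/<cmath>; the x64 code performs the first cvtsd2si purely to set the FP exception flags on the unclamped value):

static s32 SaturatedToS32(float value, bool round_towards_zero) {
    double x = static_cast<double>(value); // cvtss2sd: lossless widening
    if (std::isnan(x))
        x = 0.0;                           // ZeroIfNaN64
    x = std::min(x, 2147483647.0);         // minsd against MFloatMaxS32
    x = std::max(x, -2147483648.0);        // maxsd against MFloatMinS32
    return static_cast<s32>(round_towards_zero ? std::trunc(x) : std::nearbyint(x));
}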
@@ -2559,28 +2501,27 @@ void EmitX64::EmitFPSingleToU32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* if (block.Location().FPSCR().RMode() != Arm::FPSCR::RoundingMode::TowardsZero && !round_towards_zero) { if (block.Location().FPSCR().FTZ()) { - DenormalsAreZero32(code, from, gpr_scratch); + DenormalsAreZero32(code, from, to); } code->cvtss2sd(from, from); ZeroIfNaN64(code, from, xmm_scratch); // Bring into SSE range code->addsd(from, code->MFloatMinS32()); // First time is to set flags - code->cvtsd2si(gpr_scratch, from); // 32 bit gpr + code->cvtsd2si(to, from); // 32 bit gpr // Clamp to output range code->minsd(from, code->MFloatMaxS32()); code->maxsd(from, code->MFloatMinS32()); // Actually convert - code->cvtsd2si(gpr_scratch, from); // 32 bit gpr + code->cvtsd2si(to, from); // 32 bit gpr // Bring back into original range - code->add(gpr_scratch, u32(2147483648u)); - code->movd(to, gpr_scratch); + code->add(to, u32(2147483648u)); } else { Xbyak::Xmm xmm_mask = reg_alloc.ScratchXmm(); Xbyak::Reg32 gpr_mask = reg_alloc.ScratchGpr().cvt32(); if (block.Location().FPSCR().FTZ()) { - DenormalsAreZero32(code, from, gpr_scratch); + DenormalsAreZero32(code, from, to); } code->cvtss2sd(from, from); ZeroIfNaN64(code, from, xmm_scratch); @@ -2593,26 +2534,26 @@ void EmitX64::EmitFPSingleToU32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* // Bring into range if necessary code->addsd(from, xmm_mask); // First time is to set flags - code->cvttsd2si(gpr_scratch, from); // 32 bit gpr + code->cvttsd2si(to, from); // 32 bit gpr // Clamp to output range code->minsd(from, code->MFloatMaxS32()); code->maxsd(from, code->MFloatMinU32()); // Actually convert - code->cvttsd2si(gpr_scratch, from); // 32 bit gpr + code->cvttsd2si(to, from); // 32 bit gpr // Bring back into original range if necessary - code->add(gpr_scratch, gpr_mask); - code->movd(to, gpr_scratch); + code->add(to, gpr_mask); } + + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPDoubleToS32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_towards_zero = inst->GetArg(1).GetU1(); - - Xbyak::Xmm from = reg_alloc.UseScratchXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm from = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Reg32 to = reg_alloc.ScratchGpr().cvt32(); Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm(); + Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + bool round_towards_zero = args[1].GetImmediateU1(); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. 
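The unsigned path above needs one extra trick: SSE2 has no double-to-u32 conversion, so the value is biased down by 2^31 into signed range (the addsd against MFloatMinS32), converted, then re-biased with the integer add of 2147483648u. In scalar form, with the same assumptions about the M* constants as before; a sketch of the non-truncating path only:

static u32 SaturatedToU32(double x) {
    if (std::isnan(x))
        x = 0.0;                            // ZeroIfNaN64
    x += -2147483648.0;                     // addsd MFloatMinS32: bias into s32 range
    x = std::min(x, 2147483647.0);          // clamp exactly as in the signed case
    x = std::max(x, -2147483648.0);
    s32 biased = static_cast<s32>(std::nearbyint(x));
    return static_cast<u32>(biased) + 2147483648u; // undo the bias
}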
@@ -2631,21 +2572,21 @@ void EmitX64::EmitFPDoubleToS32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* code->maxsd(from, code->MFloatMinS32()); // Second time is for real if (round_towards_zero) { - code->cvttsd2si(gpr_scratch, from); // 32 bit gpr + code->cvttsd2si(to, from); // 32 bit gpr } else { - code->cvtsd2si(gpr_scratch, from); // 32 bit gpr + code->cvtsd2si(to, from); // 32 bit gpr } - code->movd(to, gpr_scratch); + + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPDoubleToU32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_towards_zero = inst->GetArg(1).GetU1(); - - Xbyak::Xmm from = reg_alloc.UseScratchXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Xmm from = reg_alloc.UseScratchXmm(args[0]); + Xbyak::Reg32 to = reg_alloc.ScratchGpr().cvt32(); Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm(); + Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + bool round_towards_zero = args[1].GetImmediateU1(); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. // TODO: Use VCVTPD2UDQ when AVX512VL is available. @@ -2664,10 +2605,9 @@ void EmitX64::EmitFPDoubleToU32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* code->minsd(from, code->MFloatMaxS32()); code->maxsd(from, code->MFloatMinS32()); // Actually convert - code->cvtsd2si(gpr_scratch, from); // 32 bit gpr + code->cvtsd2si(to, from); // 32 bit gpr // Bring back into original range - code->add(gpr_scratch, u32(2147483648u)); - code->movd(to, gpr_scratch); + code->add(to, u32(2147483648u)); } else { Xbyak::Xmm xmm_mask = reg_alloc.ScratchXmm(); Xbyak::Reg32 gpr_mask = reg_alloc.ScratchGpr().cvt32(); @@ -2690,65 +2630,64 @@ void EmitX64::EmitFPDoubleToU32(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* code->minsd(from, code->MFloatMaxS32()); code->maxsd(from, code->MFloatMinU32()); // Actually convert - code->cvttsd2si(gpr_scratch, from); // 32 bit gpr + code->cvttsd2si(to, from); // 32 bit gpr // Bring back into original range if necessary - code->add(gpr_scratch, gpr_mask); - code->movd(to, gpr_scratch); + code->add(to, gpr_mask); } + + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPS32ToSingle(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_to_nearest = inst->GetArg(1).GetU1(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Reg32 from = reg_alloc.UseGpr(args[0]).cvt32(); + Xbyak::Xmm to = reg_alloc.ScratchXmm(); + bool round_to_nearest = args[1].GetImmediateU1(); ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented"); - Xbyak::Xmm from = reg_alloc.UseXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + code->cvtsi2ss(to, from); - code->movd(gpr_scratch, from); - code->cvtsi2ss(to, gpr_scratch); + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPU32ToSingle(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_to_nearest = inst->GetArg(1).GetU1(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Reg64 from = reg_alloc.UseGpr(args[0]); + Xbyak::Xmm to = reg_alloc.ScratchXmm(); + bool round_to_nearest = args[1].GetImmediateU1(); ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented"); - Xbyak::Xmm from = reg_alloc.UseXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - // Use a 64-bit register to ensure we don't end up 
treating the input as signed - Xbyak::Reg64 gpr_scratch = reg_alloc.ScratchGpr(); + // We are using a 64-bit GPR register to ensure we don't end up treating the input as signed + code->mov(from.cvt32(), from.cvt32()); // TODO: Verify if this is necessary + code->cvtsi2ss(to, from); - code->movq(gpr_scratch, from); - code->cvtsi2ss(to, gpr_scratch); + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPS32ToDouble(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_to_nearest = inst->GetArg(1).GetU1(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Reg32 from = reg_alloc.UseGpr(args[0]).cvt32(); + Xbyak::Xmm to = reg_alloc.ScratchXmm(); + bool round_to_nearest = args[1].GetImmediateU1(); ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented"); - Xbyak::Xmm from = reg_alloc.UseXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32(); + code->cvtsi2sd(to, from); - code->movd(gpr_scratch, from); - code->cvtsi2sd(to, gpr_scratch); + reg_alloc.DefineValue(inst, to); } void EmitX64::EmitFPU32ToDouble(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - IR::Value a = inst->GetArg(0); - bool round_to_nearest = inst->GetArg(1).GetU1(); + auto args = reg_alloc.GetArgumentInfo(inst); + Xbyak::Reg64 from = reg_alloc.UseGpr(args[0]); + Xbyak::Xmm to = reg_alloc.ScratchXmm(); + bool round_to_nearest = args[1].GetImmediateU1(); ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented"); - Xbyak::Xmm from = reg_alloc.UseXmm(a); - Xbyak::Xmm to = reg_alloc.DefXmm(inst); - // Use a 64-bit register to ensure we don't end up treating the input as signed - Xbyak::Reg64 gpr_scratch = reg_alloc.ScratchGpr(); + // We are using a 64-bit GPR register to ensure we don't end up treating the input as signed + code->mov(from.cvt32(), from.cvt32()); // TODO: Verify if this is necessary + code->cvtsi2sd(to, from); - code->movq(gpr_scratch, from); - code->cvtsi2sd(to, gpr_scratch); + reg_alloc.DefineValue(inst, to); } @@ -2761,8 +2700,9 @@ void EmitX64::EmitClearExclusive(RegAlloc&, IR::Block&, IR::Inst*) { void EmitX64::EmitSetExclusive(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { using namespace Xbyak::util; - ASSERT(inst->GetArg(1).IsImmediate()); - Xbyak::Reg32 address = reg_alloc.UseGpr(inst->GetArg(0)).cvt32(); + auto args = reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + Xbyak::Reg32 address = reg_alloc.UseGpr(args[0]).cvt32(); code->mov(code->byte[r15 + offsetof(JitState, exclusive_state)], u8(1)); code->mov(dword[r15 + offsetof(JitState, exclusive_address)], address); @@ -2770,16 +2710,16 @@ void EmitX64::EmitSetExclusive(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) template static void ReadMemory(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, UserCallbacks& cb, size_t bit_size, FunctionPointer fn) { + auto args = reg_alloc.GetArgumentInfo(inst); + reg_alloc.HostCall(inst, args[0]); + if (!cb.page_table) { - reg_alloc.HostCall(inst, inst->GetArg(0)); code->CallFunction(fn); return; } using namespace Xbyak::util; - reg_alloc.HostCall(inst, inst->GetArg(0)); - Xbyak::Reg64 result = code->ABI_RETURN; Xbyak::Reg32 vaddr = code->ABI_PARAM1.cvt32(); Xbyak::Reg64 page_index = code->ABI_PARAM3; @@ -2787,26 +2727,26 @@ static void ReadMemory(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, U Xbyak::Label abort, end; - code->mov(rax, reinterpret_cast(cb.page_table)); + code->mov(result, reinterpret_cast(cb.page_table)); code->mov(page_index.cvt32(), vaddr); 
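The fast path being rebuilt here — now addressed through the named result register (ABI_RETURN) rather than hard-coded rax — reads through a flat page table: bits <31:12> of the virtual address select a host pointer, a null entry bails out to the user callback, and bits <11:0> offset into the page. A scalar sketch assuming that 4 KiB page-table layout; FallbackRead stands in for the cb read-callback path and is hypothetical:

template <typename T>
static T ReadMemoryFastPath(u8** page_table, u32 vaddr) {
    u8* const page = page_table[vaddr >> 12]; // shr page_index, 12
    if (!page)
        return FallbackRead<T>(vaddr);        // test/jz abort: page unmapped
    T result;
    std::memcpy(&result, page + (vaddr & 4095), sizeof(T)); // and_ page_offset, 4095
    return result;
}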
code->shr(page_index.cvt32(), 12); - code->mov(rax, qword[rax + page_index * 8]); - code->test(rax, rax); + code->mov(result, qword[result + page_index * 8]); + code->test(result, result); code->jz(abort); code->mov(page_offset.cvt32(), vaddr); code->and_(page_offset.cvt32(), 4095); switch (bit_size) { case 8: - code->movzx(result, code->byte[rax + page_offset]); + code->movzx(result, code->byte[result + page_offset]); break; case 16: - code->movzx(result, word[rax + page_offset]); + code->movzx(result, word[result + page_offset]); break; case 32: - code->mov(result.cvt32(), dword[rax + page_offset]); + code->mov(result.cvt32(), dword[result + page_offset]); break; case 64: - code->mov(result.cvt64(), qword[rax + page_offset]); + code->mov(result.cvt64(), qword[result + page_offset]); break; default: ASSERT_MSG(false, "Invalid bit_size"); @@ -2820,16 +2760,16 @@ static void ReadMemory(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, U template <typename FunctionPointer> static void WriteMemory(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, UserCallbacks& cb, size_t bit_size, FunctionPointer fn) { + auto args = reg_alloc.GetArgumentInfo(inst); + reg_alloc.HostCall(nullptr, args[0], args[1]); + if (!cb.page_table) { - reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1)); code->CallFunction(fn); return; } using namespace Xbyak::util; - reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1)); - Xbyak::Reg32 vaddr = code->ABI_PARAM1.cvt32(); Xbyak::Reg64 value = code->ABI_PARAM2; Xbyak::Reg64 page_index = code->ABI_PARAM3; @@ -2902,17 +2842,18 @@ void EmitX64::EmitWriteMemory64(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) template <typename FunctionPointer> static void ExclusiveWrite(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, FunctionPointer fn, bool prepend_high_word) { + auto args = reg_alloc.GetArgumentInfo(inst); + if (prepend_high_word) { + reg_alloc.HostCall(nullptr, args[0], args[1], args[2]); + } else { + reg_alloc.HostCall(nullptr, args[0], args[1]); + } + Xbyak::Reg32 passed = reg_alloc.ScratchGpr().cvt32(); + Xbyak::Reg32 tmp = code->ABI_RETURN.cvt32(); // Use one of the unused HostCall registers. + using namespace Xbyak::util; Xbyak::Label end; - if (prepend_high_word) { - reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1), inst->GetArg(2)); - } else { - reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1)); - } - Xbyak::Reg32 passed = reg_alloc.DefGpr(inst).cvt32(); - Xbyak::Reg32 tmp = code->ABI_RETURN.cvt32(); // Use one of the unused HostCall registers.
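For orientation, the strex-style protocol ExclusiveWrite implements, gestured at in scalar form — 'passed' is now an ordinary scratch GPR that only becomes the instruction's result at the final DefineValue, instead of being pre-committed with DefGpr. The address-match test lives in the elided context of this hunk, so it is only sketched here:

// u32 passed = 1;                        // 1 = store failed
// if (jit_state.exclusive_state /* and vaddr matches exclusive_address */) {
//     jit_state.exclusive_state = 0;
//     fn(vaddr, value);                  // perform the store via the callback
//     passed = 0;                        // 0 = store performed
// }
// DefineValue(inst, passed) then publishes the result.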
- code->mov(passed, u32(1)); code->cmp(code->byte[r15 + offsetof(JitState, exclusive_state)], u8(0)); code->je(end); @@ -2929,6 +2870,8 @@ static void ExclusiveWrite(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* ins code->CallFunction(fn); code->xor_(passed, passed); code->L(end); + + reg_alloc.DefineValue(inst, passed); } void EmitX64::EmitExclusiveWriteMemory8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { @@ -2951,7 +2894,7 @@ static void EmitCoprocessorException() { ASSERT_MSG(false, "Should raise coproc exception here"); } -static void CallCoprocCallback(BlockOfCode* code, RegAlloc& reg_alloc, Jit* jit_interface, Coprocessor::Callback callback, IR::Inst* inst = nullptr, IR::Value arg0 = {}, IR::Value arg1 = {}) { +static void CallCoprocCallback(BlockOfCode* code, RegAlloc& reg_alloc, Jit* jit_interface, Coprocessor::Callback callback, IR::Inst* inst = nullptr, boost::optional arg0 = {}, boost::optional arg1 = {}) { reg_alloc.HostCall(inst, {}, {}, arg0, arg1); code->mov(code->ABI_PARAM1, reinterpret_cast(jit_interface)); @@ -2989,6 +2932,7 @@ void EmitX64::EmitCoprocInternalOperation(RegAlloc& reg_alloc, IR::Block&, IR::I } void EmitX64::EmitCoprocSendOneWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto coproc_info = inst->GetArg(0).GetCoprocInfo(); size_t coproc_num = coproc_info[0]; @@ -2998,8 +2942,6 @@ void EmitX64::EmitCoprocSendOneWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i Arm::CoprocReg CRm = static_cast(coproc_info[4]); unsigned opc2 = static_cast(coproc_info[5]); - IR::Value word = inst->GetArg(1); - std::shared_ptr coproc = cb.coprocessors[coproc_num]; if (!coproc) { EmitCoprocessorException(); @@ -3012,12 +2954,12 @@ void EmitX64::EmitCoprocSendOneWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i EmitCoprocessorException(); return; case 1: - CallCoprocCallback(code, reg_alloc, jit_interface, boost::get(action), nullptr, word); + CallCoprocCallback(code, reg_alloc, jit_interface, boost::get(action), nullptr, args[1]); return; case 2: { u32* destination_ptr = boost::get(action); - Xbyak::Reg32 reg_word = reg_alloc.UseGpr(word).cvt32(); + Xbyak::Reg32 reg_word = reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Reg64 reg_destination_addr = reg_alloc.ScratchGpr(); code->mov(reg_destination_addr, reinterpret_cast(destination_ptr)); @@ -3031,6 +2973,7 @@ void EmitX64::EmitCoprocSendOneWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i } void EmitX64::EmitCoprocSendTwoWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto coproc_info = inst->GetArg(0).GetCoprocInfo(); size_t coproc_num = coproc_info[0]; @@ -3038,9 +2981,6 @@ void EmitX64::EmitCoprocSendTwoWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* unsigned opc = static_cast(coproc_info[2]); Arm::CoprocReg CRm = static_cast(coproc_info[3]); - IR::Value word1 = inst->GetArg(1); - IR::Value word2 = inst->GetArg(2); - std::shared_ptr coproc = cb.coprocessors[coproc_num]; if (!coproc) { EmitCoprocessorException(); @@ -3053,13 +2993,13 @@ void EmitX64::EmitCoprocSendTwoWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* EmitCoprocessorException(); return; case 1: - CallCoprocCallback(code, reg_alloc, jit_interface, boost::get(action), nullptr, word1, word2); + CallCoprocCallback(code, reg_alloc, jit_interface, boost::get(action), nullptr, args[1], args[2]); return; case 2: { auto destination_ptrs = boost::get>(action); - Xbyak::Reg32 reg_word1 = reg_alloc.UseGpr(word1).cvt32(); - Xbyak::Reg32 reg_word2 = 
reg_alloc.UseGpr(word2).cvt32(); + Xbyak::Reg32 reg_word1 = reg_alloc.UseGpr(args[1]).cvt32(); + Xbyak::Reg32 reg_word2 = reg_alloc.UseGpr(args[2]).cvt32(); Xbyak::Reg64 reg_destination_addr = reg_alloc.ScratchGpr(); code->mov(reg_destination_addr, reinterpret_cast(destination_ptrs[0])); @@ -3101,12 +3041,14 @@ void EmitX64::EmitCoprocGetOneWord(RegAlloc& reg_alloc, IR::Block&, IR::Inst* in case 2: { u32* source_ptr = boost::get(action); - Xbyak::Reg32 reg_word = reg_alloc.DefGpr(inst).cvt32(); + Xbyak::Reg32 reg_word = reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg64 reg_source_addr = reg_alloc.ScratchGpr(); code->mov(reg_source_addr, reinterpret_cast(source_ptr)); code->mov(reg_word, code->dword[reg_source_addr]); + reg_alloc.DefineValue(inst, reg_word); + return; } default: @@ -3139,7 +3081,7 @@ void EmitX64::EmitCoprocGetTwoWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i case 2: { auto source_ptrs = boost::get>(action); - Xbyak::Reg64 reg_result = reg_alloc.DefGpr(inst); + Xbyak::Reg64 reg_result = reg_alloc.ScratchGpr(); Xbyak::Reg64 reg_destination_addr = reg_alloc.ScratchGpr(); Xbyak::Reg64 reg_tmp = reg_alloc.ScratchGpr(); @@ -3150,6 +3092,8 @@ void EmitX64::EmitCoprocGetTwoWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i code->mov(reg_tmp.cvt32(), code->dword[reg_destination_addr]); code->or_(reg_result, reg_tmp); + reg_alloc.DefineValue(inst, reg_result); + return; } default: @@ -3158,6 +3102,7 @@ void EmitX64::EmitCoprocGetTwoWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* i } void EmitX64::EmitCoprocLoadWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto coproc_info = inst->GetArg(0).GetCoprocInfo(); size_t coproc_num = coproc_info[0]; @@ -3167,8 +3112,6 @@ void EmitX64::EmitCoprocLoadWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* ins bool has_option = coproc_info[4] != 0; boost::optional option{has_option, coproc_info[5]}; - IR::Value address = inst->GetArg(1); - std::shared_ptr coproc = cb.coprocessors[coproc_num]; if (!coproc) { EmitCoprocessorException(); @@ -3181,10 +3124,11 @@ void EmitX64::EmitCoprocLoadWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* ins return; } - CallCoprocCallback(code, reg_alloc, jit_interface, *action, nullptr, address); + CallCoprocCallback(code, reg_alloc, jit_interface, *action, nullptr, args[1]); } void EmitX64::EmitCoprocStoreWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { + auto args = reg_alloc.GetArgumentInfo(inst); auto coproc_info = inst->GetArg(0).GetCoprocInfo(); size_t coproc_num = coproc_info[0]; @@ -3194,8 +3138,6 @@ void EmitX64::EmitCoprocStoreWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* in bool has_option = coproc_info[4] != 0; boost::optional option{has_option, coproc_info[5]}; - IR::Value address = inst->GetArg(1); - std::shared_ptr coproc = cb.coprocessors[coproc_num]; if (!coproc) { EmitCoprocessorException(); @@ -3208,7 +3150,7 @@ void EmitX64::EmitCoprocStoreWords(RegAlloc& reg_alloc, IR::Block&, IR::Inst* in return; } - CallCoprocCallback(code, reg_alloc, jit_interface, *action, nullptr, address); + CallCoprocCallback(code, reg_alloc, jit_interface, *action, nullptr, args[1]); } void EmitX64::EmitAddCycles(size_t cycles) { diff --git a/src/backend_x64/hostloc.cpp b/src/backend_x64/hostloc.cpp index 2093623a..6349e4a8 100644 --- a/src/backend_x64/hostloc.cpp +++ b/src/backend_x64/hostloc.cpp @@ -10,12 +10,12 @@ namespace Dynarmic { namespace BackendX64 { Xbyak::Reg64 HostLocToReg64(HostLoc loc) { - DEBUG_ASSERT(HostLocIsGPR(loc)); + 
ASSERT(HostLocIsGPR(loc)); return Xbyak::Reg64(static_cast(loc)); } Xbyak::Xmm HostLocToXmm(HostLoc loc) { - DEBUG_ASSERT(HostLocIsXMM(loc)); + ASSERT(HostLocIsXMM(loc)); return Xbyak::Xmm(static_cast(loc) - static_cast(HostLoc::XMM0)); } @@ -23,7 +23,7 @@ Xbyak::Address SpillToOpArg(HostLoc loc) { using namespace Xbyak::util; static_assert(std::is_same::value, "Spill must be u64"); - DEBUG_ASSERT(HostLocIsSpill(loc)); + ASSERT(HostLocIsSpill(loc)); size_t i = static_cast(loc) - static_cast(HostLoc::FirstSpill); return qword[r15 + offsetof(JitState, Spill) + i * sizeof(u64)]; diff --git a/src/backend_x64/reg_alloc.cpp b/src/backend_x64/reg_alloc.cpp index 073d7be0..3496f6cd 100644 --- a/src/backend_x64/reg_alloc.cpp +++ b/src/backend_x64/reg_alloc.cpp @@ -22,6 +22,8 @@ static u64 ImmediateToU64(const IR::Value& imm) { return u64(imm.GetU1()); case IR::Type::U8: return u64(imm.GetU8()); + case IR::Type::U16: + return u64(imm.GetU16()); case IR::Type::U32: return u64(imm.GetU32()); case IR::Type::U64: @@ -31,253 +33,265 @@ static u64 ImmediateToU64(const IR::Value& imm) { } } -static Xbyak::Reg HostLocToX64(HostLoc hostloc) { - if (HostLocIsGPR(hostloc)) { - DEBUG_ASSERT(hostloc != HostLoc::RSP && hostloc != HostLoc::R15); - return HostLocToReg64(hostloc); - } - if (HostLocIsXMM(hostloc)) { - return HostLocToXmm(hostloc); - } - ASSERT_MSG(false, "This should never happen."); +static bool IsSameHostLocClass(HostLoc a, HostLoc b) { + return (HostLocIsGPR(a) && HostLocIsGPR(b)) + || (HostLocIsXMM(a) && HostLocIsXMM(b)) + || (HostLocIsSpill(a) && HostLocIsSpill(b)); } -HostLoc RegAlloc::DefHostLocReg(IR::Inst* def_inst, HostLocList desired_locations) { - DEBUG_ASSERT(std::all_of(desired_locations.begin(), desired_locations.end(), HostLocIsRegister)); - DEBUG_ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); - - HostLoc location = SelectARegister(desired_locations); - - if (IsRegisterOccupied(location)) { - SpillRegister(location); - } - - LocInfo(location).is_being_used = true; - LocInfo(location).def = def_inst; - - DEBUG_ASSERT(LocInfo(location).IsDef()); - return location; -} - -void RegAlloc::RegisterAddDef(IR::Inst* def_inst, const IR::Value& use_inst) { - DEBUG_ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); - - if (use_inst.IsImmediate()) { - LoadImmediateIntoHostLocReg(use_inst, DefHostLocReg(def_inst, any_gpr)); - return; - } - - DEBUG_ASSERT_MSG(ValueLocation(use_inst.GetInst()), "use_inst must already be defined"); - HostLoc location = *ValueLocation(use_inst.GetInst()); - LocInfo(location).values.emplace_back(def_inst); - use_inst.GetInst()->DecrementRemainingUses(); - DEBUG_ASSERT(LocInfo(location).IsIdle()); -} - -HostLoc RegAlloc::UseDefHostLocReg(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations) { - if (!use_value.IsImmediate()) { - return UseDefHostLocReg(use_value.GetInst(), def_inst, desired_locations); - } - - return LoadImmediateIntoHostLocReg(use_value, DefHostLocReg(def_inst, desired_locations)); -} - -HostLoc RegAlloc::UseDefHostLocReg(IR::Inst* use_inst, IR::Inst* def_inst, HostLocList desired_locations) { - DEBUG_ASSERT(std::all_of(desired_locations.begin(), desired_locations.end(), HostLocIsRegister)); - DEBUG_ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); - DEBUG_ASSERT_MSG(ValueLocation(use_inst), "use_inst has not been defined"); - - if (IsLastUse(use_inst)) { - HostLoc current_location = *ValueLocation(use_inst); - auto& loc_info = LocInfo(current_location); - if 
(loc_info.IsIdle()) { - loc_info.is_being_used = true; - loc_info.def = def_inst; - DEBUG_ASSERT(loc_info.IsUseDef()); - if (HostLocIsSpill(current_location)) { - HostLoc new_location = SelectARegister(desired_locations); - if (IsRegisterOccupied(new_location)) { - SpillRegister(new_location); - } - EmitMove(new_location, current_location); - LocInfo(new_location) = LocInfo(current_location); - LocInfo(current_location) = {}; - return new_location; - } else { - return current_location; - } - } - } - - bool is_floating_point = HostLocIsXMM(*desired_locations.begin()); - if (is_floating_point) { - DEBUG_ASSERT(use_inst->GetType() == IR::Type::F32 || use_inst->GetType() == IR::Type::F64); - } - HostLoc use_reg = UseHostLocReg(use_inst, is_floating_point ? any_xmm : any_gpr); - HostLoc def_reg = DefHostLocReg(def_inst, desired_locations); - if (is_floating_point) { - code->movapd(HostLocToXmm(def_reg), HostLocToXmm(use_reg)); +static void EmitMove(BlockOfCode* code, HostLoc to, HostLoc from) { + if (HostLocIsXMM(to) && HostLocIsXMM(from)) { + code->movaps(HostLocToXmm(to), HostLocToXmm(from)); + } else if (HostLocIsGPR(to) && HostLocIsGPR(from)) { + code->mov(HostLocToReg64(to), HostLocToReg64(from)); + } else if (HostLocIsXMM(to) && HostLocIsGPR(from)) { + code->movq(HostLocToXmm(to), HostLocToReg64(from)); + } else if (HostLocIsGPR(to) && HostLocIsXMM(from)) { + code->movq(HostLocToReg64(to), HostLocToXmm(from)); + } else if (HostLocIsXMM(to) && HostLocIsSpill(from)) { + code->movsd(HostLocToXmm(to), SpillToOpArg(from)); + } else if (HostLocIsSpill(to) && HostLocIsXMM(from)) { + code->movsd(SpillToOpArg(to), HostLocToXmm(from)); + } else if (HostLocIsGPR(to) && HostLocIsSpill(from)) { + code->mov(HostLocToReg64(to), SpillToOpArg(from)); + } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) { + code->mov(SpillToOpArg(to), HostLocToReg64(from)); } else { - code->mov(HostLocToReg64(def_reg), HostLocToReg64(use_reg)); + ASSERT_MSG(false, "Invalid RegAlloc::EmitMove"); } - return def_reg; } -std::tuple RegAlloc::UseDefOpArgHostLocReg(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations) { - DEBUG_ASSERT(std::all_of(desired_locations.begin(), desired_locations.end(), HostLocIsRegister)); - DEBUG_ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); - DEBUG_ASSERT_MSG(use_value.IsImmediate() || ValueLocation(use_value.GetInst()), "use_inst has not been defined"); +static void EmitExchange(BlockOfCode* code, HostLoc a, HostLoc b) { + if (HostLocIsGPR(a) && HostLocIsGPR(b)) { + code->xchg(HostLocToReg64(a), HostLocToReg64(b)); + } else if (HostLocIsXMM(a) && HostLocIsXMM(b)) { + ASSERT_MSG(false, "Check your code: Exchanging XMM registers is unnecessary"); + } else { + ASSERT_MSG(false, "Invalid RegAlloc::EmitExchange"); + } +} - if (!use_value.IsImmediate()) { - const IR::Inst* use_inst = use_value.GetInst(); +bool HostLocInfo::IsLocked() const { + return is_being_used; +} - if (IsLastUse(use_inst)) { - HostLoc current_location = *ValueLocation(use_inst); - auto& loc_info = LocInfo(current_location); - if (!loc_info.IsIdle()) { - if (HostLocIsSpill(current_location)) { - loc_info.is_being_used = true; - DEBUG_ASSERT(loc_info.IsUse()); - return std::make_tuple(SpillToOpArg(current_location), DefHostLocReg(def_inst, desired_locations)); - } else { - loc_info.is_being_used = true; - loc_info.def = def_inst; - DEBUG_ASSERT(loc_info.IsUseDef()); - return std::make_tuple(HostLocToX64(current_location), current_location); - } - } +bool HostLocInfo::IsEmpty() const { 
+ return !is_being_used && values.empty(); +} + +bool HostLocInfo::IsLastUse() const { + return !is_being_used && std::all_of(values.begin(), values.end(), [](const auto& inst) { return !inst->HasUses(); }); +} + +bool HostLocInfo::ContainsValue(const IR::Inst* inst) const { + return std::find(values.begin(), values.end(), inst) != values.end(); +} + +void HostLocInfo::ReadLock() { + ASSERT(!is_scratch); + is_being_used = true; +} + +void HostLocInfo::WriteLock() { + ASSERT(!is_being_used); + is_being_used = true; + is_scratch = true; +} + +void HostLocInfo::AddValue(IR::Inst* inst) { + values.push_back(inst); +} + +void HostLocInfo::EndOfAllocScope() { + const auto to_erase = std::remove_if(values.begin(), values.end(), [](const auto& inst) { return !inst->HasUses(); }); + values.erase(to_erase, values.end()); + + is_being_used = false; + is_scratch = false; +} + +IR::Type Argument::GetType() const { + return value.GetType(); +} + +bool Argument::IsImmediate() const { + return value.IsImmediate(); +} + +bool Argument::GetImmediateU1() const { + return value.GetU1(); +} + +u8 Argument::GetImmediateU8() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x100); + return u8(imm); +} + +u16 Argument::GetImmediateU16() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x10000); + return u16(imm); +} + +u32 Argument::GetImmediateU32() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x100000000); + return u32(imm); +} + +u64 Argument::GetImmediateU64() const { + return ImmediateToU64(value); +} + +bool Argument::IsInGpr() const { + return HostLocIsGPR(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInXmm() const { + return HostLocIsXMM(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInMemory() const { + return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst())); +} + +std::array RegAlloc::GetArgumentInfo(IR::Inst* inst) { + std::array ret = { Argument{*this}, Argument{*this}, Argument{*this} }; + for (size_t i = 0; i < inst->NumArgs(); i++) { + IR::Value arg = inst->GetArg(i); + ret[i].value = arg; + if (!arg.IsImmediate()) { + arg.GetInst()->DecrementRemainingUses(); } } - - OpArg use_oparg = UseOpArg(use_value, any_gpr); - HostLoc def_reg = DefHostLocReg(def_inst, desired_locations); - return std::make_tuple(use_oparg, def_reg); + return ret; } -HostLoc RegAlloc::UseHostLocReg(IR::Value use_value, HostLocList desired_locations) { - if (!use_value.IsImmediate()) { - return UseHostLocReg(use_value.GetInst(), desired_locations); - } - - return LoadImmediateIntoHostLocReg(use_value, ScratchHostLocReg(desired_locations)); +Xbyak::Reg64 RegAlloc::UseGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseImpl(arg.value, any_gpr)); } -HostLoc RegAlloc::UseHostLocReg(IR::Inst* use_inst, HostLocList desired_locations) { - HostLoc current_location; - bool was_being_used; - std::tie(current_location, was_being_used) = UseHostLoc(use_inst, desired_locations); - - if (HostLocIsRegister(current_location)) { - return current_location; - } else if (HostLocIsSpill(current_location)) { - HostLoc new_location = SelectARegister(desired_locations); - if (IsRegisterOccupied(new_location)) { - SpillRegister(new_location); - } - EmitMove(new_location, current_location); - if (!was_being_used) { - LocInfo(new_location) = LocInfo(current_location); - LocInfo(current_location) = {}; - DEBUG_ASSERT(LocInfo(new_location).IsUse()); - } else { - LocInfo(new_location).is_being_used = true; - 
+std::array<Argument, 3> RegAlloc::GetArgumentInfo(IR::Inst* inst) {
+    std::array<Argument, 3> ret = { Argument{*this}, Argument{*this}, Argument{*this} };
+    for (size_t i = 0; i < inst->NumArgs(); i++) {
+        IR::Value arg = inst->GetArg(i);
+        ret[i].value = arg;
+        if (!arg.IsImmediate()) {
+            arg.GetInst()->DecrementRemainingUses();
         }
     }
-
-    OpArg use_oparg = UseOpArg(use_value, any_gpr);
-    HostLoc def_reg = DefHostLocReg(def_inst, desired_locations);
-    return std::make_tuple(use_oparg, def_reg);
+    return ret;
 }
 
-HostLoc RegAlloc::UseHostLocReg(IR::Value use_value, HostLocList desired_locations) {
-    if (!use_value.IsImmediate()) {
-        return UseHostLocReg(use_value.GetInst(), desired_locations);
-    }
-
-    return LoadImmediateIntoHostLocReg(use_value, ScratchHostLocReg(desired_locations));
+Xbyak::Reg64 RegAlloc::UseGpr(Argument& arg) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    return HostLocToReg64(UseImpl(arg.value, any_gpr));
 }
 
-HostLoc RegAlloc::UseHostLocReg(IR::Inst* use_inst, HostLocList desired_locations) {
-    HostLoc current_location;
-    bool was_being_used;
-    std::tie(current_location, was_being_used) = UseHostLoc(use_inst, desired_locations);
-
-    if (HostLocIsRegister(current_location)) {
-        return current_location;
-    } else if (HostLocIsSpill(current_location)) {
-        HostLoc new_location = SelectARegister(desired_locations);
-        if (IsRegisterOccupied(new_location)) {
-            SpillRegister(new_location);
-        }
-        EmitMove(new_location, current_location);
-        if (!was_being_used) {
-            LocInfo(new_location) = LocInfo(current_location);
-            LocInfo(current_location) = {};
-            DEBUG_ASSERT(LocInfo(new_location).IsUse());
-        } else {
-            LocInfo(new_location).is_being_used = true;
-            DEBUG_ASSERT(LocInfo(new_location).IsScratch());
-        }
-        return new_location;
-    }
-
-    ASSERT_MSG(false, "Unknown current_location type");
+Xbyak::Xmm RegAlloc::UseXmm(Argument& arg) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    return HostLocToXmm(UseImpl(arg.value, any_xmm));
 }
 
-OpArg RegAlloc::UseOpArg(IR::Value use_value, HostLocList desired_locations) {
+OpArg RegAlloc::UseOpArg(Argument& arg) {
+    return UseGpr(arg);
+}
+
+void RegAlloc::Use(Argument& arg, HostLoc host_loc) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    UseImpl(arg.value, {host_loc});
+}
+
+Xbyak::Reg64 RegAlloc::UseScratchGpr(Argument& arg) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    return HostLocToReg64(UseScratchImpl(arg.value, any_gpr));
+}
+
+Xbyak::Xmm RegAlloc::UseScratchXmm(Argument& arg) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    return HostLocToXmm(UseScratchImpl(arg.value, any_xmm));
+}
+
+void RegAlloc::UseScratch(Argument& arg, HostLoc host_loc) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    UseScratchImpl(arg.value, {host_loc});
+}
+
+void RegAlloc::DefineValue(IR::Inst* inst, const Xbyak::Reg& reg) {
+    ASSERT(reg.getKind() == Xbyak::Operand::XMM || reg.getKind() == Xbyak::Operand::REG);
+    HostLoc hostloc = static_cast<HostLoc>(reg.getIdx() + static_cast<size_t>(reg.getKind() == Xbyak::Operand::XMM ? HostLoc::XMM0 : HostLoc::RAX));
+    DefineValueImpl(inst, hostloc);
+}
+
+void RegAlloc::DefineValue(IR::Inst* inst, Argument& arg) {
+    ASSERT(!arg.allocated);
+    arg.allocated = true;
+    DefineValueImpl(inst, arg.value);
+}
+
+Xbyak::Reg64 RegAlloc::ScratchGpr(HostLocList desired_locations) {
+    return HostLocToReg64(ScratchImpl(desired_locations));
+}
+
+Xbyak::Xmm RegAlloc::ScratchXmm(HostLocList desired_locations) {
+    return HostLocToXmm(ScratchImpl(desired_locations));
+}
 
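+// UseImpl locates use_value and returns a host location from desired_locations
+// that holds it. A merely read-locked location can be shared between several
+// uses of the same value; if the current location is already write-locked, the
+// value is copied via the UseScratchImpl path below instead.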
+HostLoc RegAlloc::UseImpl(IR::Value use_value, HostLocList desired_locations) {
     if (use_value.IsImmediate()) {
-        ASSERT_MSG(false, "UseOpArg does not support immediates");
-        return {}; // return a None
+        return LoadImmediate(use_value, ScratchImpl(desired_locations));
     }
 
     IR::Inst* use_inst = use_value.GetInst();
+    const HostLoc current_location = *ValueLocation(use_inst);
 
-    HostLoc current_location;
-    bool was_being_used;
-    std::tie(current_location, was_being_used) = UseHostLoc(use_inst, desired_locations);
-
-    if (HostLocIsRegister(current_location)) {
-        return HostLocToX64(current_location);
-    } else if (HostLocIsSpill(current_location)) {
-        return SpillToOpArg(current_location);
+    const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
+    if (can_use_current_location) {
+        LocInfo(current_location).ReadLock();
+        return current_location;
     }
 
-    ASSERT_MSG(false, "Unknown current_location type");
+    if (LocInfo(current_location).IsLocked()) {
+        return UseScratchImpl(use_value, desired_locations);
+    }
+
+    const HostLoc destination_location = SelectARegister(desired_locations);
+    if (IsSameHostLocClass(destination_location, current_location)) {
+        Exchange(destination_location, current_location);
+    } else {
+        MoveOutOfTheWay(destination_location);
+        Move(destination_location, current_location);
+    }
+    LocInfo(destination_location).ReadLock();
+    return destination_location;
 }
 
-HostLoc RegAlloc::UseScratchHostLocReg(IR::Value use_value, HostLocList desired_locations) {
-    if (!use_value.IsImmediate()) {
-        return UseScratchHostLocReg(use_value.GetInst(), desired_locations);
+HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, HostLocList desired_locations) {
+    if (use_value.IsImmediate()) {
+        return LoadImmediate(use_value, ScratchImpl(desired_locations));
     }
 
-    return LoadImmediateIntoHostLocReg(use_value, ScratchHostLocReg(desired_locations));
+    IR::Inst* use_inst = use_value.GetInst();
+    const HostLoc current_location = *ValueLocation(use_inst);
+
+    const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
+    if (can_use_current_location && !LocInfo(current_location).IsLocked()) {
+        MoveOutOfTheWay(current_location);
+        LocInfo(current_location).WriteLock();
+        return current_location;
+    }
+
+    const HostLoc destination_location = SelectARegister(desired_locations);
+    MoveOutOfTheWay(destination_location);
+    CopyToScratch(destination_location, current_location);
+    LocInfo(destination_location).WriteLock();
+    return destination_location;
 }
 
-HostLoc RegAlloc::UseScratchHostLocReg(IR::Inst* use_inst, HostLocList desired_locations) {
-    DEBUG_ASSERT(std::all_of(desired_locations.begin(), desired_locations.end(), HostLocIsRegister));
-    DEBUG_ASSERT_MSG(ValueLocation(use_inst), "use_inst has not been defined");
-    ASSERT_MSG(use_inst->HasUses(), "use_inst ran out of uses. (Use-d an IR::Inst* too many times)");
-
-    HostLoc current_location = *ValueLocation(use_inst);
-    HostLoc new_location = SelectARegister(desired_locations);
-    if (IsRegisterOccupied(new_location)) {
-        SpillRegister(new_location);
-    }
-
-    if (HostLocIsSpill(current_location)) {
-        EmitMove(new_location, current_location);
-        LocInfo(new_location).is_being_used = true;
-        use_inst->DecrementRemainingUses();
-        DEBUG_ASSERT(LocInfo(new_location).IsScratch());
-        return new_location;
-    } else if (HostLocIsRegister(current_location)) {
-        ASSERT(LocInfo(current_location).IsIdle()
-            || LocInfo(current_location).IsUse()
-            || LocInfo(current_location).IsUseDef());
-
-        if (current_location != new_location) {
-            EmitMove(new_location, current_location);
-        } else {
-            ASSERT(LocInfo(current_location).IsIdle());
-        }
-
-        LocInfo(new_location).is_being_used = true;
-        LocInfo(new_location).values.clear();
-        use_inst->DecrementRemainingUses();
-        DEBUG_ASSERT(LocInfo(new_location).IsScratch());
-        return new_location;
-    }
-
-    ASSERT_MSG(false, "Invalid current_location");
-}
-
-HostLoc RegAlloc::ScratchHostLocReg(HostLocList desired_locations) {
-    DEBUG_ASSERT(std::all_of(desired_locations.begin(), desired_locations.end(), HostLocIsRegister));
-
+HostLoc RegAlloc::ScratchImpl(HostLocList desired_locations) {
     HostLoc location = SelectARegister(desired_locations);
-
-    if (IsRegisterOccupied(location)) {
-        SpillRegister(location);
-    }
-
-    // Update state
-    LocInfo(location).is_being_used = true;
-
-    DEBUG_ASSERT(LocInfo(location).IsScratch());
+    MoveOutOfTheWay(location);
+    LocInfo(location).WriteLock();
     return location;
 }
 
-void RegAlloc::HostCall(IR::Inst* result_def, IR::Value arg0_use, IR::Value arg1_use, IR::Value arg2_use, IR::Value arg3_use) {
+void RegAlloc::HostCall(IR::Inst* result_def, boost::optional<Argument&> arg0, boost::optional<Argument&> arg1, boost::optional<Argument&> arg2, boost::optional<Argument&> arg3) {
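+    // Editor's note: argument binding below happens in two passes; present
+    // arguments are first write-locked into their ABI registers, then every
+    // unused ABI register is claimed as scratch so the callee cannot clobber
+    // a live value.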
     constexpr size_t args_count = 4;
     constexpr std::array<HostLoc, args_count> args_hostloc = { ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4 };
-    const std::array<IR::Value*, args_count> args = {&arg0_use, &arg1_use, &arg2_use, &arg3_use};
+    const std::array<boost::optional<Argument&>, args_count> args = { arg0, arg1, arg2, arg3 };
 
-    const static std::vector<HostLoc> other_caller_save = [args_hostloc](){
+    const static std::vector<HostLoc> other_caller_save = [args_hostloc]() {
         std::vector<HostLoc> ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end());
 
         for (auto hostloc : args_hostloc)
@@ -286,33 +300,45 @@ void RegAlloc::HostCall(IR::Inst* result_def, IR::Value arg0_use, IR::Value arg1
         return ret;
     }();
 
-    // TODO: This works but almost certainly leads to suboptimal generated code.
-
+    ScratchGpr({ABI_RETURN});
     if (result_def) {
-        DefHostLocReg(result_def, {ABI_RETURN});
-    } else {
-        ScratchHostLocReg({ABI_RETURN});
+        DefineValueImpl(result_def, ABI_RETURN);
     }
 
     for (size_t i = 0; i < args_count; i++) {
-        if (!args[i]->IsEmpty()) {
-            UseScratchHostLocReg(*args[i], {args_hostloc[i]});
-        } else {
-            ScratchHostLocReg({args_hostloc[i]});
+        if (args[i]) {
+            UseScratch(*args[i], args_hostloc[i]);
+        }
+    }
+
+    for (size_t i = 0; i < args_count; i++) {
+        if (!args[i]) {
+            // TODO: Force spill
+            ScratchGpr({args_hostloc[i]});
         }
     }
 
     for (HostLoc caller_saved : other_caller_save) {
-        ScratchHostLocReg({caller_saved});
+        ScratchImpl({caller_saved});
    }
 }
 
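+// Presumably invoked once the current IR instruction has been emitted: every
+// lock is released and values with no remaining uses are dropped.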
+void RegAlloc::EndOfAllocScope() {
+    for (auto& iter : hostloc_info) {
+        iter.EndOfAllocScope();
+    }
+}
+
+void RegAlloc::AssertNoMoreUses() {
+    ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i) { return i.IsEmpty(); }));
+}
+
 HostLoc RegAlloc::SelectARegister(HostLocList desired_locations) const {
     std::vector<HostLoc> candidates = desired_locations;
 
     // Find all locations that have not been allocated..
     auto allocated_locs = std::partition(candidates.begin(), candidates.end(), [this](auto loc){
-        return !this->IsRegisterAllocated(loc);
+        return !this->LocInfo(loc).IsLocked();
     });
     candidates.erase(allocated_locs, candidates.end());
     ASSERT_MSG(!candidates.empty(), "All candidate registers have already been allocated");
@@ -321,7 +347,7 @@ HostLoc RegAlloc::SelectARegister(HostLocList desired_locations) const {
     // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible.
     std::partition(candidates.begin(), candidates.end(), [this](auto loc){
-        return !this->IsRegisterOccupied(loc);
+        return this->LocInfo(loc).IsEmpty();
     });
 
     return candidates.front();
@@ -329,152 +355,33 @@ HostLoc RegAlloc::SelectARegister(HostLocList desired_locations) const {
 
 boost::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const {
     for (size_t i = 0; i < HostLocCount; i++)
-        for (const IR::Inst* v : hostloc_info[i].values)
-            if (v == value)
-                return boost::make_optional(static_cast<HostLoc>(i));
+        if (hostloc_info[i].ContainsValue(value))
+            return boost::make_optional(static_cast<HostLoc>(i));
 
     return boost::none;
 }
 
-bool RegAlloc::IsRegisterOccupied(HostLoc loc) const {
-    const auto& info = LocInfo(loc);
-
-    return !info.values.empty() || info.def;
+void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) {
+    ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined");
+    LocInfo(host_loc).AddValue(def_inst);
 }
 
-bool RegAlloc::IsRegisterAllocated(HostLoc loc) const {
-    return LocInfo(loc).is_being_used;
-}
-
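+// This overload aliases def_inst to the host location that already holds
+// use_inst, so no copy is emitted; an immediate is instead materialised into a
+// scratch GPR first.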
+void RegAlloc::DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) {
+    ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined");
 
-bool RegAlloc::IsLastUse(const IR::Inst* inst) const {
-    if (inst->UseCount() > 1)
-        return false;
-    return LocInfo(*ValueLocation(inst)).values.size() == 1;
-}
-
-void RegAlloc::SpillRegister(HostLoc loc) {
-    ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled");
-    ASSERT_MSG(IsRegisterOccupied(loc), "There is no need to spill unoccupied registers");
-    ASSERT_MSG(!IsRegisterAllocated(loc), "Registers that have been allocated must not be spilt");
-
-    HostLoc new_loc = FindFreeSpill();
-
-    EmitMove(new_loc, loc);
-
-    LocInfo(new_loc) = LocInfo(loc);
-    LocInfo(loc) = {};
-}
-
-HostLoc RegAlloc::FindFreeSpill() const {
-    for (size_t i = 0; i < SpillCount; i++)
-        if (!IsRegisterOccupied(HostLocSpill(i)))
-            return HostLocSpill(i);
-
-    ASSERT_MSG(false, "All spill locations are full");
-}
-
-void RegAlloc::EndOfAllocScope() {
-    for (auto& iter : hostloc_info) {
-        iter.is_being_used = false;
-        if (iter.def) {
-            iter.values.clear();
-            iter.values.emplace_back(iter.def);
-            iter.def = nullptr;
-        }
-        if (!iter.values.empty()) {
-            auto to_erase = std::remove_if(iter.values.begin(), iter.values.end(),
-                                           [](const auto& inst){ return !inst->HasUses(); });
-            iter.values.erase(to_erase, iter.values.end());
-        }
-    }
-}
-
-void RegAlloc::AssertNoMoreUses() {
-    ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i){ return i.values.empty(); }));
-}
-
-void RegAlloc::Reset() {
-    hostloc_info.fill({});
-}
-
-void RegAlloc::EmitMove(HostLoc to, HostLoc from) {
-    if (HostLocIsXMM(to) && HostLocIsSpill(from)) {
-        code->movsd(HostLocToXmm(to), SpillToOpArg(from));
-    } else if (HostLocIsSpill(to) && HostLocIsXMM(from)) {
-        code->movsd(SpillToOpArg(to), HostLocToXmm(from));
-    } else if (HostLocIsXMM(to) && HostLocIsXMM(from)) {
-        code->movaps(HostLocToXmm(to), HostLocToXmm(from));
-    } else if (HostLocIsGPR(to) && HostLocIsSpill(from)) {
-        code->mov(HostLocToReg64(to), SpillToOpArg(from));
-    } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) {
-        code->mov(SpillToOpArg(to), HostLocToReg64(from));
-    } else if (HostLocIsGPR(to) && HostLocIsGPR(from)){
-        code->mov(HostLocToReg64(to), HostLocToReg64(from));
-    } else {
-        ASSERT_MSG(false, "Invalid RegAlloc::EmitMove");
-    }
-}
-
-void RegAlloc::EmitExchange(HostLoc a, HostLoc b) {
-    if (HostLocIsGPR(a) && HostLocIsGPR(b)) {
-        code->xchg(HostLocToReg64(a), HostLocToReg64(b));
-    } else if (HostLocIsXMM(a) && HostLocIsXMM(b)) {
-        ASSERT_MSG(false, "Exchange is unnecessary for XMM registers");
-    } else {
-        ASSERT_MSG(false, "Invalid RegAlloc::EmitExchange");
-    }
-}
-
-std::tuple<HostLoc, bool> RegAlloc::UseHostLoc(IR::Inst* use_inst, HostLocList desired_locations) {
-    DEBUG_ASSERT(std::all_of(desired_locations.begin(), desired_locations.end(), HostLocIsRegister));
-    DEBUG_ASSERT_MSG(ValueLocation(use_inst), "use_inst has not been defined");
-
-    HostLoc current_location = *ValueLocation(use_inst);
-    auto iter = std::find(desired_locations.begin(), desired_locations.end(), current_location);
-    if (iter != desired_locations.end()) {
-        if (LocInfo(current_location).IsDef()) {
-            HostLoc new_location = SelectARegister(desired_locations);
-            if (IsRegisterOccupied(new_location)) {
-                SpillRegister(new_location);
-            }
-            EmitMove(new_location, current_location);
-            LocInfo(new_location).is_being_used = true;
-            LocInfo(new_location).values.emplace_back(use_inst);
-            use_inst->DecrementRemainingUses();
-            DEBUG_ASSERT(LocInfo(new_location).IsUse());
-            return std::make_tuple(new_location, false);
-        } else {
-            bool was_being_used = LocInfo(current_location).is_being_used;
-            ASSERT(LocInfo(current_location).IsUse() || LocInfo(current_location).IsIdle());
-            LocInfo(current_location).is_being_used = true;
-            use_inst->DecrementRemainingUses();
-            DEBUG_ASSERT(LocInfo(current_location).IsUse());
-            return std::make_tuple(current_location, was_being_used);
-        }
+    if (use_inst.IsImmediate()) {
+        HostLoc location = ScratchImpl(any_gpr);
+        DefineValueImpl(def_inst, location);
+        LoadImmediate(use_inst, location);
+        return;
     }
 
-    if (HostLocIsSpill(current_location)) {
-        bool was_being_used = LocInfo(current_location).is_being_used;
-        LocInfo(current_location).is_being_used = true;
-        use_inst->DecrementRemainingUses();
-        DEBUG_ASSERT(LocInfo(current_location).IsUse());
-        return std::make_tuple(current_location, was_being_used);
-    } else if (HostLocIsRegister(current_location)) {
-        HostLoc new_location = SelectARegister(desired_locations);
-        ASSERT(LocInfo(current_location).IsIdle());
-        EmitExchange(new_location, current_location);
-        std::swap(LocInfo(new_location), LocInfo(current_location));
-        LocInfo(new_location).is_being_used = true;
-        use_inst->DecrementRemainingUses();
-        DEBUG_ASSERT(LocInfo(new_location).IsUse());
-        return std::make_tuple(new_location, false);
-    }
-
-    ASSERT_MSG(false, "Invalid current_location");
-    return std::make_tuple(static_cast<HostLoc>(-1), false);
+    ASSERT_MSG(ValueLocation(use_inst.GetInst()), "use_inst must already be defined");
+    HostLoc location = *ValueLocation(use_inst.GetInst());
+    DefineValueImpl(def_inst, location);
 }
 
-HostLoc RegAlloc::LoadImmediateIntoHostLocReg(IR::Value imm, HostLoc host_loc) {
+HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) {
     ASSERT_MSG(imm.IsImmediate(), "imm is not an immediate");
 
     Xbyak::Reg64 reg = HostLocToReg64(host_loc);
@@ -487,5 +394,76 @@ HostLoc RegAlloc::LoadImmediateIntoHostLocReg(IR::Value imm, HostLoc host_loc)
     return host_loc;
 }
 
+void RegAlloc::Move(HostLoc to, HostLoc from) {
+    ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked());
+
+    if (LocInfo(from).IsEmpty()) {
+        return;
+    }
+
+    LocInfo(to) = LocInfo(from);
+    LocInfo(from) = {};
+
+    EmitMove(code, to, from);
+}
+
+void RegAlloc::CopyToScratch(HostLoc to, HostLoc from) {
+    ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsEmpty());
+
+    EmitMove(code, to, from);
+}
+
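+// Exchange degenerates to a simple Move whenever one side is empty; a real
+// GPR<->GPR exchange uses xchg, and XMM<->XMM exchanges are deliberately
+// unsupported (see EmitExchange above).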
+void RegAlloc::Exchange(HostLoc a, HostLoc b) {
+    ASSERT(!LocInfo(a).IsLocked() && !LocInfo(b).IsLocked());
+
+    if (LocInfo(a).IsEmpty()) {
+        Move(a, b);
+        return;
+    }
+
+    if (LocInfo(b).IsEmpty()) {
+        Move(b, a);
+        return;
+    }
+
+    std::swap(LocInfo(a), LocInfo(b));
+
+    EmitExchange(code, a, b);
+}
+
+void RegAlloc::MoveOutOfTheWay(HostLoc reg) {
+    ASSERT(!LocInfo(reg).IsLocked());
+    if (!LocInfo(reg).IsEmpty()) {
+        SpillRegister(reg);
+    }
+}
+
+void RegAlloc::SpillRegister(HostLoc loc) {
+    ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled");
+    ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers");
+    ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt");
+
+    HostLoc new_loc = FindFreeSpill();
+    Move(new_loc, loc);
+}
+
+HostLoc RegAlloc::FindFreeSpill() const {
+    for (size_t i = 0; i < SpillCount; i++)
+        if (LocInfo(HostLocSpill(i)).IsEmpty())
+            return HostLocSpill(i);
+
+    ASSERT_MSG(false, "All spill locations are full");
+}
+
+HostLocInfo& RegAlloc::LocInfo(HostLoc loc) {
+    ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
+    return hostloc_info[static_cast<size_t>(loc)];
+}
+
+const HostLocInfo& RegAlloc::LocInfo(HostLoc loc) const {
+    ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
+    return hostloc_info[static_cast<size_t>(loc)];
+}
+
 } // namespace BackendX64
 } // namespace Dynarmic
diff --git a/src/backend_x64/reg_alloc.h b/src/backend_x64/reg_alloc.h
index d2e69b62..6a54819f 100644
--- a/src/backend_x64/reg_alloc.h
+++ b/src/backend_x64/reg_alloc.h
@@ -22,62 +22,78 @@
 namespace Dynarmic {
 namespace BackendX64 {
 
+class RegAlloc;
+
+struct HostLocInfo {
+public:
+    bool IsLocked() const;
+    bool IsEmpty() const;
+    bool IsLastUse() const;
+
+    bool ContainsValue(const IR::Inst* inst) const;
+
+    void ReadLock();
+    void WriteLock();
+
+    void AddValue(IR::Inst* inst);
+
+    void EndOfAllocScope();
+
+private:
+    std::vector<IR::Inst*> values;
+    bool is_being_used = false;
+    bool is_scratch = false;
+};
+
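+// An Argument is a one-shot proxy for a single operand of the instruction
+// currently being emitted; "allocated" guards against handing the same
+// Argument to more than one Use*/DefineValue call.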
+struct Argument {
+public:
+    IR::Type GetType() const;
+    bool IsImmediate() const;
+
+    bool GetImmediateU1() const;
+    u8 GetImmediateU8() const;
+    u16 GetImmediateU16() const;
+    u32 GetImmediateU32() const;
+    u64 GetImmediateU64() const;
+
+    /// Is this value currently in a GPR?
+    bool IsInGpr() const;
+    /// Is this value currently in a XMM?
+    bool IsInXmm() const;
+    /// Is this value currently in memory?
+    bool IsInMemory() const;
+
+private:
+    friend class RegAlloc;
+    Argument(RegAlloc& reg_alloc) : reg_alloc(reg_alloc) {}
+
+    bool allocated = false;
+    RegAlloc& reg_alloc;
+    IR::Value value;
+};
+
 class RegAlloc final {
 public:
     explicit RegAlloc(BlockOfCode* code) : code(code) {}
 
-    /// Late-def
-    Xbyak::Reg64 DefGpr(IR::Inst* def_inst, HostLocList desired_locations = any_gpr) {
-        return HostLocToReg64(DefHostLocReg(def_inst, desired_locations));
-    }
-    Xbyak::Xmm DefXmm(IR::Inst* def_inst, HostLocList desired_locations = any_xmm) {
-        return HostLocToXmm(DefHostLocReg(def_inst, desired_locations));
-    }
-    void RegisterAddDef(IR::Inst* def_inst, const IR::Value& use_inst);
-    /// Early-use, Late-def
-    Xbyak::Reg64 UseDefGpr(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations = any_gpr) {
-        return HostLocToReg64(UseDefHostLocReg(use_value, def_inst, desired_locations));
-    }
-    Xbyak::Xmm UseDefXmm(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations = any_xmm) {
-        return HostLocToXmm(UseDefHostLocReg(use_value, def_inst, desired_locations));
-    }
-    std::tuple<OpArg, Xbyak::Reg64> UseDefOpArgGpr(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations = any_gpr) {
-        OpArg op;
-        HostLoc host_loc;
-        std::tie(op, host_loc) = UseDefOpArgHostLocReg(use_value, def_inst, desired_locations);
-        return std::make_tuple(op, HostLocToReg64(host_loc));
-    }
-    std::tuple<OpArg, Xbyak::Xmm> UseDefOpArgXmm(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations = any_xmm) {
-        OpArg op;
-        HostLoc host_loc;
-        std::tie(op, host_loc) = UseDefOpArgHostLocReg(use_value, def_inst, desired_locations);
-        return std::make_tuple(op, HostLocToXmm(host_loc));
-    }
-    /// Early-use
-    Xbyak::Reg64 UseGpr(IR::Value use_value, HostLocList desired_locations = any_gpr) {
-        return HostLocToReg64(UseHostLocReg(use_value, desired_locations));
-    }
-    Xbyak::Xmm UseXmm(IR::Value use_value, HostLocList desired_locations = any_xmm) {
-        return HostLocToXmm(UseHostLocReg(use_value, desired_locations));
-    }
-    OpArg UseOpArg(IR::Value use_value, HostLocList desired_locations);
-    /// Early-use, Destroyed
-    Xbyak::Reg64 UseScratchGpr(IR::Value use_value, HostLocList desired_locations = any_gpr) {
-        return HostLocToReg64(UseScratchHostLocReg(use_value, desired_locations));
-    }
-    Xbyak::Xmm UseScratchXmm(IR::Value use_value, HostLocList desired_locations = any_xmm) {
-        return HostLocToXmm(UseScratchHostLocReg(use_value, desired_locations));
-    }
-    /// Early-def, Late-use, single-use
-    Xbyak::Reg64 ScratchGpr(HostLocList desired_locations = any_gpr) {
-        return HostLocToReg64(ScratchHostLocReg(desired_locations));
-    }
-    Xbyak::Xmm ScratchXmm(HostLocList desired_locations = any_xmm) {
-        return HostLocToXmm(ScratchHostLocReg(desired_locations));
-    }
+    std::array<Argument, 3> GetArgumentInfo(IR::Inst* inst);
 
-    /// Late-def for result register, Early-use for all arguments, Each value is placed into registers according to host ABI.
-    void HostCall(IR::Inst* result_def = nullptr, IR::Value arg0_use = {}, IR::Value arg1_use = {}, IR::Value arg2_use = {}, IR::Value arg3_use = {});
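+    // A Use* call locks the argument's current location for the rest of the
+    // instruction and consumes the Argument; the UseScratch* variants also
+    // grant exclusive, writable ownership of the returned register.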
+    Xbyak::Reg64 UseGpr(Argument& arg);
+    Xbyak::Xmm UseXmm(Argument& arg);
+    OpArg UseOpArg(Argument& arg);
+    void Use(Argument& arg, HostLoc host_loc);
+
+    Xbyak::Reg64 UseScratchGpr(Argument& arg);
+    Xbyak::Xmm UseScratchXmm(Argument& arg);
+    void UseScratch(Argument& arg, HostLoc host_loc);
+
+    void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg);
+    void DefineValue(IR::Inst* inst, Argument& arg);
+
+    Xbyak::Reg64 ScratchGpr(HostLocList desired_locations = any_gpr);
+    Xbyak::Xmm ScratchXmm(HostLocList desired_locations = any_xmm);
+
+    void HostCall(IR::Inst* result_def = nullptr, boost::optional<Argument&> arg0 = {}, boost::optional<Argument&> arg1 = {}, boost::optional<Argument&> arg2 = {}, boost::optional<Argument&> arg3 = {});
 
     // TODO: Values in host flags
 
@@ -85,65 +101,32 @@ public:
 
     void AssertNoMoreUses();
 
-    void Reset();
-
 private:
+    friend struct Argument;
+
     HostLoc SelectARegister(HostLocList desired_locations) const;
     boost::optional<HostLoc> ValueLocation(const IR::Inst* value) const;
-    bool IsRegisterOccupied(HostLoc loc) const;
-    bool IsRegisterAllocated(HostLoc loc) const;
-    bool IsLastUse(const IR::Inst* inst) const;
 
-    HostLoc DefHostLocReg(IR::Inst* def_inst, HostLocList desired_locations);
-    HostLoc UseDefHostLocReg(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations);
-    HostLoc UseDefHostLocReg(IR::Inst* use_inst, IR::Inst* def_inst, HostLocList desired_locations);
-    std::tuple<OpArg, HostLoc> UseDefOpArgHostLocReg(IR::Value use_value, IR::Inst* def_inst, HostLocList desired_locations);
-    HostLoc UseHostLocReg(IR::Value use_value, HostLocList desired_locations);
-    HostLoc UseHostLocReg(IR::Inst* use_inst, HostLocList desired_locations);
-    std::tuple<HostLoc, bool> UseHostLoc(IR::Inst* use_inst, HostLocList desired_locations);
-    HostLoc UseScratchHostLocReg(IR::Value use_value, HostLocList desired_locations);
-    HostLoc UseScratchHostLocReg(IR::Inst* use_inst, HostLocList desired_locations);
-    HostLoc ScratchHostLocReg(HostLocList desired_locations);
+    HostLoc UseImpl(IR::Value use_value, HostLocList desired_locations);
+    HostLoc UseScratchImpl(IR::Value use_value, HostLocList desired_locations);
+    HostLoc ScratchImpl(HostLocList desired_locations);
+    void DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc);
+    void DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst);
 
-    void EmitMove(HostLoc to, HostLoc from);
-    void EmitExchange(HostLoc a, HostLoc b);
-    HostLoc LoadImmediateIntoHostLocReg(IR::Value imm, HostLoc reg);
+    BlockOfCode* code = nullptr;
+
+    HostLoc LoadImmediate(IR::Value imm, HostLoc reg);
+    void Move(HostLoc to, HostLoc from);
+    void CopyToScratch(HostLoc to, HostLoc from);
+    void Exchange(HostLoc a, HostLoc b);
+    void MoveOutOfTheWay(HostLoc reg);
 
     void SpillRegister(HostLoc loc);
    HostLoc FindFreeSpill() const;
 
-    BlockOfCode* code = nullptr;
-
-    struct HostLocInfo {
-        std::vector<IR::Inst*> values; // early value
-        IR::Inst* def = nullptr; // late value
-        bool is_being_used = false;
-
-        bool IsIdle() const {
-            return !is_being_used;
-        }
-        bool IsScratch() const {
-            return is_being_used && !def && values.empty();
-        }
-        bool IsUse() const {
-            return is_being_used && !def && !values.empty();
-        }
-        bool IsDef() const {
-            return is_being_used && def && values.empty();
-        }
-        bool IsUseDef() const {
-            return is_being_used && def && !values.empty();
-        }
-    };
     std::array<HostLocInfo, HostLocCount> hostloc_info;
-    HostLocInfo& LocInfo(HostLoc loc) {
-        DEBUG_ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
-        return hostloc_info[static_cast<size_t>(loc)];
-    }
-    const HostLocInfo& LocInfo(HostLoc loc) const {
-        DEBUG_ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
-        return hostloc_info[static_cast<size_t>(loc)];
-    }
+    HostLocInfo& LocInfo(HostLoc loc);
+    const HostLocInfo& LocInfo(HostLoc loc) const;
 };
 
 } // namespace BackendX64
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index b5d89188..7ed7dc34 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -255,13 +255,13 @@ Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) {
     // This is faster than doing a search through the block.
     switch (opcode) {
     case IR::Opcode::GetCarryFromOp:
-        DEBUG_ASSERT(!carry_inst || carry_inst->GetOpcode() == Opcode::GetCarryFromOp);
+        ASSERT(!carry_inst || carry_inst->GetOpcode() == Opcode::GetCarryFromOp);
         return carry_inst;
     case IR::Opcode::GetOverflowFromOp:
-        DEBUG_ASSERT(!overflow_inst || overflow_inst->GetOpcode() == Opcode::GetOverflowFromOp);
+        ASSERT(!overflow_inst || overflow_inst->GetOpcode() == Opcode::GetOverflowFromOp);
         return overflow_inst;
     case IR::Opcode::GetGEFromOp:
-        DEBUG_ASSERT(!ge_inst || ge_inst->GetOpcode() == Opcode::GetGEFromOp);
+        ASSERT(!ge_inst || ge_inst->GetOpcode() == Opcode::GetGEFromOp);
         return ge_inst;
     default:
         break;
@@ -278,15 +278,15 @@ Type Inst::GetType() const {
 }
 
 Value Inst::GetArg(size_t index) const {
-    DEBUG_ASSERT(index < GetNumArgsOf(op));
-    DEBUG_ASSERT(!args[index].IsEmpty());
+    ASSERT(index < GetNumArgsOf(op));
+    ASSERT(!args[index].IsEmpty());
 
     return args[index];
 }
 
 void Inst::SetArg(size_t index, Value value) {
-    DEBUG_ASSERT(index < GetNumArgsOf(op));
-    DEBUG_ASSERT(AreTypesCompatible(value.GetType(), GetArgTypeOf(op, index)));
+    ASSERT(index < GetNumArgsOf(op));
+    ASSERT(AreTypesCompatible(value.GetType(), GetArgTypeOf(op, index)));
 
     if (!args[index].IsImmediate()) {
         UndoUse(args[index]);
@@ -346,15 +346,15 @@ void Inst::UndoUse(const Value& value) {
 
     switch (op){
     case Opcode::GetCarryFromOp:
-        DEBUG_ASSERT(value.GetInst()->carry_inst->GetOpcode() == Opcode::GetCarryFromOp);
+        ASSERT(value.GetInst()->carry_inst->GetOpcode() == Opcode::GetCarryFromOp);
         value.GetInst()->carry_inst = nullptr;
         break;
     case Opcode::GetOverflowFromOp:
-        DEBUG_ASSERT(value.GetInst()->overflow_inst->GetOpcode() == Opcode::GetOverflowFromOp);
+        ASSERT(value.GetInst()->overflow_inst->GetOpcode() == Opcode::GetOverflowFromOp);
         value.GetInst()->overflow_inst = nullptr;
         break;
     case Opcode::GetGEFromOp:
-        DEBUG_ASSERT(value.GetInst()->ge_inst->GetOpcode() == Opcode::GetGEFromOp);
+        ASSERT(value.GetInst()->ge_inst->GetOpcode() == Opcode::GetGEFromOp);
         value.GetInst()->ge_inst = nullptr;
         break;
     default:
diff --git a/src/frontend/ir/value.cpp b/src/frontend/ir/value.cpp
index 8ade5af2..50ea6e25 100644
--- a/src/frontend/ir/value.cpp
+++ b/src/frontend/ir/value.cpp
@@ -69,59 +69,59 @@ Type Value::GetType() const {
 }
 
 Arm::Reg Value::GetRegRef() const {
-    DEBUG_ASSERT(type == Type::RegRef);
+    ASSERT(type == Type::RegRef);
     return inner.imm_regref;
 }
 
 Arm::ExtReg Value::GetExtRegRef() const {
-    DEBUG_ASSERT(type == Type::ExtRegRef);
+    ASSERT(type == Type::ExtRegRef);
     return inner.imm_extregref;
 }
 
 Inst* Value::GetInst() const {
-    DEBUG_ASSERT(type == Type::Opaque);
+    ASSERT(type == Type::Opaque);
     return inner.inst;
 }
 
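+// Note: each getter below first looks through an Identity operation, so an
+// immediate wrapped in Identity still reads back as an immediate.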
 bool Value::GetU1() const {
     if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity)
         return inner.inst->GetArg(0).GetU1();
-    DEBUG_ASSERT(type == Type::U1);
+    ASSERT(type == Type::U1);
     return inner.imm_u1;
 }
 
 u8 Value::GetU8() const {
     if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity)
         return inner.inst->GetArg(0).GetU8();
-    DEBUG_ASSERT(type == Type::U8);
+    ASSERT(type == Type::U8);
     return inner.imm_u8;
 }
 
 u16 Value::GetU16() const {
     if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity)
         return inner.inst->GetArg(0).GetU16();
-    DEBUG_ASSERT(type == Type::U16);
+    ASSERT(type == Type::U16);
     return inner.imm_u16;
 }
 
 u32 Value::GetU32() const {
     if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity)
         return inner.inst->GetArg(0).GetU32();
-    DEBUG_ASSERT(type == Type::U32);
+    ASSERT(type == Type::U32);
     return inner.imm_u32;
 }
 
 u64 Value::GetU64() const {
     if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity)
         return inner.inst->GetArg(0).GetU64();
-    DEBUG_ASSERT(type == Type::U64);
+    ASSERT(type == Type::U64);
     return inner.imm_u64;
 }
 
 std::array<u8, 8> Value::GetCoprocInfo() const {
     if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity)
         return inner.inst->GetArg(0).GetCoprocInfo();
-    DEBUG_ASSERT(type == Type::CoprocInfo);
+    ASSERT(type == Type::CoprocInfo);
     return inner.imm_coproc;
 }