JIT: Clean up float loads and stores.

Less code is good, and this should make future changes to memory handling
easier.
magumagu 2014-05-30 21:09:19 -07:00
parent 07da9cbcf4
commit 06864e9fee
6 changed files with 57 additions and 249 deletions
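For orientation: the cleanup below drops the per-instruction SSSE3/fastmem special cases and funnels every float load and store through the generic Safe* helpers, which the EmuCodeBlock hunks teach to handle 64-bit accesses. A minimal sketch of the resulting shape, assembled from the added lines in the hunks that follow:

    // Float loads (lfs/lfsx): integer-side load, then convert under MEMCHECK.
    SafeLoadToReg(EAX, gpr.R(a), 32, offset, RegistersInUse(), false);
    fpr.BindToRegister(d, js.memcheck);
    MEMCHECK_START
    ConvertSingleToDouble(fpr.RX(d), EAX, true);
    MEMCHECK_END

    // Float stores (stfs/stfsx): convert, then one shared store helper.
    ConvertDoubleToSingle(XMM0, fpr.RX(s));
    SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, RegistersInUse());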


@@ -1286,9 +1286,7 @@ void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) {
 }
 void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) {
-    if (arg.IsSimpleReg())
-        PanicAlert("Emitter: MOVQ_xmm doesn't support single registers as destination");
-    if (src > 7)
+    if (src > 7 || arg.IsSimpleReg())
     {
         // Alternate encoding
         // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
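This emitter change is what the rewritten float stores lean on: with arg.IsSimpleReg() folded into the alternate-encoding branch, MOVQ_xmm accepts a plain register destination instead of raising the PanicAlert, so the new stfd further down can move an XMM value straight into a GPR:

    MOVQ_xmm(R(RAX), fpr.RX(s));  // low 64 bits of the FPR into RAX, no memory round-trip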


@@ -88,7 +88,7 @@ static GekkoOPTemplate primarytable[] =
     {51, &Jit64::FallBackToInterpreter}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
     {52, &Jit64::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
-    {53, &Jit64::stfs}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
+    {53, &Jit64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
     {54, &Jit64::stfd}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
     {55, &Jit64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},


@@ -2,9 +2,6 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
-// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only.
-// Should give a very noticeable speed boost to paired single heavy code.
 #include "Common/Common.h"
 #include "Common/CPUDetect.h"
@@ -12,20 +9,8 @@
 #include "Core/PowerPC/Jit64/JitAsm.h"
 #include "Core/PowerPC/Jit64/JitRegCache.h"
-namespace {
-// pshufb todo: MOVQ
-const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15};
-const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
-u64 GC_ALIGNED16(temp64);
-}
-// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
-// and pshufb could help a lot.
-// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
 void Jit64::lfs(UGeckoInstruction inst)
 {
@@ -40,12 +25,11 @@ void Jit64::lfs(UGeckoInstruction inst)
     SafeLoadToReg(EAX, gpr.R(a), 32, offset, RegistersInUse(), false);
-    MEMCHECK_START
     fpr.Lock(d);
-    fpr.BindToRegister(d, false);
-    ConvertSingleToDouble(fpr.RX(d), EAX, true);
+    fpr.BindToRegister(d, js.memcheck);
+    MEMCHECK_START
+    ConvertSingleToDouble(fpr.RX(d), EAX, true);
     MEMCHECK_END
     fpr.UnlockAll();
@@ -56,61 +40,23 @@ void Jit64::lfd(UGeckoInstruction inst)
 {
     INSTRUCTION_START
     JITDISABLE(bJITLoadStoreFloatingOff);
-    FALLBACK_IF(js.memcheck || !inst.RA);
+    FALLBACK_IF(!inst.RA);
     int d = inst.RD;
     int a = inst.RA;
     s32 offset = (s32)(s16)inst.SIMM_16;
-    gpr.FlushLockX(ABI_PARAM1);
-    gpr.Lock(a);
-    MOV(32, R(ABI_PARAM1), gpr.R(a));
-    // TODO - optimize. This has to load the previous value - upper double should stay unmodified.
+    SafeLoadToReg(RAX, gpr.R(a), 64, offset, RegistersInUse(), false);
     fpr.Lock(d);
     fpr.BindToRegister(d, true);
-    X64Reg xd = fpr.RX(d);
-    if (cpu_info.bSSSE3)
-    {
-#if _M_X86_64
-        MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-#else
-        AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
-        MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
-#endif
-        PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
-        MOVSD(xd, R(XMM0));
-    } else {
-#if _M_X86_64
-        LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-        MOV(64, M(&temp64), R(EAX));
-        MEMCHECK_START
-        MOVSD(XMM0, M(&temp64));
-        MOVSD(xd, R(XMM0));
-        MEMCHECK_END
-#else
-        AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
-        MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
-        BSWAP(32, EAX);
-        MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));
-        MEMCHECK_START
-        MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
-        BSWAP(32, EAX);
-        MOV(32, M(&temp64), R(EAX));
-        MOVSD(XMM0, M(&temp64));
-        MOVSD(xd, R(XMM0));
-        MEMCHECK_END
-#endif
-    }
-    gpr.UnlockAll();
-    gpr.UnlockAllX();
+    MEMCHECK_START
+    MOVQ_xmm(XMM0, R(RAX));
+    MOVSD(fpr.RX(d), R(XMM0));
+    MEMCHECK_END
     fpr.UnlockAll();
 }
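The new lfd sequence stays correct for paired singles because MOVSD between registers writes only the low quadword. A sketch of what it computes, assuming Dolphin's PS0/PS1 register layout:

    // RAX  = byteswapped u64 from memory   (SafeLoadToReg, accessSize 64)
    // XMM0 = RAX                           (MOVQ_xmm: GPR -> low half of XMM)
    // fd[63:0] = XMM0[63:0]                (MOVSD reg,reg; fd's upper half, PS1, is untouched)

which is also why fpr.BindToRegister(d, true) still loads the register's previous value.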
@@ -119,146 +65,49 @@ void Jit64::stfd(UGeckoInstruction inst)
 {
     INSTRUCTION_START
     JITDISABLE(bJITLoadStoreFloatingOff);
-    FALLBACK_IF(js.memcheck || !inst.RA);
+    FALLBACK_IF(!inst.RA);
     int s = inst.RS;
     int a = inst.RA;
-    u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
-    if (Core::g_CoreStartupParameter.bMMU ||
-        Core::g_CoreStartupParameter.bTLBHack) {
-        mem_mask |= Memory::ADDR_MASK_MEM1;
-    }
-#ifdef ENABLE_MEM_CHECK
-    if (Core::g_CoreStartupParameter.bEnableDebugging)
-    {
-        mem_mask |= Memory::EXRAM_MASK;
-    }
-#endif
     gpr.FlushLockX(ABI_PARAM1);
     gpr.Lock(a);
     fpr.Lock(s);
     gpr.BindToRegister(a, true, false);
+    MOV(32, R(ABI_PARAM1), gpr.R(a));
+    if (fpr.R(s).IsSimpleReg())
+        MOVQ_xmm(R(RAX), fpr.RX(s));
+    else
+        MOV(64, R(RAX), fpr.R(s));
     s32 offset = (s32)(s16)inst.SIMM_16;
-    LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
-    TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
-    FixupBranch safe = J_CC(CC_NZ);
-    // Fast routine
-    if (cpu_info.bSSSE3) {
-        MOVAPD(XMM0, fpr.R(s));
-        PSHUFB(XMM0, M((void*)bswapShuffle1x8));
-#if _M_X86_64
-        MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
-#else
-        AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
-        MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
-#endif
-    } else {
-        MOVAPD(XMM0, fpr.R(s));
-        MOVD_xmm(R(EAX), XMM0);
-        UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);
-        PSRLQ(XMM0, 32);
-        MOVD_xmm(R(EAX), XMM0);
-        UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
-    }
-    FixupBranch exit = J(true);
-    SetJumpTarget(safe);
-    // Safe but slow routine
-    MOVAPD(XMM0, fpr.R(s));
-    PSRLQ(XMM0, 32);
-    MOVD_xmm(R(EAX), XMM0);
-    SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));
-    MOVAPD(XMM0, fpr.R(s));
-    MOVD_xmm(R(EAX), XMM0);
-    LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
-    SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());
-    SetJumpTarget(exit);
+    SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, RegistersInUse());
     gpr.UnlockAll();
     gpr.UnlockAllX();
     fpr.UnlockAll();
 }
-// In Release on 32bit build,
-// this seemed to cause a problem with PokePark2
-// at start after talking to first pokemon,
-// you run and smash a box, then he goes on about
-// following him and then you cant do anything.
-// I have enabled interpreter for this function
-// in the mean time.
-// Parlane
 void Jit64::stfs(UGeckoInstruction inst)
 {
     INSTRUCTION_START
     JITDISABLE(bJITLoadStoreFloatingOff);
-    bool update = inst.OPCD & 1;
+    FALLBACK_IF(!inst.RA);
     int s = inst.RS;
     int a = inst.RA;
     s32 offset = (s32)(s16)inst.SIMM_16;
-    FALLBACK_IF(!a || update);
     fpr.BindToRegister(s, true, false);
     ConvertDoubleToSingle(XMM0, fpr.RX(s));
-    if (gpr.R(a).IsImm())
-    {
-        u32 addr = (u32)(gpr.R(a).offset + offset);
-        if (Memory::IsRAMAddress(addr))
-        {
-            if (cpu_info.bSSSE3) {
-                PSHUFB(XMM0, M((void *)bswapShuffle1x4));
-                WriteFloatToConstRamAddress(XMM0, addr);
-                return;
-            }
-        }
-        else if (addr == 0xCC008000)
-        {
-            // Float directly to write gather pipe! Fun!
-            CALL((void*)asm_routines.fifoDirectWriteFloat);
-            // TODO
-            js.fifoBytesThisBlock += 4;
-            return;
-        }
-    }
-    gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
-    gpr.Lock(a);
-    MOV(32, R(ABI_PARAM2), gpr.R(a));
-    ADD(32, R(ABI_PARAM2), Imm32(offset));
-    if (update && offset)
-    {
-        // We must flush immediate values from the following register because
-        // it may take another value at runtime if no MMU exception has been raised
-        gpr.KillImmediate(a, true, true);
-        MEMCHECK_START
-        MOV(32, gpr.R(a), R(ABI_PARAM2));
-        MEMCHECK_END
-    }
-    SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
-    gpr.UnlockAll();
-    gpr.UnlockAllX();
+    gpr.FlushLockX(ABI_PARAM1);
+    MOV(32, R(ABI_PARAM1), gpr.R(a));
+    SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, RegistersInUse());
     fpr.UnlockAll();
+    gpr.UnlockAllX();
 }
 void Jit64::stfsx(UGeckoInstruction inst)
 {
     INSTRUCTION_START
     JITDISABLE(bJITLoadStoreFloatingOff);
     // We can take a shortcut here - it's not likely that a hardware access would use this instruction.
     gpr.FlushLockX(ABI_PARAM1);
     MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
     if (inst.RA)
@@ -268,14 +117,11 @@ void Jit64::stfsx(UGeckoInstruction inst)
     fpr.Lock(s);
     fpr.BindToRegister(s, true, false);
     ConvertDoubleToSingle(XMM0, fpr.RX(s));
-    MOVD_xmm(R(EAX), XMM0);
-    SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
-    gpr.UnlockAllX();
+    SafeWriteF32ToReg(XMM0, ABI_PARAM1, 0, RegistersInUse());
     fpr.UnlockAll();
+    gpr.UnlockAllX();
 }
 void Jit64::lfsx(UGeckoInstruction inst)
 {
     INSTRUCTION_START
@@ -283,30 +129,17 @@ void Jit64::lfsx(UGeckoInstruction inst)
     MOV(32, R(EAX), gpr.R(inst.RB));
     if (inst.RA)
     {
         ADD(32, R(EAX), gpr.R(inst.RA));
     }
+    SafeLoadToReg(EAX, R(EAX), 32, 0, RegistersInUse(), false);
     fpr.Lock(inst.RS);
-    fpr.BindToRegister(inst.RS, false);
-    X64Reg s = fpr.RX(inst.RS);
-    if (cpu_info.bSSSE3 && !js.memcheck) {
-#if _M_X86_32
-        AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
-        MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base));
-#else
-        MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0));
-#endif
-        PSHUFB(XMM0, M((void *)bswapShuffle1x4));
-        ConvertSingleToDouble(s, XMM0);
-    } else {
-        SafeLoadToReg(EAX, R(EAX), 32, 0, RegistersInUse(), false);
-        MEMCHECK_START
-        ConvertSingleToDouble(s, EAX, true);
-        MEMCHECK_END
-    }
+    fpr.BindToRegister(inst.RS, js.memcheck);
+    MEMCHECK_START
+    ConvertSingleToDouble(fpr.RX(inst.RS), EAX, true);
+    MEMCHECK_END
     fpr.UnlockAll();
     gpr.UnlockAllX();
 }
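stfs and stfsx now share a single helper. Per its definition later in this commit, SafeWriteF32ToReg expands to roughly:

    MOVD_xmm(R(EAX), xmm_value);                                          // raw f32 bits into EAX
    SafeWriteRegToReg(EAX, reg_addr, 32, offset, registersInUse, flags);  // byteswap + checked store

so every float store inherits the integer path's fastmem, MMU, and memcheck handling.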


@@ -266,7 +266,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
     // Easy!
     const u8* storeSingleFloat = AlignCode4();
-    SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
+    SafeWriteF32ToReg(XMM0, ECX, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
     RET();
     /*
     if (cpu_info.bSSSE3) {


@@ -101,7 +101,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
     if (accessSize == 8 && signExtend)
         MOVSX(32, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
     else
-        MOVZX(32, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
+        MOVZX(64, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
 }
 else
 {
@@ -110,7 +110,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
     if (accessSize == 8 && signExtend)
         MOVSX(32, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
     else
-        MOVZX(32, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
+        MOVZX(64, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
 }
 #else
 if (opAddress.IsImm())
@@ -151,6 +151,10 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
     case 32:
         BSWAP(32, reg_value);
         break;
+    case 64:
+        BSWAP(64, reg_value);
+        break;
     }
     return result;
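These widenings are what make UnsafeLoadToReg usable for the new 64-bit lfd path. On x86-64 a 32-bit register write implicitly zero-extends into bits 63:32, so the emitter can presumably lower the 64/32 case to a plain 32-bit MOV while narrower sources still need a real MOVZX — a sketch of the intent:

    MOVZX(64, 32, reg, mem)  // -> mov r32, [mem]   ; upper 32 bits cleared implicitly
    MOVZX(64, 16, reg, mem)  // -> movzx r64, word [mem]

and BSWAP(64, reg_value) then swaps the full quadword for accessSize == 64.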
@@ -272,6 +276,8 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
     }
 }
+// Always clobbers EAX. Preserves the address.
+// Preserves the value if the load fails and js.memcheck is enabled.
 void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
 {
     if (!jit->js.memcheck)
@@ -325,7 +331,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
     {
         UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend);
     }
-    else if (!Core::g_CoreStartupParameter.bMMU && MMIO::IsMMIOAddress(address))
+    else if (!Core::g_CoreStartupParameter.bMMU && MMIO::IsMMIOAddress(address) && accessSize != 64)
     {
         MMIOLoadToReg(Memory::mmio_mapping, reg_value, registersInUse,
                       address, accessSize, signExtend);
@@ -335,6 +341,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
     ABI_PushRegistersAndAdjustStack(registersInUse, false);
     switch (accessSize)
     {
+    case 64: ABI_CallFunctionC((void *)&Memory::Read_U64, address); break;
     case 32: ABI_CallFunctionC((void *)&Memory::Read_U32, address); break;
     case 16: ABI_CallFunctionC((void *)&Memory::Read_U16_ZX, address); break;
     case 8:  ABI_CallFunctionC((void *)&Memory::Read_U8_ZX, address); break;
@@ -350,7 +357,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
     }
     else if (reg_value != EAX)
     {
-        MOVZX(32, accessSize, reg_value, R(EAX));
+        MOVZX(64, accessSize, reg_value, R(EAX));
     }
     MEMCHECK_END
@@ -372,6 +379,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
     ABI_PushRegistersAndAdjustStack(registersInUse, false);
     switch (accessSize)
     {
+    case 64: ABI_CallFunctionA((void *)&Memory::Read_U64, addr_loc); break;
     case 32: ABI_CallFunctionA((void *)&Memory::Read_U32, addr_loc); break;
     case 16: ABI_CallFunctionA((void *)&Memory::Read_U16_ZX, addr_loc); break;
     case 8:  ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc); break;
@@ -387,7 +395,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
     }
     else if (reg_value != EAX)
     {
-        MOVZX(32, accessSize, reg_value, R(EAX));
+        MOVZX(64, accessSize, reg_value, R(EAX));
     }
     MEMCHECK_END
@@ -490,6 +498,7 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
     ABI_PushRegistersAndAdjustStack(registersInUse, noProlog);
     switch (accessSize)
     {
+    case 64: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U64) : ((void *)&Memory::Write_U64_Swap), reg_value, reg_addr, false); break;
     case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, false); break;
     case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, false); break;
     case 8:  ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break;
@@ -501,43 +510,12 @@
     SetJumpTarget(exit);
 }
-void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr, u32 registersInUse, int flags)
+// Destroys both arg registers and EAX
+void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, u32 registersInUse, int flags)
 {
-    // FIXME
-    if (false && cpu_info.bSSSE3) {
-        // This path should be faster but for some reason it causes errors so I've disabled it.
-        u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
-        if (Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack)
-            mem_mask |= Memory::ADDR_MASK_MEM1;
-#ifdef ENABLE_MEM_CHECK
-        if (Core::g_CoreStartupParameter.bEnableDebugging)
-            mem_mask |= Memory::EXRAM_MASK;
-#endif
-        TEST(32, R(reg_addr), Imm32(mem_mask));
-        FixupBranch argh = J_CC(CC_Z);
-        MOVSS(M(&float_buffer), xmm_value);
-        LoadAndSwap(32, EAX, M(&float_buffer));
-        MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
-        ABI_PushRegistersAndAdjustStack(registersInUse, false);
-        ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, reg_addr);
-        ABI_PopRegistersAndAdjustStack(registersInUse, false);
-        FixupBranch arg2 = J();
-        SetJumpTarget(argh);
-        PSHUFB(xmm_value, M((void *)pbswapShuffle1x4));
-#if _M_X86_64
-        MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value);
-#else
-        AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
-        MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value);
-#endif
-        SetJumpTarget(arg2);
-    } else {
-        MOVSS(M(&float_buffer), xmm_value);
-        MOV(32, R(EAX), M(&float_buffer));
-        SafeWriteRegToReg(EAX, reg_addr, 32, 0, registersInUse, flags);
-    }
+    // TODO: PSHUFB might be faster if fastmem supported MOVSS.
+    MOVD_xmm(R(EAX), xmm_value);
+    SafeWriteRegToReg(EAX, reg_addr, 32, offset, registersInUse, flags);
 }
 void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
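The replacement body also drops the float_buffer round-trip: the removed slow path spilled the value with MOVSS and reloaded it into EAX through memory, whereas MOVD_xmm(R(EAX), xmm_value) moves the bits register to register. Threading offset through to SafeWriteRegToReg is what lets stfs fold its displacement into the store rather than emitting the separate ADD the removed stfs code needed.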


@@ -47,8 +47,7 @@ public:
     void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
     void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
-    // Trashes both inputs and EAX.
-    void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
+    void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
     void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
     void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);