Merge pull request #1216 from FioraAeterna/movoptimizations

Add more AVX support, refactor emitter, reduce redundant XMM moves
This commit is contained in:
skidau 2014-10-07 13:25:28 +11:00
commit 8fdf43109f
13 changed files with 488 additions and 395 deletions

View file

@ -1340,41 +1340,56 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
}
void XEmitter::WriteSSEOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
if (size == 64 && packed)
Write8(0x66); //this time, override goes upwards
if (!packed)
Write8(size == 64 ? 0xF2 : 0xF3);
if (opPrefix)
Write8(opPrefix);
arg.operandReg = regOp;
arg.WriteRex(this, 0, 0);
Write8(0x0F);
if (sseOp > 0xFF)
Write8((sseOp >> 8) & 0xFF);
Write8(sseOp & 0xFF);
if (op > 0xFF)
Write8((op >> 8) & 0xFF);
Write8(op & 0xFF);
arg.WriteRest(this, extrabytes);
}
void XEmitter::WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
WriteAVXOp(size, sseOp, packed, regOp, X64Reg::INVALID_REG, arg, extrabytes);
WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes);
}
void XEmitter::WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
static int GetVEXmmmmm(u16 op)
{
// Currently, only 0x38 and 0x3A are used as secondary escape byte.
if ((op >> 8) == 0x3A)
return 3;
else if ((op >> 8) == 0x38)
return 2;
else
return 1;
}
static int GetVEXpp(u8 opPrefix)
{
if (opPrefix == 0x66)
return 1;
else if (opPrefix == 0xF3)
return 2;
else if (opPrefix == 0xF2)
return 3;
else
return 0;
}
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
{
if (!cpu_info.bAVX)
PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
// Currently, only 0x38 and 0x3A are used as secondary escape byte.
int mmmmm;
if ((sseOp >> 8) == 0x3A)
mmmmm = 3;
else if ((sseOp >> 8) == 0x38)
mmmmm = 2;
else
mmmmm = 1;
int mmmmm = GetVEXmmmmm(op);
int pp = GetVEXpp(opPrefix);
// FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
arg.WriteVex(this, regOp1, regOp2, 0, (packed << 1) | (size == 64), mmmmm);
Write8(sseOp & 0xFF);
arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm);
Write8(op & 0xFF);
arg.WriteRest(this, extrabytes, regOp1);
}
@ -1383,21 +1398,8 @@ void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg r
{
if (size != 32 && size != 64)
PanicAlert("VEX GPR instructions only support 32-bit and 64-bit modes!");
int mmmmm, pp;
if ((op >> 8) == 0x3A)
mmmmm = 3;
else if ((op >> 8) == 0x38)
mmmmm = 2;
else
mmmmm = 1;
if (opPrefix == 0x66)
pp = 1;
else if (opPrefix == 0xF3)
pp = 2;
else if (opPrefix == 0xF2)
pp = 3;
else
pp = 0;
int mmmmm = GetVEXmmmmm(op);
int pp = GetVEXpp(opPrefix);
arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64);
Write8(op & 0xFF);
arg.WriteRest(this, extrabytes, regOp1);
@ -1419,8 +1421,8 @@ void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);}
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);}
void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
{
@ -1473,123 +1475,123 @@ void XEmitter::WriteMXCSR(OpArg arg, int ext)
void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);}
void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);}
void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTDQ, true, regOp, arg);}
void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVNTP, true, regOp, arg);}
void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTP, true, regOp, arg);}
void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);}
void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);}
void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);}
void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, false, regOp, arg);}
void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, false, regOp, arg);}
void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, false, regOp, arg);}
void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, false, regOp, arg);}
void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, false, regOp, arg,1); Write8(compare);}
void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, false, regOp, arg,1); Write8(compare);}
void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, false, regOp, arg);}
void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, false, regOp, arg);}
void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, false, regOp, arg);}
void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, false, regOp, arg);}
void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, false, regOp, arg);}
void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, false, regOp, arg);}
void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, false, regOp, arg);}
void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, false, regOp, arg);}
void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, false, regOp, arg);}
void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, false, regOp, arg);}
void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, false, regOp, arg);}
void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);}
void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);}
void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);}
void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);}
void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);}
void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);}
void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);}
void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);}
void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);}
void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);}
void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);}
void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);}
void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);}
void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);}
void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);}
void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, true, regOp, arg);}
void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, true, regOp, arg);}
void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, true, regOp, arg);}
void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, true, regOp, arg);}
void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, true, regOp, arg,1); Write8(compare);}
void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, true, regOp, arg,1); Write8(compare);}
void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseAND, true, regOp, arg);}
void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseAND, true, regOp, arg);}
void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseANDN, true, regOp, arg);}
void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseANDN, true, regOp, arg);}
void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseOR, true, regOp, arg);}
void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseOR, true, regOp, arg);}
void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseXOR, true, regOp, arg);}
void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseXOR, true, regOp, arg);}
void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, true, regOp, arg);}
void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, true, regOp, arg);}
void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, true, regOp, arg);}
void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, true, regOp, arg);}
void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, true, regOp, arg);}
void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, true, regOp, arg);}
void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, true, regOp, arg);}
void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, true, regOp, arg);}
void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, true, regOp, arg);}
void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, true, regOp, arg);}
void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);}
void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);}
void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);}
void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);}
void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);}
void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);}
void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);}
void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);}
void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);}
void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);}
void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);}
void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);}
void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);}
void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);}
void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);}
void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);}
void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);}
void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);}
void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);}
void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);}
void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);}
void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);}
void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);}
void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);}
void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);}
void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);}
void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);}
void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed
void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered
void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true, regOp, arg);} //unordered
void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);}
void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed
void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered
void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered
void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);}
void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);}
void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);}
void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);}
void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);}
void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);}
void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);}
void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);}
void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);}
void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);}
void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);}
void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);}
void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);}
void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);}
void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);}
void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, false, regOp, arg);}
void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, false, regOp, arg);}
void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, false, regOp, arg);}
void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, false, regOp, arg);}
void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);}
void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);}
void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);}
void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVLPDfromRM, false, regOp, arg);}
void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVHPDfromRM, false, regOp, arg);}
void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVLPDtoRM, false, regOp, arg);}
void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVHPDtoRM, false, regOp, arg);}
void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVLPDfromRM, regOp, arg);}
void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVHPDfromRM, regOp, arg);}
void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVLPDtoRM, regOp, arg);}
void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVHPDtoRM, regOp, arg);}
void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVHLPS, true, regOp1, R(regOp2));}
void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVLHPS, true, regOp1, R(regOp2));}
void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));}
void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));}
void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);}
void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);}
void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);}
void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);}
void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);}
void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);}
void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2D, false, regOp, arg);}
void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2D, false, regOp, arg);}
void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2A, false, regOp, arg);}
void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2A, false, regOp, arg);}
void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);}
void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);}
void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);}
void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);}
void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);}
void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);}
void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);}
void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);}
void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, false, regOp, arg);}
void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5B, true, regOp, arg);}
void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);}
void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);}
void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);}
void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);}
void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2C, false, regOp, arg);}
void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2C, false, regOp, arg);}
void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, false, regOp, arg);}
void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, true, regOp, arg);}
void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);}
void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);}
void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);}
void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);}
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));}
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x50, true, dest, arg);}
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x50, true, dest, arg);}
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(64, sseLDDQU, false, dest, arg);} // For integer data only
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
// THESE TWO ARE UNTESTED.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x14, true, dest, arg);}
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x15, true, dest, arg);}
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);}
void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);}
void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
{
if (cpu_info.bSSE3)
{
WriteSSEOp(64, 0x12, false, regOp, arg); //SSE3 movddup
WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup
}
else
{
@ -1603,53 +1605,52 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
//There are a few more left
// Also some integer instructions are missing
void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);}
void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);}
void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x67, true, dest, arg);}
void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);}
void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);}
void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);}
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, true, dest, arg);}
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);}
//void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);}
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);}
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);}
void XEmitter::PSRLW(X64Reg reg, int shift)
{
WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg));
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLD(X64Reg reg, int shift)
{
WriteSSEOp(64, 0x72, true, (X64Reg)2, R(reg));
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLQ(X64Reg reg, int shift)
{
WriteSSEOp(64, 0x73, true, (X64Reg)2, R(reg));
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
{
WriteSSEOp(64, 0xd3, true, reg, arg);
WriteSSEOp(0x66, 0xd3, reg, arg);
}
void XEmitter::PSLLW(X64Reg reg, int shift)
{
WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg));
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLD(X64Reg reg, int shift)
{
WriteSSEOp(64, 0x72, true, (X64Reg)6, R(reg));
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLQ(X64Reg reg, int shift)
{
WriteSSEOp(64, 0x73, true, (X64Reg)6, R(reg));
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
Write8(shift);
}
@ -1677,100 +1678,109 @@ void XEmitter::PSRAD(X64Reg reg, int shift)
Write8(shift);
}
void XEmitter::WriteSSSE3Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
if (!cpu_info.bSSSE3)
PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
WriteSSEOp(size, sseOp, packed, regOp, arg, extrabytes);
WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
}
void XEmitter::WriteSSE41Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
{
if (!cpu_info.bSSE4_1)
PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
WriteSSEOp(size, sseOp, packed, regOp, arg, extrabytes);
WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
}
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(64, 0x3800, true, dest, arg);}
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3817, true, dest, arg);}
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x382b, true, dest, arg);}
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3820, true, dest, arg);}
void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3821, true, dest, arg);}
void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3822, true, dest, arg);}
void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3823, true, dest, arg);}
void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3824, true, dest, arg);}
void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3825, true, dest, arg);}
void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3830, true, dest, arg);}
void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3831, true, dest, arg);}
void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3832, true, dest, arg);}
void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3833, true, dest, arg);}
void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3834, true, dest, arg);}
void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3835, true, dest, arg);}
void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);}
void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}
void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3810, true, dest, arg);}
void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3814, true, dest, arg);}
void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3815, true, dest, arg);}
void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDF, true, dest, arg);}
void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEF, true, dest, arg);}
void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEB, true, dest, arg);}
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);}
void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);}
void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);}
void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);}
void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFC, true, dest, arg);}
void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFD, true, dest, arg);}
void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFE, true, dest, arg);}
void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD4, true, dest, arg);}
void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);}
void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);}
void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);}
void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);}
void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEC, true, dest, arg);}
void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xED, true, dest, arg);}
void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDC, true, dest, arg);}
void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest, arg);}
void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);}
void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);}
void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);}
void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);}
void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);}
void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);}
void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);}
void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFB, true, dest, arg);}
void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);}
void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);}
void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);}
void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);}
void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);}
void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);}
void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD8, true, dest, arg);}
void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD9, true, dest, arg);}
void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);}
void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);}
void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);}
void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);}
void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE0, true, dest, arg);}
void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE3, true, dest, arg);}
void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);}
void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);}
void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x74, true, dest, arg);}
void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x75, true, dest, arg);}
void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x76, true, dest, arg);}
void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);}
void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);}
void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);}
void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x64, true, dest, arg);}
void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x65, true, dest, arg);}
void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);}
void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);}
void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);}
void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);}
void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0xC5, true, dest, arg); Write8(subreg);}
void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0xC4, true, dest, arg); Write8(subreg);}
void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg); Write8(subreg);}
void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg); Write8(subreg);}
void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF5, true, dest, arg); }
void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF6, true, dest, arg);}
void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);}
void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEE, true, dest, arg); }
void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDE, true, dest, arg); }
void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEA, true, dest, arg); }
void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDA, true, dest, arg); }
void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); }
void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); }
void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); }
void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); }
void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD7, true, dest, arg); }
void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); }
void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);}
void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);}
// VEX
void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseADD, false, regOp1, regOp2, arg);}
void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSUB, false, regOp1, regOp2, arg);}
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);}
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);}
void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);}
void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);}
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg);}
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);}
void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);}
void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);}
void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}

View file

@ -126,6 +126,11 @@ struct OpArg
//if scale == 0 never mind offsetting
offset = _offset;
}
bool operator==(OpArg b)
{
return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
indexReg == b.indexReg && offset == b.offset;
}
void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
@ -273,11 +278,11 @@ private:
void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
void WriteMXCSR(OpArg arg, int ext);
void WriteSSEOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteSSSE3Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteSSE41Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
@ -725,9 +730,18 @@ public:
void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle);
void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
// VEX GPR instructions
void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);

View file

@ -728,6 +728,26 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
SetJumpTarget(noBreakpoint);
}
// If we have an input register that is going to be used again, load it pre-emptively,
// even if the instruction doesn't strictly need it in a register, to avoid redundant
// loads later. Of course, don't do this if we're already out of registers.
// As a bit of a heuristic, make sure we have at least one register left over for the
// output, which needs to be bound in the actual instruction compilation.
// TODO: make this smarter in the case that we're actually register-starved, i.e.
// prioritize the more important registers.
for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
{
int reg = ops[i].regsIn[k];
if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm())
gpr.BindToRegister(reg, true, false);
}
for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
{
int reg = ops[i].fregsIn[k];
if (reg >= 0 && (ops[i].fprInXmm & (1 << reg)))
fpr.BindToRegister(reg, true, false);
}
Jit64Tables::CompileInstruction(ops[i]);
// If we have a register that will never be used again, flush it.

View file

@ -140,10 +140,13 @@ public:
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
void FloatCompare(UGeckoInstruction inst, bool upper = false);
// OPCODES

View file

@ -108,7 +108,22 @@ X64Reg RegCache::GetFreeXReg()
return (X64Reg)xr;
}
}
//Okay, not found :( Force grab one
// Okay, not found :( Force grab one!
// First, see if we have any registers that are only going to be used for a float store.
// These go through GPRs, so the cost of tossing them back into memory is lower than anything else.
for (size_t i = 0; i < aCount; i++)
{
X64Reg xr = (X64Reg)aOrder[i];
if (xregs[xr].locked)
continue;
size_t preg = xregs[xr].ppcReg;
if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg)))
{
StoreFromRegister(preg);
return xr;
}
}
//TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions
u32 last_used = 0xFFFFFFFF;
@ -366,3 +381,14 @@ void RegCache::Flush(FlushMode mode)
cur_use_quantum = 0;
}
int RegCache::NumFreeRegisters()
{
int count = 0;
size_t aCount;
const int* aOrder = GetAllocationOrder(aCount);
for (size_t i = 0; i < aCount; i++)
if (!xregs[aOrder[i]].locked && xregs[aOrder[i]].free)
count++;
return count;
}

View file

@ -123,6 +123,7 @@ public:
Gen::X64Reg GetFreeXReg();
int NumFreeRegisters();
};
class GPRRegCache : public RegCache

View file

@ -14,65 +14,27 @@ static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
{
fpr.Lock(d, a, b);
fpr.BindToRegister(d, d == a || d == b || !single);
if (roundRHS)
{
if (d == a)
{
fpr.BindToRegister(d, true);
MOVSD(XMM0, fpr.R(b));
Force25BitPrecision(XMM0, XMM1);
(this->*op)(fpr.RX(d), R(XMM0));
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
(this->*sseOp)(fpr.RX(d), R(XMM0));
}
else
{
fpr.BindToRegister(d, d == b);
if (d != b)
MOVSD(fpr.RX(d), fpr.R(b));
Force25BitPrecision(fpr.RX(d), XMM0);
(this->*op)(fpr.RX(d), fpr.R(a));
}
}
else if (d == a)
{
fpr.BindToRegister(d, true);
if (!single)
{
fpr.BindToRegister(b, true, false);
}
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b)
{
if (reversible)
{
fpr.BindToRegister(d, true);
if (!single)
{
fpr.BindToRegister(a, true, false);
}
(this->*op)(fpr.RX(d), fpr.R(a));
}
else
{
MOVSD(XMM0, fpr.R(b));
fpr.BindToRegister(d, !single);
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
(this->*sseOp)(fpr.RX(d), fpr.R(a));
}
}
else
{
// Sources different from d, can use rather quick solution
fpr.BindToRegister(d, !single);
if (!single)
{
fpr.BindToRegister(b, true, false);
}
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
}
if (single)
{
@ -104,10 +66,10 @@ void Jit64::fp_arith(UGeckoInstruction inst)
bool single = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
}
@ -131,18 +93,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
if (inst.SUBOP5 == 30) //nmsub
{
MOVSD(XMM1, fpr.R(c));
if (single_precision)
Force25BitPrecision(XMM1, XMM0);
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
else
MOVSD(XMM1, fpr.R(c));
MULSD(XMM1, fpr.R(a));
MOVSD(XMM0, fpr.R(b));
SUBSD(XMM0, R(XMM1));
}
else
{
MOVSD(XMM0, fpr.R(c));
if (single_precision)
Force25BitPrecision(XMM0, XMM1);
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVSD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28) //msub
SUBSD(XMM0, fpr.R(b));

View file

@ -43,17 +43,15 @@ void Jit64::ps_sel(UGeckoInstruction inst)
if (cpu_info.bSSE4_1)
{
MOVAPD(XMM1, fpr.R(a));
PXOR(XMM0, R(XMM0));
CMPPD(XMM0, R(XMM1), NLE);
CMPPD(XMM0, fpr.R(a), NLE);
MOVAPD(XMM1, fpr.R(c));
BLENDVPD(XMM1, fpr.R(b));
}
else
{
MOVAPD(XMM0, fpr.R(a));
PXOR(XMM1, R(XMM1));
CMPPD(XMM1, R(XMM0), NLE);
CMPPD(XMM1, fpr.R(a), NLE);
MOVAPD(XMM0, R(XMM1));
PAND(XMM1, fpr.R(b));
PANDN(XMM0, fpr.R(c));
@ -74,26 +72,18 @@ void Jit64::ps_sign(UGeckoInstruction inst)
int b = inst.FB;
fpr.Lock(d, b);
if (d != b)
{
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), fpr.R(b));
}
else
{
fpr.BindToRegister(d, true);
}
fpr.BindToRegister(d, d == b);
switch (inst.SUBOP10)
{
case 40: //neg
PXOR(fpr.RX(d), M((void*)&psSignBits));
avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits));
break;
case 136: //nabs
POR(fpr.RX(d), M((void*)&psSignBits));
avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits));
break;
case 264: //abs
PAND(fpr.RX(d), M((void*)&psAbsMask));
avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M((void*)&psAbsMask));
break;
}
@ -101,56 +91,29 @@ void Jit64::ps_sign(UGeckoInstruction inst)
}
//There's still a little bit more optimization that can be squeezed out of this
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
{
fpr.Lock(d, a, b);
fpr.BindToRegister(d, d == a || d == b);
if (roundRHS)
{
if (d == a)
{
fpr.BindToRegister(d, true);
MOVAPD(XMM0, fpr.R(b));
Force25BitPrecision(XMM0, XMM1);
(this->*op)(fpr.RX(d), R(XMM0));
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
(this->*sseOp)(fpr.RX(d), R(XMM0));
}
else
{
fpr.BindToRegister(d, d == b);
if (d != b)
MOVAPD(fpr.RX(d), fpr.R(b));
Force25BitPrecision(fpr.RX(d), XMM0);
(this->*op)(fpr.RX(d), fpr.R(a));
}
}
else if (d == a)
{
fpr.BindToRegister(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b)
{
if (reversible)
{
fpr.BindToRegister(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else
{
MOVAPD(XMM0, fpr.R(b));
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), R(XMM0));
Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
(this->*sseOp)(fpr.RX(d), fpr.R(a));
}
}
else
{
//sources different from d, can use rather quick solution
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible);
}
ForceSinglePrecisionP(fpr.RX(d));
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -164,16 +127,16 @@ void Jit64::ps_arith(UGeckoInstruction inst)
switch (inst.SUBOP5)
{
case 18: // div
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst);
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst);
break;
case 20: // sub
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst);
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst);
break;
case 21: // add
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst);
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
break;
case 25: // mul
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true);
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@ -208,10 +171,9 @@ void Jit64::ps_sum(UGeckoInstruction inst)
default:
PanicAlert("ps_sum WTF!!!");
}
ForceSinglePrecisionP(XMM0);
SetFPRFIfNeeded(inst, XMM0);
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), R(XMM0));
ForceSinglePrecisionP(fpr.RX(d), XMM0);
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -232,18 +194,16 @@ void Jit64::ps_muls(UGeckoInstruction inst)
MOVDDUP(XMM0, fpr.R(c));
break;
case 13:
MOVAPD(XMM0, fpr.R(c));
SHUFPD(XMM0, R(XMM0), 3);
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
break;
default:
PanicAlert("ps_muls WTF!!!");
}
Force25BitPrecision(XMM0, XMM1);
Force25BitPrecision(XMM0, R(XMM0), XMM1);
MULPD(XMM0, fpr.R(a));
ForceSinglePrecisionP(XMM0);
SetFPRFIfNeeded(inst, XMM0);
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), R(XMM0));
ForceSinglePrecisionP(fpr.RX(d), XMM0);
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -258,27 +218,25 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
int a = inst.FA;
int b = inst.FB;
fpr.Lock(a,b,d);
fpr.BindToRegister(d, d == a || d == b);
MOVAPD(XMM0, fpr.R(a));
switch (inst.SUBOP10)
{
case 528:
UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
avx_op(&XEmitter::VUNPCKLPD, &XEmitter::UNPCKLPD, fpr.RX(d), fpr.R(a), fpr.R(b));
break; //00
case 560:
SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 2);
break; //01
case 592:
SHUFPD(XMM0, fpr.R(b), 1);
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 1);
break; //10
case 624:
UNPCKHPD(XMM0, fpr.R(b));
avx_op(&XEmitter::VUNPCKHPD, &XEmitter::UNPCKHPD, fpr.RX(d), fpr.R(a), fpr.R(b));
break; //11
default:
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
}
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), R(XMM0));
fpr.UnlockAll();
}
@ -303,7 +261,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst)
CALL((void *)asm_routines.frsqrte);
MOVLHPS(fpr.RX(d), XMM0);
ForceSinglePrecisionP(fpr.RX(d));
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
gpr.UnlockAllX();
@ -330,7 +288,7 @@ void Jit64::ps_res(UGeckoInstruction inst)
CALL((void *)asm_routines.fres);
MOVLHPS(fpr.RX(d), XMM0);
ForceSinglePrecisionP(fpr.RX(d));
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
gpr.UnlockAllX();
@ -352,42 +310,35 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
switch (inst.SUBOP5)
{
case 14: //madds0
MOVDDUP(XMM1, fpr.R(c));
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
MULPD(XMM0, R(XMM1));
MOVDDUP(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, R(XMM0), XMM1);
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
break;
case 15: //madds1
MOVAPD(XMM1, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
MULPD(XMM0, R(XMM1));
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
Force25BitPrecision(XMM0, R(XMM0), XMM1);
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
break;
case 28: //msub
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
MULPD(XMM0, fpr.R(a));
SUBPD(XMM0, fpr.R(b));
break;
case 29: //madd
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
MULPD(XMM0, fpr.R(a));
SUBPD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits));
break;
case 31: //nmadd
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits));
@ -399,9 +350,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
return;
}
fpr.BindToRegister(d, false);
ForceSinglePrecisionP(XMM0);
SetFPRFIfNeeded(inst, XMM0);
MOVAPD(fpr.RX(d), R(XMM0));
ForceSinglePrecisionP(fpr.RX(d), XMM0);
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}

View file

@ -409,8 +409,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_255));
@ -420,8 +419,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M((void *)&m_m128));
MINSS(XMM0, M((void *)&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
@ -430,8 +428,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_65535));
@ -441,8 +438,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M((void *)&m_m32768));
MINSS(XMM0, M((void *)&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
@ -543,8 +539,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
@ -583,8 +578,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
@ -618,8 +612,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
@ -652,8 +645,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M((void*)m_one));
RET();

View file

@ -608,13 +608,98 @@ void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
}
}
void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input)
{
// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
if (jit->jo.accurateSinglePrecision)
{
CVTPD2PS(xmm, R(xmm));
CVTPS2PD(xmm, R(xmm));
CVTPD2PS(input, R(input));
CVTPS2PD(output, R(input));
}
else if (output != input)
{
MOVAPD(output, R(input));
}
}
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg),
X64Reg regOp, OpArg arg1, OpArg arg2, bool packed, bool reversible)
{
if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg())
{
(this->*sseOp)(regOp, arg2);
}
else if (arg1.IsSimpleReg() && cpu_info.bAVX)
{
(this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2);
}
else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp)
{
if (reversible)
{
(this->*sseOp)(regOp, arg1);
}
else
{
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0)
MOVAPD(XMM0, arg1);
if (cpu_info.bAVX)
{
(this->*avxOp)(regOp, XMM0, arg2);
}
else
{
(this->*sseOp)(XMM0, arg2);
if (packed)
MOVAPD(regOp, R(XMM0));
else
MOVSD(regOp, R(XMM0));
}
}
}
else
{
if (packed)
MOVAPD(regOp, arg1);
else
MOVSD(regOp, arg1);
(this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2);
}
}
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg, u8), void (XEmitter::*sseOp)(X64Reg, OpArg, u8),
X64Reg regOp, OpArg arg1, OpArg arg2, u8 imm)
{
if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg())
{
(this->*sseOp)(regOp, arg2, imm);
}
else if (arg1.IsSimpleReg() && cpu_info.bAVX)
{
(this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2, imm);
}
else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp)
{
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0)
MOVAPD(XMM0, arg1);
if (cpu_info.bAVX)
{
(this->*avxOp)(regOp, XMM0, arg2, imm);
}
else
{
(this->*sseOp)(XMM0, arg2, imm);
MOVAPD(regOp, R(XMM0));
}
}
else
{
MOVAPD(regOp, arg1);
(this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2, imm);
}
}
@ -625,15 +710,25 @@ static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
// rounding to nearest as it does.
// It needs a temp, so let the caller pass that in.
void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
void EmuCodeBlock::Force25BitPrecision(X64Reg output, OpArg input, X64Reg tmp)
{
if (jit->jo.accurateSinglePrecision)
{
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
MOVAPD(tmp, R(xmm));
PAND(xmm, M((void*)&psMantissaTruncate));
PAND(tmp, M((void*)&psRoundBit));
PADDQ(xmm, R(tmp));
if (input.IsSimpleReg() && cpu_info.bAVX)
{
VPAND(tmp, input.GetSimpleReg(), M((void*)&psRoundBit));
VPAND(output, input.GetSimpleReg(), M((void*)&psMantissaTruncate));
PADDQ(output, R(tmp));
}
else
{
if (!input.IsSimpleReg() || input.GetSimpleReg() != output)
MOVAPD(output, input);
avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M((void*)&psRoundBit), true, true);
PAND(output, M((void*)&psMantissaTruncate));
PADDQ(output, R(tmp));
}
}
}

View file

@ -123,9 +123,14 @@ public:
void JitSetCAIf(Gen::CCFlags conditionCode);
void JitClearCA();
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg),
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, bool packed = true, bool reversible = false);
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);
void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
// RSCRATCH might get trashed
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);

View file

@ -802,6 +802,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
bool wantsCA = true;
u32 fregInUse = 0;
u32 regInUse = 0;
u32 fregInXmm = 0;
for (int i = block->m_num_instructions - 1; i >= 0; i--)
{
bool opWantsCR0 = code[i].wantsCR0;
@ -822,6 +823,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
wantsCA &= !code[i].outputCA || opWantsCA;
code[i].gprInUse = regInUse;
code[i].fprInUse = fregInUse;
code[i].fprInXmm = fregInXmm;
// TODO: if there's no possible endblocks or exceptions in between, tell the regcache
// we can throw away a register if it's going to be overwritten later.
for (int j = 0; j < 3; j++)
@ -829,7 +831,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
regInUse |= 1 << code[i].regsIn[j];
for (int j = 0; j < 4; j++)
if (code[i].fregsIn[j] >= 0)
{
fregInUse |= 1 << code[i].fregsIn[j];
if (strncmp(code[i].opinfo->opname, "stfd", 4))
fregInXmm |= 1 << code[i].fregsIn[j];
}
// For now, we need to count output registers as "used" though; otherwise the flush
// will result in a redundant store (e.g. store to regcache, then store again to
// the same location later).
@ -837,7 +843,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
if (code[i].regsOut[j] >= 0)
regInUse |= 1 << code[i].regsOut[j];
if (code[i].fregOut >= 0)
{
fregInUse |= 1 << code[i].fregOut;
if (strncmp(code[i].opinfo->opname, "stfd", 4))
fregInXmm |= 1 << code[i].fregOut;
}
}
return address;
}

View file

@ -45,6 +45,9 @@ struct CodeOp //16B
// which registers are still needed after this instruction in this block
u32 gprInUse;
u32 fprInUse;
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
// an XMM only to move it again to a GPR afterwards.
u32 fprInXmm;
};
struct BlockStats