Some WIP work on the JIT... only marginally usable at the moment, but I
wanted to back this up somewhere, and the people familiar with the JIT
might have comments. There's a big comment in Jit64IL/IR.cpp with a
high-level overview of what this is.



git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1724 8ced0084-cf51-0410-be5f-012b33b47a6e
magumagu9 committed on 2008-12-31 01:39:35 +00:00
commit 68c451f008 (parent 1d0d106736)
20 changed files with 6470 additions and 0 deletions

[File diff suppressed because it is too large]


@@ -0,0 +1,322 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef IR_H
#define IR_H
#include "x64Emitter.h"
#include <vector>
namespace IREmitter {
enum Opcode {
Nop = 0,
// "Zero-operand" operators
// Register load operators
LoadGReg,
LoadLink,
LoadCR,
LoadCarry,
LoadCTR,
LoadMSR,
// Unary operators
// Integer unary operators
SExt8,
SExt16,
BSwap32,
BSwap16,
Load8, // These loads zext
Load16,
Load32,
// Branches
BranchUncond,
// Register store operators
StoreGReg,
StoreCR,
StoreLink,
StoreCarry,
StoreCTR,
StoreMSR,
// Arbitrary interpreter instruction
InterpreterFallback,
// Binary operators
// Commutative integer operators
Add,
Mul,
And,
Or,
Xor,
// Non-commutative integer operators
Sub,
Shl, // Note that shifts ignore bits above the bottom 5
Shrl,
Sarl,
Rol,
ICmpCRSigned, // CR for signed int compare
ICmpCRUnsigned, // CR for unsigned int compare
ICmpEq, // One if equal, zero otherwise
ICmpUgt, // One if op1 > op2, zero otherwise
// Memory store operators
Store8,
Store16,
Store32,
BranchCond,
// "Trinary" operators
// FIXME: Need to change representation!
//Select, // Equivalent to C "Op1 ? Op2 : Op3"
// Integer constants
CInt16,
CInt32,
// "Opcode" representing a register too far away to
// reference directly; this is a size optimization
Tramp,
// "Opcode"s representing the start and end
BlockStart, BlockEnd
};
typedef unsigned Inst;
typedef Inst* InstLoc;
unsigned inline getOpcode(Inst i) {
return i & 255;
}
unsigned inline isImm(Inst i) {
return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32;
}
unsigned inline isUnary(Inst i) {
return getOpcode(i) >= SExt8 && getOpcode(i) <= BSwap16;
}
unsigned inline isBinary(Inst i) {
return getOpcode(i) >= Add && getOpcode(i) <= ICmpCRUnsigned;
}
unsigned inline isMemLoad(Inst i) {
return getOpcode(i) >= Load8 && getOpcode(i) <= Load32;
}
unsigned inline isMemStore(Inst i) {
return getOpcode(i) >= Store8 && getOpcode(i) <= Store32;
}
unsigned inline isRegLoad(Inst i) {
return getOpcode(i) >= LoadGReg && getOpcode(i) <= LoadCR;
}
unsigned inline isRegStore(Inst i) {
return getOpcode(i) >= StoreGReg && getOpcode(i) <= StoreLink;
}
unsigned inline isBranch(Inst i) {
return getOpcode(i) >= BranchUncond &&
getOpcode(i) <= BranchCond;
}
unsigned inline isInterpreterFallback(Inst i) {
return getOpcode(i) == InterpreterFallback;
}
InstLoc inline getOp1(InstLoc i) {
return i - 1 - ((*i >> 8) & 255);
}
InstLoc inline getOp2(InstLoc i) {
return i - 1 - ((*i >> 16) & 255);
}
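// Illustrative sketch, not part of this commit: the operand fields above are
// 8 bits wide, so getOp1/getOp2 can only reach back 256 slots. Any operand
// further away must be bridged by a Tramp emitted near the user. The
// reachability condition falls out of the decoding:
unsigned inline isOperandReachable(InstLoc User, InstLoc Operand) {
// encoded distance is (User - 1 - Operand), which must fit in 8 bits
return User - Operand >= 1 && User - Operand <= 256;
}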
class IRBuilder {
InstLoc EmitZeroOp(unsigned Opcode, unsigned extra);
InstLoc EmitUOp(unsigned OpCode, InstLoc Op1,
unsigned extra = 0);
InstLoc EmitBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2);
InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
InstLoc FoldShl(InstLoc Op1, InstLoc Op2);
InstLoc FoldShrl(InstLoc Op1, InstLoc Op2);
InstLoc FoldXor(InstLoc Op1, InstLoc Op2);
InstLoc FoldInterpreterFallback(InstLoc Op1, InstLoc Op2);
InstLoc FoldZeroOp(unsigned Opcode, unsigned extra);
InstLoc FoldUOp(unsigned OpCode, InstLoc Op1,
unsigned extra = 0);
InstLoc FoldBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2);
public:
InstLoc EmitIntConst(unsigned value);
InstLoc EmitStoreLink(InstLoc val) {
return FoldUOp(StoreLink, val);
}
InstLoc EmitBranchUncond(InstLoc val) {
return FoldUOp(BranchUncond, val);
}
InstLoc EmitBranchCond(InstLoc check, InstLoc dest) {
return FoldBiOp(BranchCond, check, dest);
}
InstLoc EmitLoadCR(unsigned crreg) {
return FoldZeroOp(LoadCR, crreg);
}
InstLoc EmitStoreCR(InstLoc value, unsigned crreg) {
return FoldUOp(StoreCR, value, crreg);
}
InstLoc EmitLoadLink() {
return FoldZeroOp(LoadLink, 0);
}
InstLoc EmitLoadMSR() {
return FoldZeroOp(LoadMSR, 0);
}
InstLoc EmitStoreMSR(InstLoc val) {
return FoldUOp(StoreMSR, val);
}
InstLoc EmitLoadGReg(unsigned reg) {
return FoldZeroOp(LoadGReg, reg);
}
InstLoc EmitStoreGReg(InstLoc value, unsigned reg) {
return FoldUOp(StoreGReg, value, reg);
}
InstLoc EmitAnd(InstLoc op1, InstLoc op2) {
return FoldBiOp(And, op1, op2);
}
InstLoc EmitXor(InstLoc op1, InstLoc op2) {
return FoldBiOp(Xor, op1, op2);
}
InstLoc EmitSub(InstLoc op1, InstLoc op2) {
return FoldBiOp(Sub, op1, op2);
}
InstLoc EmitOr(InstLoc op1, InstLoc op2) {
return FoldBiOp(Or, op1, op2);
}
InstLoc EmitAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(Add, op1, op2);
}
InstLoc EmitMul(InstLoc op1, InstLoc op2) {
return FoldBiOp(Mul, op1, op2);
}
InstLoc EmitRol(InstLoc op1, InstLoc op2) {
return FoldBiOp(Rol, op1, op2);
}
InstLoc EmitShl(InstLoc op1, InstLoc op2) {
return FoldBiOp(Shl, op1, op2);
}
InstLoc EmitShrl(InstLoc op1, InstLoc op2) {
return FoldBiOp(Shrl, op1, op2);
}
InstLoc EmitSarl(InstLoc op1, InstLoc op2) {
return FoldBiOp(Sarl, op1, op2);
}
InstLoc EmitLoadCTR() {
return FoldZeroOp(LoadCTR, 0);
}
InstLoc EmitStoreCTR(InstLoc op1) {
return FoldUOp(StoreCTR, op1);
}
InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpEq, op1, op2);
}
InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpUgt, op1, op2);
}
InstLoc EmitLoad8(InstLoc op1) {
return FoldUOp(Load8, op1);
}
InstLoc EmitLoad16(InstLoc op1) {
return FoldUOp(Load16, op1);
}
InstLoc EmitLoad32(InstLoc op1) {
return FoldUOp(Load32, op1);
}
InstLoc EmitStore8(InstLoc op1, InstLoc op2) {
return FoldBiOp(Store8, op1, op2);
}
InstLoc EmitStore16(InstLoc op1, InstLoc op2) {
return FoldBiOp(Store16, op1, op2);
}
InstLoc EmitStore32(InstLoc op1, InstLoc op2) {
return FoldBiOp(Store32, op1, op2);
}
InstLoc EmitSExt16(InstLoc op1) {
return FoldUOp(SExt16, op1);
}
InstLoc EmitSExt8(InstLoc op1) {
return FoldUOp(SExt8, op1);
}
InstLoc EmitICmpCRSigned(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpCRSigned, op1, op2);
}
InstLoc EmitICmpCRUnsigned(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpCRUnsigned, op1, op2);
}
InstLoc EmitInterpreterFallback(InstLoc op1, InstLoc op2) {
return FoldBiOp(InterpreterFallback, op1, op2);
}
InstLoc EmitStoreCarry(InstLoc op1) {
return FoldUOp(StoreCarry, op1);
}
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
void StartForwardPass() { curReadPtr = &InstList[0]; }
InstLoc ReadForward() { return curReadPtr++; }
InstLoc ReadBackward() { return --curReadPtr; }
InstLoc getFirstInst() { return &InstList[0]; }
unsigned getNumInsts() { return InstList.size(); }
unsigned ReadInst(InstLoc I) { return *I; }
unsigned GetImmValue(InstLoc I);
void Reset() {
InstList.clear();
InstList.reserve(100000);
for (unsigned i = 0; i < 32; i++) {
GRegCache[i] = 0;
GRegCacheStore[i] = 0;
}
CarryCache = 0;
CarryCacheStore = 0;
for (unsigned i = 0; i < 8; i++) {
CRCache[i] = 0;
CRCacheStore[i] = 0;
}
}
IRBuilder() { Reset(); }
private:
std::vector<Inst> InstList; // FIXME: We must ensure this is
// contiguous!
std::vector<unsigned> ConstList;
InstLoc curReadPtr;
InstLoc GRegCache[32];
InstLoc GRegCacheStore[32];
InstLoc CarryCache;
InstLoc CarryCacheStore;
InstLoc CRCache[8];
InstLoc CRCacheStore[8];
};
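// Illustrative sketch, not part of this commit: a translator might drive the
// builder for a simple PPC instruction such as addi rD, rA, SIMM like so
// (the UGeckoInstruction field names here are assumptions):
//
// InstLoc a = ibuild.EmitLoadGReg(inst.RA);
// InstLoc c = ibuild.EmitIntConst((s32)(s16)inst.SIMM_16);
// ibuild.EmitStoreGReg(ibuild.EmitAdd(a, c), inst.RD);
//
// The Fold* routines behind the Emit* wrappers get a chance to simplify each
// node (e.g. constant-fold the Add) as it is emitted.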
};
#endif


@@ -0,0 +1,528 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include <map>
#include "Common.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Thunk.h"
#include "../../HLE/HLE.h"
#include "../../Core.h"
#include "../../PatchEngine.h"
#include "../../CoreTiming.h"
#include "../../Debugger/Debugger_BreakPoints.h"
#include "../PowerPC.h"
#include "../Profiler.h"
#include "../PPCTables.h"
#include "../PPCAnalyst.h"
#include "../../HW/Memmap.h"
#include "../../HW/GPFifo.h"
#include "Jit.h"
#include "JitAsm.h"
#include "JitCache.h"
#include "JitRegCache.h"
using namespace Gen;
using namespace PowerPC;
extern int blocksExecuted;
// Dolphin's PowerPC->x86 JIT dynamic recompiler
// (Nearly) all code by ector (hrydgard)
// Features:
// * x86 & x64 support, lots of shared code.
// * Basic block linking
// * Fast dispatcher
// Unfeatures:
// * Does not recompile all instructions - sometimes falls back to inserting a CALL to the corresponding interpreter function.
// Various notes below
// Register allocation
// RAX - Generic quicktemp register
// RBX - point to base of memory map
// RSI RDI R12 R13 R14 R15 - free for allocation
// RCX RDX R8 R9 R10 R11 - allocate in emergencies. These need to be flushed before functions are called.
// RSP - stack pointer, do not generally use, very dangerous
// RBP - ?
// IMPORTANT:
// Make sure that all generated code and all emulator state sits under the 2GB boundary so that
// RIP addressing can be used easily. Windows will always allocate static code under the 2GB boundary.
// Also make sure to use VirtualAlloc and specify EXECUTE permission.
// Open questions
// * Should there be any statically allocated registers? r3, r4, r5, r8, r0 come to mind.. maybe sp
// * Does it make sense to finish off the remaining non-jitted instructions? Seems we are hitting diminishing returns.
// * Why is the FPU exception handling not working 100%? Several games still get corrupted floating point state.
// This can even be seen in one homebrew Wii demo - RayTracer.elf
// Other considerations
//
// Many instructions have shorter forms for EAX. However, I believe their performance boost
// will be too small to be worth dirtying up the code with. AMD recommends them in their
// optimization manuals, though.
//
// We support block linking. Reserve space at the exits of every block for a full 5-byte jmp. Save 16-bit offsets
// from the starts of each block, marking the exits so that they can be nicely patched at any time.
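// Concretely (see WriteExit and JitBlockCache::LinkBlockExits): an unlinked exit is
// emitted as MOV dword [&PC], dest / JMP dispatcher, and linking overwrites that
// site in place with a direct JMP to the destination block's checkedEntry.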
//
// Blocks do NOT use call/ret, they only jmp to each other and to the dispatcher when necessary.
//
// All blocks that can be precompiled will be precompiled. Code will be memory protected - any write will mark
// the region as non-compilable, and all links to the page will be torn out and replaced with dispatcher jmps.
//
// Alternatively, icbi instruction SHOULD mark where we can't compile
//
// Seldom-happening events are handled by adding a decrement of a counter to all blr instructions (which are
// expensive anyway since we need to return to dispatcher, except when they can be predicted).
// TODO: SERIOUS synchronization problem with the video plugin setting tokens and breakpoints in dual core mode!!!
// Somewhat fixed by disabling idle skipping when certain interrupts are enabled
// This is not a permanent, reliable fix
// TODO: Zeldas go whacko when you hang the gfx thread
// Idea - Accurate exception handling
// Compute register state at a certain instruction by running the JIT in "dry mode", and stopping at the right place.
// Not likely to be done :P
// Optimization Ideas -
/*
* Assume SP is in main RAM (in Wii mode too?) - partly done
* Assume all floating point loads and double precision loads+stores are to/from main ram
(single precision can be used in write gather pipe, specialized fast check added)
* AMD only - use movaps instead of movapd when loading ps from memory?
* HLE functions like floorf, sin, memcpy, etc - they can be much faster
* ABI optimizations - drop F0-F13 on blr, for example. Watch out for context switching.
CR2-CR4 are non-volatile, rest of CR is volatile -> dropped on blr.
R5-R12 are volatile -> dropped on blr.
* classic inlining across calls.
Low hanging fruit:
stfd -- guaranteed in memory
cmpl
mulli
stfs
stwu
lb/stzx
bcx - optimize!
bcctr
stfs
psq_st
addx
orx
rlwimix
fcmpo
DSP_UpdateARAMDMA
lfd
stwu
cntlzwx
bcctrx
WriteBigEData
TODO
lha
srawx
addic_rc
addex
subfcx
subfex
fmaddx
fmulx
faddx
fnegx
frspx
frsqrtex
ps_sum0
ps_muls0
ps_adds1
*/
Jit64 jit;
int CODE_SIZE = 1024*1024*16;
namespace CPUCompare
{
extern u32 m_BlockStart;
}
void Jit(u32 em_address)
{
jit.Jit(em_address);
}
void Jit64::Init()
{
asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
CODE_SIZE = 1024*1024*8*8;
jo.optimizeStack = true;
jo.enableBlocklink = true; // Speed boost, but not 100% safe
#ifdef _M_X64
jo.enableFastMem = Core::GetStartupParameter().bUseFastMem;
#else
jo.enableFastMem = false;
#endif
jo.assumeFPLoadFromMem = true;
jo.fpAccurateFlags = true;
jo.optimizeGatherPipe = true;
jo.fastInterrupts = false;
jo.accurateSinglePrecision = false;
gpr.SetEmitter(this);
fpr.SetEmitter(this);
trampolines.Init();
AllocCodeSpace(CODE_SIZE);
blocks.Init();
asm_routines.Init();
}
void Jit64::Shutdown()
{
FreeCodeSpace();
blocks.Shutdown();
trampolines.Shutdown();
asm_routines.Shutdown();
}
void Jit64::WriteCallInterpreter(UGeckoInstruction inst)
{
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
if (js.isLastInstruction)
{
MOV(32, M(&PC), Imm32(js.compilerPC));
MOV(32, M(&NPC), Imm32(js.compilerPC + 4));
}
Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst);
ABI_CallFunctionC((void*)instr, inst.hex);
if (js.isLastInstruction)
{
MOV(32, R(EAX), M(&NPC));
WriteRfiExitDestInEAX();
}
}
void Jit64::unknown_instruction(UGeckoInstruction inst)
{
// CCPU::Break();
PanicAlert("unknown_instruction %08x - Fix me ;)", inst.hex);
}
void Jit64::Default(UGeckoInstruction _inst)
{
ibuild.EmitInterpreterFallback(
ibuild.EmitIntConst(_inst.hex),
ibuild.EmitIntConst(js.compilerPC));
}
void Jit64::HLEFunction(UGeckoInstruction _inst)
{
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex);
MOV(32, R(EAX), M(&NPC));
WriteExitDestInEAX(0);
}
void Jit64::DoNothing(UGeckoInstruction _inst)
{
// Yup, just don't do anything.
}
void Jit64::NotifyBreakpoint(u32 em_address, bool set)
{
int block_num = blocks.GetBlockNumberFromStartAddress(em_address);
if (block_num >= 0)
{
blocks.DestroyBlock(block_num, false);
}
}
static const bool ImHereDebug = false;
static const bool ImHereLog = false;
static std::map<u32, int> been_here;
void ImHere()
{
static FILE *f = 0;
if (ImHereLog) {
if (!f)
{
#ifdef _M_X64
f = fopen("log64.txt", "w");
#else
f = fopen("log32.txt", "w");
#endif
}
fprintf(f, "%08x\n", PC);
}
if (been_here.find(PC) != been_here.end()) {
been_here.find(PC)->second++;
if ((been_here.find(PC)->second) & 1023)
return;
}
LOG(DYNA_REC, "I'm here - PC = %08x , LR = %08x", PC, LR);
printf("I'm here - PC = %08x , LR = %08x", PC, LR);
been_here[PC] = 1;
}
void Jit64::Cleanup()
{
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
}
void Jit64::WriteExit(u32 destination, int exit_num)
{
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
b->exitAddress[exit_num] = destination;
b->exitPtrs[exit_num] = GetWritableCodePtr();
// Link opportunity!
int block = blocks.GetBlockNumberFromStartAddress(destination);
if (block >= 0 && jo.enableBlocklink)
{
// It exists! Joy of joy!
JMP(blocks.GetBlock(block)->checkedEntry, true);
b->linkStatus[exit_num] = true;
}
else
{
MOV(32, M(&PC), Imm32(destination));
JMP(asm_routines.dispatcher, true);
}
}
void Jit64::WriteExitDestInEAX(int exit_num)
{
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(asm_routines.dispatcher, true);
}
void Jit64::WriteRfiExitDestInEAX()
{
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(asm_routines.testExceptions, true);
}
void Jit64::WriteExceptionExit(u32 exception)
{
Cleanup();
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception));
MOV(32, M(&PC), Imm32(js.compilerPC + 4));
JMP(asm_routines.testExceptions, true);
}
void STACKALIGN Jit64::Run()
{
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
pExecAddr();
//Will return when PowerPC::state changes
}
void Jit64::SingleStep()
{
// NOT USED, NOT TESTED, PROBABLY NOT WORKING YET
// PanicAlert("Single");
/*
JitBlock temp_block;
PPCAnalyst::CodeBuffer temp_codebuffer(1); // Only room for one instruction! Single step!
const u8 *code = DoJit(PowerPC::ppcState.pc, &temp_codebuffer, &temp_block);
CompiledCode pExecAddr = (CompiledCode)code;
pExecAddr();*/
}
void STACKALIGN Jit64::Jit(u32 em_address)
{
if (GetSpaceLeft() < 0x10000 || blocks.IsFull())
{
LOG(DYNA_REC, "JIT cache full - clearing.");
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
PanicAlert("What? JIT cache still full - clearing.");
}
ClearCache();
}
int block_num = blocks.AllocateBlock(em_address);
JitBlock *b = blocks.GetBlock(block_num);
blocks.FinalizeBlock(block_num, jo.enableBlocklink, DoJit(em_address, &code_buffer, b));
}
const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b)
{
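// WIP: force every instruction category to Default(), which emits an
// InterpreterFallback IR op, while the IR backend is being brought up.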
Core::g_CoreStartupParameter.bJITLoadStoreOff = true;
Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff = true;
Core::g_CoreStartupParameter.bJITLoadStorePairedOff = true;
Core::g_CoreStartupParameter.bJITFloatingPointOff = true;
Core::g_CoreStartupParameter.bJITIntegerOff = true;
Core::g_CoreStartupParameter.bJITPairedOff = true;
Core::g_CoreStartupParameter.bJITSystemRegistersOff = true;
Core::g_CoreStartupParameter.bJITBranchOff = true;
if (em_address == 0)
PanicAlert("ERROR : Trying to compile at 0. LR=%08x", LR);
int size;
js.isLastInstruction = false;
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.curBlock = b;
js.blockSetsQuantizers = false;
js.block_flags = 0;
js.cancel = false;
//Analyze the block, collect all instructions it is made of (including inlining,
//if that is enabled), reorder instructions for optimal performance, and join joinable instructions.
PPCAnalyst::Flatten(em_address, &size, &js.st, &js.gpa, &js.fpa, code_buffer);
PPCAnalyst::CodeOp *ops = code_buffer->codebuffer;
const u8 *start = AlignCode4(); //TODO: Test if this or AlignCode16 makes a difference from GetCodePtr
b->checkedEntry = start;
b->runCount = 0;
// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
FixupBranch skip = J_CC(CC_NBE);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming.
SetJumpTarget(skip);
const u8 *normalEntry = GetCodePtr();
js.normalEntry = (u8*)normalEntry;
if (ImHereDebug)
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (false && js.fpa.any)
{
//This block uses FPU - needs to add FP exception bailout
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
FixupBranch b1 = J_CC(CC_NZ);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(asm_routines.fpException, true);
SetJumpTarget(b1);
}
if (false && jo.fastInterrupts)
{
// This does NOT yet work.
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
FixupBranch b1 = J_CC(CC_Z);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(asm_routines.testExceptions, true);
SetJumpTarget(b1);
}
// Conditionally add profiling code.
if (Profiler::g_ProfileBlocks) {
ADD(32, M(&b->runCount), Imm8(1));
#ifdef _WIN32
b->ticCounter.QuadPart = 0;
b->ticStart.QuadPart = 0;
b->ticStop.QuadPart = 0;
#else
//TODO
#endif
// get start tic
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart);
}
//Start up the register allocators
//They use the information in gpa/fpa to preload commonly used registers.
//gpr.Start(js.gpa);
//fpr.Start(js.fpa);
ibuild.Reset();
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
js.blockSize = size;
// Translate instructions
for (int i = 0; i < (int)size; i++)
{
// gpr.Flush(FLUSH_ALL);
// if (PPCTables::UsesFPU(_inst))
// fpr.Flush(FLUSH_ALL);
js.compilerPC = ops[i].address;
js.op = &ops[i];
js.instructionNumber = i;
if (i == (int)size - 1)
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
if (Profiler::g_ProfileBlocks) {
// CAUTION!!! push on stack regs you use, do your stuff, then pop
PROFILER_VPUSH;
// get end tic
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStop);
// tic counter += (end tic - start tic)
PROFILER_ADD_DIFF_LARGE_INTEGER(&b->ticCounter, &b->ticStop, &b->ticStart);
PROFILER_VPOP;
}
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
{
js.fifoBytesThisBlock -= 32;
ABI_CallFunction(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
}
// If starting from the breakpointed instruction, we don't break.
if (em_address != ops[i].address && BreakPoints::IsAddressBreakPoint(ops[i].address))
{
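// TODO: breakpoint handling is not implemented in this path yet.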
}
if (!ops[i].skip)
PPCTables::CompileInstruction(ops[i].inst);
gpr.SanityCheck();
fpr.SanityCheck();
if (js.cancel)
break;
}
WriteCode();
b->flags = js.block_flags;
b->codeSize = (u32)(GetCodePtr() - normalEntry);
b->originalSize = size;
return normalEntry;
}


@@ -0,0 +1,299 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// ========================
// See comments in Jit.cpp.
// ========================
// Mystery: Capcom vs SNK 800aa278
// CR flags approach:
// * Store that "N+Z flag contains CR0" or "S+Z flag contains CR3".
// * All flag altering instructions flush this
// * A flush simply does a conditional write to the appropriate CRx.
// * If flag available, branch code can become absolutely trivial.
#ifndef _JIT_H
#define _JIT_H
#include "../PPCAnalyst.h"
#include "JitCache.h"
#include "JitRegCache.h"
#include "x64Emitter.h"
#include "x64Analyzer.h"
#include "IR.h"
#ifdef _WIN32
#include <windows.h>
#else
// A bit of a hack to get things building under linux. We manually fill in this structure as needed
// from the real context.
struct CONTEXT
{
#ifdef _M_X64
u64 Rip;
u64 Rax;
#else
u32 Eip;
u32 Eax;
#endif
};
#endif
class TrampolineCache : public Gen::XCodeBlock
{
public:
void Init();
void Shutdown();
const u8 *GetReadTrampoline(const InstructionInfo &info);
const u8 *GetWriteTrampoline(const InstructionInfo &info);
};
class Jit64 : public Gen::XCodeBlock
{
private:
struct JitState
{
u32 compilerPC;
u32 next_compilerPC;
u32 blockStart;
bool cancel;
UGeckoInstruction next_inst; // for easy peephole opt.
int blockSize;
int instructionNumber;
int downcountAmount;
int block_flags;
bool isLastInstruction;
bool blockSetsQuantizers;
bool forceUnsafeLoad;
int fifoBytesThisBlock;
PPCAnalyst::BlockStats st;
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp *op;
u8* normalEntry;
JitBlock *curBlock;
};
struct JitOptions
{
bool optimizeStack;
bool assumeFPLoadFromMem;
bool enableBlocklink;
bool fpAccurateFlags;
bool enableFastMem;
bool optimizeGatherPipe;
bool fastInterrupts;
bool accurateSinglePrecision;
};
JitBlockCache blocks;
TrampolineCache trampolines;
GPRRegCache gpr;
FPURegCache fpr;
// The default code buffer. We keep it around to not have to alloc/dealloc a
// large chunk of memory for each recompiled block.
PPCAnalyst::CodeBuffer code_buffer;
public:
Jit64() : code_buffer(32000) {}
~Jit64() {}
JitState js;
JitOptions jo;
IREmitter::IRBuilder ibuild;
// Initialization, etc
void Init();
void Shutdown();
// Jit!
void Jit(u32 em_address);
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b);
JitBlockCache *GetBlockCache() { return &blocks; }
void NotifyBreakpoint(u32 em_address, bool set);
void ClearCache()
{
blocks.Clear();
trampolines.ClearCodeSpace();
}
// Run!
void Run();
void SingleStep();
const u8 *BackPatch(u8 *codePtr, int accessType, u32 em_address, CONTEXT *ctx);
#define JIT_OPCODE 0
// Utilities for use by opcodes
void WriteExit(u32 destination, int exit_num);
void WriteExitDestInEAX(int exit_num);
void WriteExceptionExit(u32 exception);
void WriteRfiExitDestInEAX();
void WriteCallInterpreter(UGeckoInstruction _inst);
void Cleanup();
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
void UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0);
void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false);
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset);
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
void GenerateCarry(Gen::X64Reg temp_reg);
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);
void JitClearCA();
void JitSetCA();
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
void WriteCode();
// OPCODES
void unknown_instruction(UGeckoInstruction _inst);
void Default(UGeckoInstruction _inst);
void DoNothing(UGeckoInstruction _inst);
void HLEFunction(UGeckoInstruction _inst);
void DynaRunTable4(UGeckoInstruction _inst);
void DynaRunTable19(UGeckoInstruction _inst);
void DynaRunTable31(UGeckoInstruction _inst);
void DynaRunTable59(UGeckoInstruction _inst);
void DynaRunTable63(UGeckoInstruction _inst);
void addx(UGeckoInstruction inst);
void orx(UGeckoInstruction inst);
void xorx(UGeckoInstruction inst);
void andx(UGeckoInstruction inst);
void mulli(UGeckoInstruction inst);
void mulhwux(UGeckoInstruction inst);
void mullwx(UGeckoInstruction inst);
void divwux(UGeckoInstruction inst);
void srawix(UGeckoInstruction inst);
void srawx(UGeckoInstruction inst);
void addex(UGeckoInstruction inst);
void extsbx(UGeckoInstruction inst);
void extshx(UGeckoInstruction inst);
void sc(UGeckoInstruction _inst);
void rfi(UGeckoInstruction _inst);
void bx(UGeckoInstruction inst);
void bclrx(UGeckoInstruction _inst);
void bcctrx(UGeckoInstruction _inst);
void bcx(UGeckoInstruction inst);
void mtspr(UGeckoInstruction inst);
void mfspr(UGeckoInstruction inst);
void mtmsr(UGeckoInstruction inst);
void mfmsr(UGeckoInstruction inst);
void mftb(UGeckoInstruction inst);
void mtcrf(UGeckoInstruction inst);
void mfcr(UGeckoInstruction inst);
void reg_imm(UGeckoInstruction inst);
void ps_sel(UGeckoInstruction inst);
void ps_mr(UGeckoInstruction inst);
void ps_sign(UGeckoInstruction inst); //aggregate
void ps_arith(UGeckoInstruction inst); //aggregate
void ps_mergeXX(UGeckoInstruction inst);
void ps_maddXX(UGeckoInstruction inst);
void ps_rsqrte(UGeckoInstruction inst);
void ps_sum(UGeckoInstruction inst);
void ps_muls(UGeckoInstruction inst);
void fp_arith_s(UGeckoInstruction inst);
void fcmpx(UGeckoInstruction inst);
void fmrx(UGeckoInstruction inst);
void cmpXX(UGeckoInstruction inst);
void cntlzwx(UGeckoInstruction inst);
void lfs(UGeckoInstruction inst);
void lfd(UGeckoInstruction inst);
void stfd(UGeckoInstruction inst);
void stfs(UGeckoInstruction inst);
void stfsx(UGeckoInstruction inst);
void psq_l(UGeckoInstruction inst);
void psq_st(UGeckoInstruction inst);
void fmaddXX(UGeckoInstruction inst);
void stX(UGeckoInstruction inst); //stw sth stb
void lXz(UGeckoInstruction inst);
void lha(UGeckoInstruction inst);
void rlwinmx(UGeckoInstruction inst);
void rlwimix(UGeckoInstruction inst);
void rlwnmx(UGeckoInstruction inst);
void negx(UGeckoInstruction inst);
void slwx(UGeckoInstruction inst);
void srwx(UGeckoInstruction inst);
void dcbz(UGeckoInstruction inst);
void lfsx(UGeckoInstruction inst);
void subfic(UGeckoInstruction inst);
void subfcx(UGeckoInstruction inst);
void subfx(UGeckoInstruction inst);
void subfex(UGeckoInstruction inst);
void lbzx(UGeckoInstruction inst);
void lwzx(UGeckoInstruction inst);
void lhax(UGeckoInstruction inst);
void lwzux(UGeckoInstruction inst);
void stXx(UGeckoInstruction inst);
void lmw(UGeckoInstruction inst);
void stmw(UGeckoInstruction inst);
};
extern Jit64 jit;
void Jit(u32 em_address);
void ProfiledReJit();
#endif


@@ -0,0 +1,277 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "ABI.h"
#include "x64Emitter.h"
#include "../../HW/Memmap.h"
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "MemoryUtil.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "../../HW/CPUCompare.h"
#include "../../HW/GPFifo.h"
#include "../../Core.h"
#include "JitAsm.h"
using namespace Gen;
int blocksExecuted;
static int temp32;
bool compareEnabled = false;
//TODO - make an option
//#if _DEBUG
static bool enableDebug = false;
//#else
// bool enableDebug = false;
//#endif
static bool enableStatistics = false;
//GLOBAL STATIC ALLOCATIONS x86
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//GLOBAL STATIC ALLOCATIONS x64
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//RBX - Base pointer of memory
//R15 - Pointer to array of block pointers
AsmRoutineManager asm_routines;
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
// At this offset - 4, there is an int specifying the block number.
void AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
ABI_PushAllCalleeSavedRegsAndAdjustStack();
#ifndef _M_IX86
// Two statically allocated registers.
MOV(64, R(RBX), Imm64((u64)Memory::base));
MOV(64, R(R15), Imm64((u64)jit.GetBlockCache()->GetCodePointers())); //It's below 2GB so 32 bits are good enough
#endif
const u8 *outerLoop = GetCodePtr();
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
FixupBranch skipToRealDispatch = J(); //skip the sync and compare first time
dispatcher = GetCodePtr();
//This is the place for CPUCompare!
//The result of slice decrementation should be in flags if somebody jumped here
FixupBranch bail = J_CC(CC_S);
SetJumpTarget(skipToRealDispatch);
dispatcherNoCheck = GetCodePtr();
MOV(32, R(EAX), M(&PowerPC::ppcState.pc));
dispatcherPcInEAX = GetCodePtr();
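// Load the 32-bit word at the emulated PC. If FinalizeBlock has replaced it,
// the TEST below sees the top 6 bits (JIT_OPCODE == 0) clear, and after the
// BSWAP the low 26 bits give the block number (PPC memory is big-endian).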
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EBX), Imm32((u32)Memory::base));
MOV(32, R(EAX), MComplex(EBX, EAX, SCALE_1, 0));
#else
MOV(32, R(EAX), MComplex(RBX, RAX, SCALE_1, 0));
#endif
TEST(32, R(EAX), Imm32(0xFC));
FixupBranch notfound = J_CC(CC_NZ);
BSWAP(32, EAX);
//IDEA - we have 26 bits, why not just use offsets from base of code?
if (enableStatistics)
{
ADD(32, M(&blocksExecuted), Imm8(1));
}
if (enableDebug)
{
ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1));
}
//grab from list and jump to it
#ifdef _M_IX86
MOV(32, R(EDX), ImmPtr(jit.GetBlockCache()->GetCodePointers()));
JMPptr(MComplex(EDX, EAX, 4, 0));
#else
JMPptr(MComplex(R15, RAX, 8, 0));
#endif
SetJumpTarget(notfound);
//Ok, no block, let's jit
#ifdef _M_IX86
ABI_AlignStack(4);
PUSH(32, M(&PowerPC::ppcState.pc));
CALL(reinterpret_cast<void *>(&Jit));
ABI_RestoreStack(4);
#else
MOV(32, R(ABI_PARAM1), M(&PowerPC::ppcState.pc));
CALL((void *)&Jit);
#endif
JMP(dispatcherNoCheck); // no point in special casing this
//FP blocks test for FPU available, jump here if false
fpException = AlignCode4();
MOV(32, R(EAX), M(&PC));
MOV(32, M(&NPC), R(EAX));
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE));
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
MOV(32, R(EAX), M(&NPC));
MOV(32, M(&PC), R(EAX));
JMP(dispatcher);
SetJumpTarget(bail);
doTiming = GetCodePtr();
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
testExceptions = GetCodePtr();
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
FixupBranch skipExceptions = J_CC(CC_Z);
MOV(32, R(EAX), M(&PC));
MOV(32, M(&NPC), R(EAX));
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
MOV(32, R(EAX), M(&NPC));
MOV(32, M(&PC), R(EAX));
SetJumpTarget(skipExceptions);
TEST(32, M((void*)&PowerPC::state), Imm32(0xFFFFFFFF));
J_CC(CC_Z, outerLoop, true);
//Landing pad for drec space
ABI_PopAllCalleeSavedRegsAndAdjustStack();
RET();
breakpointBailout = GetCodePtr();
//Landing pad for drec space
ABI_PopAllCalleeSavedRegsAndAdjustStack();
RET();
GenerateCommon();
}
void AsmRoutineManager::GenFifoWrite(int size)
{
// Assume value in ABI_PARAM1
PUSH(ESI);
if (size != 32)
PUSH(EDX);
BSWAP(size, ABI_PARAM1);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
if (size != 32) {
MOV(32, R(EDX), R(ABI_PARAM1));
MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX));
} else {
MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1));
}
ADD(32, R(ESI), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
if (size != 32)
POP(EDX);
POP(ESI);
RET();
}
void AsmRoutineManager::GenFifoFloatWrite()
{
// Assume value in XMM0
PUSH(ESI);
PUSH(EDX);
MOVSS(M(&temp32), XMM0);
MOV(32, R(EDX), M(&temp32));
BSWAP(32, EDX);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX));
ADD(32, R(ESI), Imm8(4));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(EDX);
POP(ESI);
RET();
}
void AsmRoutineManager::GenFifoXmm64Write()
{
// Assume value in XMM0. Assume pre-byteswapped (unlike the others here!)
PUSH(ESI);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
MOVQ_xmm(MComplex(RAX, RSI, 1, 0), XMM0);
ADD(32, R(ESI), Imm8(8));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(ESI);
RET();
}
void AsmRoutineManager::GenerateCommon()
{
// USES_CR
computeRc = AlignCode16();
CMP(32, R(EAX), Imm8(0));
FixupBranch pLesser = J_CC(CC_L);
FixupBranch pGreater = J_CC(CC_G);
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
RET();
SetJumpTarget(pGreater);
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
RET();
SetJumpTarget(pLesser);
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
RET();
fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();
GenFifoWrite(16);
fifoDirectWrite32 = AlignCode4();
GenFifoWrite(32);
fifoDirectWriteFloat = AlignCode4();
GenFifoFloatWrite();
fifoDirectWriteXmm64 = AlignCode4();
GenFifoXmm64Write();
doReJit = AlignCode4();
ABI_AlignStack(0);
CALL(reinterpret_cast<void *>(&ProfiledReJit));
ABI_RestoreStack(0);
SUB(32, M(&CoreTiming::downcount), Imm8(0));
JMP(dispatcher, true);
computeRcFp = AlignCode16();
//CMPSD(R(XMM0), M(&zero),
// TODO
// Fast write routines - special case the most common hardware write
// TODO: use this.
// Even in x86, the param values will be in the right registers.
/*
const u8 *fastMemWrite8 = AlignCode16();
CMP(32, R(ABI_PARAM2), Imm32(0xCC008000));
FixupBranch skip_fast_write = J_CC(CC_NE, false);
MOV(32, EAX, M(&m_gatherPipeCount));
MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1);
ADD(32, 1, M(&m_gatherPipeCount));
RET();
SetJumpTarget(skip_fast_write);
CALL((void *)&Memory::Write_U8);*/
}


@@ -0,0 +1,88 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITASM_H
#define _JITASM_H
#include "x64Emitter.h"
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write
// protect the memory, essentially making it work just like precompiled code.
// There are some advantages to this approach:
// 1) No need to setup an external assembler in the build.
// 2) Cross platform, as long as it's x86/x64.
// 3) Can optimize code at runtime for the specific CPU model.
// There aren't really any disadvantages other than having to maintain an x86 emitter,
// which we have to do anyway :)
//
// To add a new asm routine, just add another const here, and add the code to Generate.
// Also, possibly increase the size of the code buffer.
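// For instance, a hypothetical routine (name illustrative only) would be
// declared below as
//   const u8 *myRoutine;
// and emitted inside Generate() with
//   myRoutine = AlignCode4();
//   /* ... emit body ... */
//   RET();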
class AsmRoutineManager : public Gen::XCodeBlock
{
private:
void Generate();
void GenerateCommon();
void GenFifoWrite(int size);
void GenFifoFloatWrite();
void GenFifoXmm64Write();
public:
void Init() {
AllocCodeSpace(8192);
Generate();
WriteProtect();
}
void Shutdown() {
FreeCodeSpace();
}
// Public generated functions. Just CALL(M((void*)func)) them.
const u8 *enterCode;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInEAX;
const u8 *fpException;
const u8 *computeRc;
const u8 *computeRcFp;
const u8 *testExceptions;
const u8 *dispatchPcInEAX;
const u8 *doTiming;
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
const u8 *breakpointBailout;
const u8 *doReJit;
bool compareEnabled;
};
extern AsmRoutineManager asm_routines;
#endif


@@ -0,0 +1,215 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include <string>
#include "Common.h"
#include "disasm.h"
#include "JitAsm.h"
#include "../../HW/Memmap.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Thunk.h"
#include "x64Analyzer.h"
#include "StringUtil.h"
#include "Jit.h"
using namespace Gen;
extern u8 *trampolineCodePtr;
void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
u64 code_addr = (u64)codePtr;
disassembler disasm;
char disbuf[256];
memset(disbuf, 0, 256);
#ifdef _M_IX86
disasm.disasm32(0, code_addr, codePtr, disbuf);
#else
disasm.disasm64(0, code_addr, codePtr, disbuf);
#endif
PanicAlert("%s\n\n"
"Error encountered accessing emulated address %08x.\n"
"Culprit instruction: \n%s\nat %08x%08x",
text.c_str(), emAddress, disbuf, (u32)(code_addr >> 32), (u32)code_addr);
return;
}
void TrampolineCache::Init()
{
AllocCodeSpace(1024 * 1024);
}
void TrampolineCache::Shutdown()
{
FreeCodeSpace();
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a read. Easy.
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
if (info.displacement) {
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
MOV(32, R(dataReg), R(EAX));
RET();
#endif
return trampoline;
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
if (dataReg != EAX)
PanicAlert("Backpatch write - not through EAX");
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
// hardware access - we can take shortcuts.
//if (emAddress == 0xCC008000)
// PanicAlert("caught a fifo write");
CMP(32, R(addrReg), Imm32(0xCC008000));
FixupBranch skip_fast = J_CC(CC_NE, false);
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
CALL((void*)asm_routines.fifoDirectWrite32);
RET();
SetJumpTarget(skip_fast);
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1) {
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
} else {
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
}
if (info.displacement) {
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
RET();
#endif
return trampoline;
}
// This generates some fairly heavy trampolines, but:
// 1) It's really necessary. We don't know anything about the context.
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
// that many of them in a typical program/game.
const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
{
#ifdef _M_X64
if (!jit.IsInCodeSpace(codePtr))
return 0; // this will become a regular crash real soon after this
InstructionInfo info;
if (!DisassembleMov(codePtr, info, accessType)) {
BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress);
}
/*
if (info.isMemoryWrite) {
if (!Memory::IsRAMAddress(emAddress, true)) {
PanicAlert("Exception: Caught write to invalid address %08x", emAddress);
return;
}
BackPatchError("BackPatch - determined that MOV is write, not yet supported and should have been caught before",
codePtr, emAddress);
}*/
if (info.operandSize != 4) {
BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress);
}
if (info.otherReg != RBX)
PanicAlert("BackPatch : Base reg not RBX."
"\n\nAttempted to access %08x.", emAddress);
if (accessType == OP_ACCESS_WRITE)
PanicAlert("BackPatch : Currently only supporting reads."
"\n\nAttempted to write to %08x.", emAddress);
// In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads.
if (accessType == 0)
{
XEmitter emitter(codePtr);
int bswapNopCount;
// Check the following BSWAP for REX byte
if ((codePtr[info.instructionSize] & 0xF0) == 0x40)
bswapNopCount = 3;
else
bswapNopCount = 2;
const u8 *trampoline = trampolines.GetReadTrampoline(info);
emitter.CALL((void *)trampoline);
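// A CALL rel32 is 5 bytes; NOP out the remainder of the original MOV + BSWAP.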
emitter.NOP((int)info.instructionSize + bswapNopCount - 5);
return codePtr;
}
else if (accessType == 1)
{
// TODO: special case FIFO writes. Also, support 32-bit mode.
// Also, debug this so that it actually works correctly :P
XEmitter emitter(codePtr - 2);
// We know it's EAX so the BSWAP before will be two byte. Overwrite it.
const u8 *trampoline = trampolines.GetWriteTrampoline(info);
emitter.CALL((void *)trampoline);
emitter.NOP((int)info.instructionSize - 3);
if (info.instructionSize < 3)
PanicAlert("instruction too small");
// We entered here with a BSWAP-ed EAX. We'll have to swap it back.
ctx->Rax = Common::swap32((u32)ctx->Rax);
return codePtr - 2;
}
return 0;
#else
return 0;
#endif
}


@@ -0,0 +1,346 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// Enable define below to enable oprofile integration. For this to work,
// it requires at least oprofile version 0.9.4, and changing the build
// system to link the Dolphin executable against libopagent. Since the
// dependency is a little inconvenient and this is possibly a slight
// performance hit, it's not enabled by default, but it's useful for
// locating performance issues.
//#define OPROFILE_REPORT
#include "Common.h"
#include "../../Core.h"
#include "MemoryUtil.h"
#include "../../HW/Memmap.h"
#include "../../CoreTiming.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "../PPCAnalyst.h"
#include "x64Emitter.h"
#include "x64Analyzer.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "disasm.h"
#ifdef OPROFILE_REPORT
#include <opagent.h>
#endif
#ifdef OPROFILE_REPORT
op_agent_t agent;
#endif
using namespace Gen;
#define INVALID_EXIT 0xFFFFFFFF
bool JitBlock::ContainsAddress(u32 em_address)
{
// WARNING - THIS DOES NOT WORK WITH INLINING ENABLED.
return (em_address >= originalAddress && em_address < originalAddress + originalSize);
}
bool JitBlockCache::IsFull() const
{
return GetNumBlocks() >= MAX_NUM_BLOCKS - 1;
}
void JitBlockCache::Init()
{
MAX_NUM_BLOCKS = 65536*2;
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
MAX_NUM_BLOCKS = 65536*8;
}
#ifdef OPROFILE_REPORT
agent = op_open_agent();
#endif
blocks = new JitBlock[MAX_NUM_BLOCKS];
blockCodePointers = new const u8*[MAX_NUM_BLOCKS];
Clear();
}
void JitBlockCache::Shutdown()
{
delete [] blocks;
delete [] blockCodePointers;
blocks = 0;
blockCodePointers = 0;
num_blocks = 0;
#ifdef OPROFILE_REPORT
op_close_agent(agent);
#endif
}
// This clears the JIT cache. It's called from Jit.cpp when the JIT cache
// is full and when saving and loading states.
void JitBlockCache::Clear()
{
Core::DisplayMessage("Cleared code cache.", 3000);
// Is destroying the blocks really necessary?
for (int i = 0; i < num_blocks; i++)
{
DestroyBlock(i, false);
}
links_to.clear();
num_blocks = 0;
memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS);
}
void JitBlockCache::DestroyBlocksWithFlag(BlockFlag death_flag)
{
for (int i = 0; i < num_blocks; i++)
{
if (blocks[i].flags & death_flag)
{
DestroyBlock(i, false);
}
}
}
void JitBlockCache::Reset()
{
Shutdown();
Init();
}
JitBlock *JitBlockCache::GetBlock(int no)
{
return &blocks[no];
}
int JitBlockCache::GetNumBlocks() const
{
return num_blocks;
}
bool JitBlockCache::RangeIntersect(int s1, int e1, int s2, int e2) const
{
// check if any endpoint is inside the other range
if ((s1 >= s2 && s1 <= e2) ||
(e1 >= s2 && e1 <= e2) ||
(s2 >= s1 && s2 <= e1) ||
(e2 >= s1 && e2 <= e1))
return true;
else
return false;
}
int JitBlockCache::AllocateBlock(u32 em_address)
{
JitBlock &b = blocks[num_blocks];
b.invalid = false;
b.originalAddress = em_address;
b.originalFirstOpcode = Memory::ReadFast32(em_address);
b.exitAddress[0] = INVALID_EXIT;
b.exitAddress[1] = INVALID_EXIT;
b.exitPtrs[0] = 0;
b.exitPtrs[1] = 0;
b.linkStatus[0] = false;
b.linkStatus[1] = false;
num_blocks++; //commit the current block
return num_blocks - 1;
}
void JitBlockCache::FinalizeBlock(int block_num, bool block_link, const u8 *code_ptr)
{
blockCodePointers[block_num] = code_ptr;
JitBlock &b = blocks[block_num];
Memory::WriteUnchecked_U32((JIT_OPCODE << 26) | block_num, blocks[block_num].originalAddress);
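// The dispatcher (JitAsm.cpp) reads this word back out of PPC memory: the top
// 6 bits must equal JIT_OPCODE and the low 26 bits index blockCodePointers.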
if (block_link)
{
for (int i = 0; i < 2; i++)
{
if (b.exitAddress[i] != INVALID_EXIT)
links_to.insert(std::pair<u32, int>(b.exitAddress[i], block_num));
}
LinkBlock(block_num);
LinkBlockExits(block_num);
}
#ifdef OPROFILE_REPORT
char buf[100];
sprintf(buf, "EmuCode%x", b.originalAddress);
const u8* blockStart = blockCodePointers[block_num];
op_write_native_code(agent, buf, (uint64_t)blockStart,
blockStart, b.codeSize);
#endif
}
const u8 **JitBlockCache::GetCodePointers()
{
return blockCodePointers;
}
int JitBlockCache::GetBlockNumberFromStartAddress(u32 addr)
{
if (!blocks)
return -1;
u32 code = Memory::ReadFast32(addr);
if ((code >> 26) == JIT_OPCODE)
{
// Jitted code.
unsigned int block = code & 0x03FFFFFF;
if (block >= (unsigned int)num_blocks) {
return -1;
}
if (blocks[block].originalAddress != addr)
{
//_assert_msg_(DYNA_REC, 0, "GetBlockFromAddress %08x - No match - This is BAD", addr);
return -1;
}
return block;
}
else
{
return -1;
}
}
void JitBlockCache::GetBlockNumbersFromAddress(u32 em_address, std::vector<int> *block_numbers)
{
for (int i = 0; i < num_blocks; i++)
if (blocks[i].ContainsAddress(em_address))
block_numbers->push_back(i);
}
u32 JitBlockCache::GetOriginalCode(u32 address)
{
int num = GetBlockNumberFromStartAddress(address);
if (num == -1)
return Memory::ReadUnchecked_U32(address);
else
return blocks[num].originalFirstOpcode;
}
CompiledCode JitBlockCache::GetCompiledCodeFromBlock(int blockNumber)
{
return (CompiledCode)blockCodePointers[blockNumber];
}
//Block linker
//Make sure to have as many blocks as possible compiled before calling this
//It's O(N), so it's fast :)
//Can be faster by doing a queue for blocks to link up, and only process those
//Should probably be done
void JitBlockCache::LinkBlockExits(int i)
{
JitBlock &b = blocks[i];
if (b.invalid)
{
// This block is dead. Don't relink it.
return;
}
for (int e = 0; e < 2; e++)
{
if (b.exitAddress[e] != INVALID_EXIT && !b.linkStatus[e])
{
int destinationBlock = GetBlockNumberFromStartAddress(b.exitAddress[e]);
if (destinationBlock != -1)
{
XEmitter emit(b.exitPtrs[e]);
emit.JMP(blocks[destinationBlock].checkedEntry, true);
b.linkStatus[e] = true;
}
}
}
}
using namespace std;
void JitBlockCache::LinkBlock(int i)
{
LinkBlockExits(i);
JitBlock &b = blocks[i];
std::map<u32, int>::iterator iter;
pair<multimap<u32, int>::iterator, multimap<u32, int>::iterator> ppp;
// equal_range(b) returns pair<iterator,iterator> representing the range
// of element with key b
ppp = links_to.equal_range(b.originalAddress);
if (ppp.first == ppp.second)
return;
for (multimap<u32, int>::iterator iter2 = ppp.first; iter2 != ppp.second; ++iter2) {
// PanicAlert("Linking block %i to block %i", iter2->second, i);
LinkBlockExits(iter2->second);
}
}
void JitBlockCache::DestroyBlock(int blocknum, bool invalidate)
{
u32 codebytes = (JIT_OPCODE << 26) | blocknum; //generate from i
JitBlock &b = blocks[blocknum];
b.invalid = 1;
if (codebytes == Memory::ReadFast32(b.originalAddress))
{
//nobody has changed it, good
Memory::WriteUnchecked_U32(b.originalFirstOpcode, b.originalAddress);
}
else if (!invalidate)
{
//PanicAlert("Detected code overwrite");
//else, we may be in trouble, since we apparently know of this block but it's been
//overwritten. We should have thrown it out before, on instruction cache invalidate or something.
//Not necessarily bad though, if a game has simply thrown away a lot of code and is now using the space
//for something else, then it's fine.
LOG(MASTER_LOG, "WARNING - ClearCache detected code overwrite @ %08x", blocks[blocknum].originalAddress);
}
// We don't unlink blocks, we just send anyone who tries to run them back to the dispatcher.
// Not entirely ideal, but .. pretty good.
// TODO - make sure that the below stuff really is safe.
// Spurious entrances from previously linked blocks can only come through checkedEntry
XEmitter emit((u8 *)b.checkedEntry);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
emit.SetCodePtr((u8 *)blockCodePointers[blocknum]);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
}
void JitBlockCache::InvalidateCodeRange(u32 address, u32 length)
{
if (!jit.jo.enableBlocklink)
return;
return; // FIXME: this early return disables the invalidation loop below
//This is slow but should be safe (zelda needs it for block linking)
for (int i = 0; i < num_blocks; i++)
{
if (RangeIntersect(blocks[i].originalAddress, blocks[i].originalAddress + blocks[i].originalSize,
address, address + length))
{
DestroyBlock(i, true);
}
}
}


@@ -0,0 +1,116 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITCACHE_H
#define _JITCACHE_H
#include <map>
#include <vector>
#include "../Gekko.h"
#include "../PPCAnalyst.h"
#ifdef _WIN32
#include <windows.h>
#endif
enum BlockFlag
{
BLOCK_USE_GQR0 = 0x1, BLOCK_USE_GQR1 = 0x2, BLOCK_USE_GQR2 = 0x4, BLOCK_USE_GQR3 = 0x8,
BLOCK_USE_GQR4 = 0x10, BLOCK_USE_GQR5 = 0x20, BLOCK_USE_GQR6 = 0x40, BLOCK_USE_GQR7 = 0x80,
};
// TODO(ector) - optimize this struct for size
struct JitBlock
{
u32 exitAddress[2]; // 0xFFFFFFFF == unknown
u8 *exitPtrs[2]; // to be able to rewrite the exit jump
bool linkStatus[2];
u32 originalAddress;
u32 originalFirstOpcode; //to be able to restore
u32 codeSize;
u32 originalSize;
int runCount; // for profiling.
#ifdef _WIN32
// we don't really need to save start and stop
// TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;)
LARGE_INTEGER ticStart; // for profiling - time.
LARGE_INTEGER ticStop; // for profiling - time.
LARGE_INTEGER ticCounter; // for profiling - time.
#endif
const u8 *checkedEntry;
bool invalid;
int flags;
bool ContainsAddress(u32 em_address);
};
typedef void (*CompiledCode)();
class JitBlockCache
{
const u8 **blockCodePointers;
JitBlock *blocks;
int num_blocks;
std::multimap<u32, int> links_to;
int MAX_NUM_BLOCKS;
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
void LinkBlockExits(int i);
void LinkBlock(int i);
public:
JitBlockCache() {}
int AllocateBlock(u32 em_address);
void FinalizeBlock(int block_num, bool block_link, const u8 *code_ptr);
void Clear();
void Init();
void Shutdown();
void Reset();
bool IsFull() const;
// Code Cache
JitBlock *GetBlock(int block_num);
int GetNumBlocks() const;
const u8 **GetCodePointers();
// Fast way to get a block. Only works on the first ppc instruction of a block.
int GetBlockNumberFromStartAddress(u32 em_address);
// slower, but can get numbers from within blocks, not just the first instruction.
// WARNING! WILL NOT WORK WITH INLINING ENABLED (not yet a feature but will be soon)
// Returns a list of block numbers - only one block can start at a particular address, but they CAN overlap.
// This one is slow so should only be used for one-shots from the debugger UI, not for anything during runtime.
void GetBlockNumbersFromAddress(u32 em_address, std::vector<int> *block_numbers);
u32 GetOriginalCode(u32 address);
CompiledCode GetCompiledCodeFromBlock(int blockNumber);
// DOES NOT WORK CORRECTLY WITH INLINING
void InvalidateCodeRange(u32 em_address, u32 length);
void DestroyBlock(int blocknum, bool invalidate);
// Not currently used
void DestroyBlocksWithFlag(BlockFlag death_flag);
};
#endif

View file

@ -0,0 +1,395 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "../PPCAnalyst.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
using namespace Gen;
using namespace PowerPC;
void RegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
for (int i = 0; i < NUMXREGS; i++)
{
xregs[i].free = true;
xregs[i].dirty = false;
xlocks[i] = false;
}
for (int i = 0; i < 32; i++)
{
regs[i].location = GetDefaultLocation(i);
regs[i].away = false;
}
// todo: sort to find the most popular regs
/*
int maxPreload = 2;
for (int i = 0; i < 32; i++)
{
if (stats.numReads[i] > 2 || stats.numWrites[i] >= 2)
{
LoadToX64(i, true, false); //stats.firstRead[i] <= stats.firstWrite[i], false);
maxPreload--;
if (!maxPreload)
break;
}
}*/
//Find top regs - preload them (load bursts ain't bad)
//But only preload IF written OR reads >= 3
}
// these are powerpc reg indices
void RegCache::Lock(int p1, int p2, int p3, int p4)
{
locks[p1] = true;
if (p2 != 0xFF) locks[p2] = true;
if (p3 != 0xFF) locks[p3] = true;
if (p4 != 0xFF) locks[p4] = true;
}
// these are x64 reg indices
void RegCache::LockX(int x1, int x2, int x3, int x4)
{
if (xlocks[x1]) {
PanicAlert("RegCache: x %i already locked!");
}
xlocks[x1] = true;
if (x2 != 0xFF) xlocks[x2] = true;
if (x3 != 0xFF) xlocks[x3] = true;
if (x4 != 0xFF) xlocks[x4] = true;
}
bool RegCache::IsFreeX(int xreg) const
{
return xregs[xreg].free && !xlocks[xreg];
}
void RegCache::UnlockAll()
{
for (int i = 0; i < 32; i++)
locks[i] = false;
}
void RegCache::UnlockAllX()
{
for (int i = 0; i < NUMXREGS; i++)
xlocks[i] = false;
}
X64Reg RegCache::GetFreeXReg()
{
int aCount;
const int *aOrder = GetAllocationOrder(aCount);
for (int i = 0; i < aCount; i++)
{
X64Reg xr = (X64Reg)aOrder[i];
if (!xlocks[xr] && xregs[xr].free)
{
return (X64Reg)xr;
}
}
//Okay, not found :( Force grab one
//TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions
for (int i = 0; i < aCount; i++)
{
X64Reg xr = (X64Reg)aOrder[i];
if (xlocks[xr])
continue;
int preg = xregs[xr].ppcReg;
if (!locks[preg])
{
StoreFromX64(preg);
return xr;
}
}
//Still no dice? Die!
_assert_msg_(DYNA_REC, 0, "Regcache ran out of regs");
return (X64Reg) -1;
}
void RegCache::SaveState()
{
memcpy(saved_locks, locks, sizeof(locks));
memcpy(saved_xlocks, xlocks, sizeof(xlocks));
memcpy(saved_regs, regs, sizeof(regs));
memcpy(saved_xregs, xregs, sizeof(xregs));
}
void RegCache::LoadState()
{
memcpy(xlocks, saved_xlocks, sizeof(xlocks));
memcpy(locks, saved_locks, sizeof(locks));
memcpy(regs, saved_regs, sizeof(regs));
memcpy(xregs, saved_xregs, sizeof(xregs));
}
void RegCache::FlushR(X64Reg reg)
{
if (reg >= NUMXREGS)
PanicAlert("Flushing non existent reg");
if (!xregs[reg].free)
{
StoreFromX64(xregs[reg].ppcReg);
}
}
void RegCache::SanityCheck() const
{
for (int i = 0; i < 32; i++) {
if (regs[i].away) {
if (regs[i].location.IsSimpleReg()) {
Gen::X64Reg simple = regs[i].location.GetSimpleReg();
if (xlocks[simple]) {
PanicAlert("%08x : PPC Reg %i is in locked x64 register %i", /*js.compilerPC*/ 0, i, regs[i].location.GetSimpleReg());
}
if (xregs[simple].ppcReg != i) {
PanicAlert("%08x : Xreg/ppcreg mismatch");
}
}
}
}
}
void RegCache::DiscardRegContentsIfCached(int preg)
{
if (regs[preg].away && regs[preg].location.IsSimpleReg())
{
xregs[regs[preg].location.GetSimpleReg()].free = true;
xregs[regs[preg].location.GetSimpleReg()].dirty = false;
regs[preg].away = false;
}
}
void GPRRegCache::SetImmediate32(int preg, u32 immValue)
{
DiscardRegContentsIfCached(preg);
regs[preg].away = true;
regs[preg].location = Imm32(immValue);
}
void GPRRegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
RegCache::Start(stats);
}
void FPURegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
RegCache::Start(stats);
}
const int *GPRRegCache::GetAllocationOrder(int &count)
{
static const int allocationOrder[] =
{
#ifdef _M_X64
#ifdef _WIN32
RSI, RDI, R12, R13, R14, R8, R9, R10, R11 //, RCX
#else
RBP, R12, R13, R14, R8, R9, R10, R11, //, RCX
#endif
#elif _M_IX86
ESI, EDI, EBX, EBP, EDX, ECX,
#endif
};
count = sizeof(allocationOrder) / sizeof(int);
return allocationOrder;
}
const int *FPURegCache::GetAllocationOrder(int &count)
{
static const int allocationOrder[] =
{
#ifdef _M_X64
XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5
#elif _M_IX86
XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
#endif
};
count = sizeof(allocationOrder) / sizeof(int);
return allocationOrder;
}
OpArg GPRRegCache::GetDefaultLocation(int reg) const
{
return M(&ppcState.gpr[reg]);
}
OpArg FPURegCache::GetDefaultLocation(int reg) const
{
return M(&ppcState.ps[reg][0]);
}
void RegCache::KillImmediate(int preg)
{
if (regs[preg].away && regs[preg].location.IsImm())
{
LoadToX64(preg, true, true);
}
}
void GPRRegCache::LoadToX64(int i, bool doLoad, bool makeDirty)
{
PanicAlert("BADNESS!");
if (!regs[i].away && regs[i].location.IsImm())
PanicAlert("Bad immedaite");
if (!regs[i].away || (regs[i].away && regs[i].location.IsImm()))
{
X64Reg xr = GetFreeXReg();
if (xregs[xr].dirty) PanicAlert("Xreg already dirty");
if (xlocks[xr]) PanicAlert("GetFreeXReg returned locked register");
xregs[xr].free = false;
xregs[xr].ppcReg = i;
xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
OpArg newloc = ::Gen::R(xr);
if (doLoad)
emit->MOV(32, newloc, regs[i].location);
for (int j = 0; j < 32; j++)
{
if (i != j && regs[j].location.IsSimpleReg() && regs[j].location.GetSimpleReg() == xr)
{
Crash();
}
}
regs[i].away = true;
regs[i].location = newloc;
}
else
{
// reg location must be simplereg; memory locations
// and immediates are taken care of above.
xregs[RX(i)].dirty |= makeDirty;
}
if (xlocks[RX(i)]) {
PanicAlert("Seriously WTF, this reg should have been flushed");
}
}
void GPRRegCache::StoreFromX64(int i)
{
if (regs[i].away)
{
bool doStore;
if (regs[i].location.IsSimpleReg())
{
X64Reg xr = RX(i);
xregs[xr].free = true;
xregs[xr].ppcReg = -1;
doStore = xregs[xr].dirty;
xregs[xr].dirty = false;
}
else
{
//must be an immediate - there is no x64 reg to free, but the value still has to be written back
doStore = true;
}
OpArg newLoc = GetDefaultLocation(i);
// if (doStore) //<-- Breaks JIT compilation
emit->MOV(32, newLoc, regs[i].location);
regs[i].location = newLoc;
regs[i].away = false;
}
}
void FPURegCache::LoadToX64(int i, bool doLoad, bool makeDirty)
{
_assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "WTF - load - imm");
if (!regs[i].away)
{
// Reg is at home in the memory register file. Let's pull it out.
X64Reg xr = GetFreeXReg();
_assert_msg_(DYNA_REC, xr >= 0 && xr < NUMXREGS, "WTF - load - invalid reg");
xregs[xr].ppcReg = i;
xregs[xr].free = false;
xregs[xr].dirty = makeDirty;
OpArg newloc = ::Gen::R(xr);
if (doLoad)
{
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF))
{
PanicAlert("WARNING - misaligned fp register location %i", i);
}
emit->MOVAPD(xr, regs[i].location);
}
regs[i].location = newloc;
regs[i].away = true;
} else {
// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
xregs[RX(i)].dirty |= makeDirty;
}
}
void FPURegCache::StoreFromX64(int i)
{
_assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "WTF - store - imm");
if (regs[i].away)
{
X64Reg xr = regs[i].location.GetSimpleReg();
_assert_msg_(DYNA_REC, xr >= 0 && xr < NUMXREGS, "WTF - store - invalid reg");
xregs[xr].free = true;
xregs[xr].dirty = false;
xregs[xr].ppcReg = -1;
OpArg newLoc = GetDefaultLocation(i);
emit->MOVAPD(newLoc, xr);
regs[i].location = newLoc;
regs[i].away = false;
}
else
{
// _assert_msg_(DYNA_REC,0,"already stored");
}
}
void RegCache::Flush(FlushMode mode)
{
for (int i = 0; i < NUMXREGS; i++) {
if (xlocks[i])
PanicAlert("Somone forgot to unlock X64 reg %i.", i);
}
for (int i = 0; i < 32; i++)
{
if (locks[i])
{
PanicAlert("Somebody forgot to unlock PPC reg %i.", i);
}
if (regs[i].away)
{
if (regs[i].location.IsSimpleReg())
{
X64Reg xr = RX(i);
StoreFromX64(i);
xregs[xr].dirty = false;
}
else if (regs[i].location.IsImm())
{
StoreFromX64(i);
}
else
{
_assert_msg_(DYNA_REC,0,"Jit64 - Flush unhandled case, reg %i", i);
}
}
}
}

View file

@ -0,0 +1,150 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITREGCACHE_H
#define _JITREGCACHE_H
#include "x64Emitter.h"
using namespace Gen;
enum FlushMode
{
FLUSH_ALL
};
enum GrabMode
{
M_READ = 1,
M_WRITE = 2,
M_READWRITE = 3,
};
struct PPCCachedReg
{
OpArg location;
bool away;  // value is not at its home location in ppcState (it's in an x64 reg or an immediate)
};
struct X64CachedReg
{
int ppcReg;
bool dirty;
bool free;
};
typedef int XReg;
typedef int PReg;
#ifdef _M_X64
#define NUMXREGS 16
#elif _M_IX86
#define NUMXREGS 8
#endif
class RegCache
{
private:
bool locks[32];
bool saved_locks[32];
bool saved_xlocks[NUMXREGS];
protected:
bool xlocks[NUMXREGS];
PPCCachedReg regs[32];
X64CachedReg xregs[NUMXREGS];
PPCCachedReg saved_regs[32];
X64CachedReg saved_xregs[NUMXREGS];
void DiscardRegContentsIfCached(int preg);
virtual const int *GetAllocationOrder(int &count) = 0;
XEmitter *emit;
public:
virtual ~RegCache() {}
virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0;
void SetEmitter(XEmitter *emitter) {emit = emitter;}
void FlushR(X64Reg reg);
void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);}
void FlushLockX(X64Reg reg) {
FlushR(reg);
LockX(reg);
}
void FlushLockX(X64Reg reg1, X64Reg reg2) {
FlushR(reg1); FlushR(reg2);
LockX(reg1); LockX(reg2);
}
virtual void Flush(FlushMode mode);
virtual void Flush(PPCAnalyst::CodeOp *op) {Flush(FLUSH_ALL);}
void SanityCheck() const;
void KillImmediate(int preg);
//TODO - instead of doload, use "read", "write"
//read only will not set dirty flag
virtual void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true) = 0;
virtual void StoreFromX64(int preg) = 0;
const OpArg &R(int preg) const {return regs[preg].location;}
X64Reg RX(int preg) const
{
if (regs[preg].away && regs[preg].location.IsSimpleReg())
return regs[preg].location.GetSimpleReg();
PanicAlert("Not so simple - %i", preg);
return (X64Reg)-1;
}
virtual OpArg GetDefaultLocation(int reg) const = 0;
// Register locking.
void Lock(int p1, int p2=0xff, int p3=0xff, int p4=0xff);
void LockX(int x1, int x2=0xff, int x3=0xff, int x4=0xff);
void UnlockAll();
void UnlockAllX();
bool IsFreeX(int xreg) const;
X64Reg GetFreeXReg();
void SaveState();
void LoadState();
};
class GPRRegCache : public RegCache
{
public:
void Start(PPCAnalyst::BlockRegStats &stats);
void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true);
void StoreFromX64(int preg);
OpArg GetDefaultLocation(int reg) const;
const int *GetAllocationOrder(int &count);
void SetImmediate32(int preg, u32 immValue);
};
class FPURegCache : public RegCache
{
public:
void Start(PPCAnalyst::BlockRegStats &stats);
void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true);
void StoreFromX64(int preg);
const int *GetAllocationOrder(int &count);
OpArg GetDefaultLocation(int reg) const;
};
#endif

View file

@ -0,0 +1,200 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "Thunk.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "Jit.h"
#include "JitRegCache.h"
#include "JitCache.h"
#include "JitAsm.h"
// The branches are known good, or at least reasonably good.
// No need for a disable-mechanism.
// If defined, clears CR0 at blr and bl instructions. If the assumption that
// flags never carry over between functions holds, then the task for
// an optimizer becomes much easier.
// #define ACID_TEST
// Zelda and many more games seem to pass the Acid Test.
using namespace Gen;
void Jit64::sc(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff)
{Default(inst); return;} // turn off from debugger
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
WriteExceptionExit(EXCEPTION_SYSCALL);
}
void Jit64::rfi(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff)
{Default(inst); return;} // turn off from debugger
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
//Bits SRR1[0, 5-9, 16-23, 25-27, 30-31] are placed into the corresponding bits of the MSR.
//MSR[13] is set to 0.
const u32 mask = 0x87C0FF73;
// MSR = (MSR & ~mask) | (SRR1 & mask);
MOV(32, R(EAX), M(&MSR));
MOV(32, R(ECX), M(&SRR1));
AND(32, R(EAX), Imm32(~mask));
AND(32, R(ECX), Imm32(mask));
OR(32, R(EAX), R(ECX));
// MSR &= 0xFFFDFFFF; //TODO: VERIFY
AND(32, R(EAX), Imm32(0xFFFDFFFF));
MOV(32, M(&MSR), R(EAX));
// NPC = SRR0;
MOV(32, R(EAX), M(&SRR0));
WriteRfiExitDestInEAX();
}
void Jit64::bx(UGeckoInstruction inst)
{
if (inst.LK)
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = js.compilerPC + SignExt26(inst.LI << 2);
ibuild.EmitBranchUncond(ibuild.EmitIntConst(destination));
}
// TODO - optimize to hell and beyond
// TODO - make nice easy to optimize special cases for the most common
// variants of this instruction.
void Jit64::bcx(UGeckoInstruction inst)
{
if (inst.LK)
ibuild.EmitStoreLink(
ibuild.EmitIntConst(js.compilerPC + 4));
IREmitter::InstLoc CRTest = 0, CTRTest = 0;
if ((inst.BO & 16) == 0) // Test a CR bit
{
IREmitter::InstLoc CRReg = ibuild.EmitLoadCR(inst.BI >> 2);
IREmitter::InstLoc CRCmp = ibuild.EmitIntConst(8 >> (inst.BI & 3));
CRTest = ibuild.EmitAnd(CRReg, CRCmp);
if (inst.BO & 8)
CRTest = ibuild.EmitXor(CRTest, CRCmp);
}
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) {
IREmitter::InstLoc c = ibuild.EmitLoadCTR();
c = ibuild.EmitSub(c, ibuild.EmitIntConst(1));
ibuild.EmitStoreCTR(c);
}
if ((inst.BO & 4) == 0) {
IREmitter::InstLoc c = ibuild.EmitLoadCTR();
if (!(inst.BO & 2)) {
CTRTest = ibuild.EmitICmpEq(c,
ibuild.EmitIntConst(0));
} else {
CTRTest = c;
}
}
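// Both tests use an inverted convention: a non-zero value means "don't take
// the branch". EmitBranchCond is therefore assumed to branch only when its
// operand is zero, which is why the CR and CTR failure conditions can be
// combined with a simple OR below.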
IREmitter::InstLoc Test = CRTest;
if (CTRTest) {
if (Test)
Test = ibuild.EmitOr(Test, CTRTest);
else
Test = CTRTest;
}
if (!Test) {
PanicAlert("Unconditional conditional branch?!");
}
u32 destination;
if(inst.AA)
destination = SignExt16(inst.BD << 2);
else
destination = js.compilerPC + SignExt16(inst.BD << 2);
ibuild.EmitBranchCond(Test, ibuild.EmitIntConst(destination));
ibuild.EmitBranchUncond(ibuild.EmitIntConst(js.compilerPC + 4));
}
void Jit64::bcctrx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff)
{Default(inst); return;} // turn off from debugger
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
// bool fastway = true;
if ((inst.BO & 16) == 0)
{
PanicAlert("Bizarro bcctrx %08x, not supported.", inst.hex);
_assert_msg_(DYNA_REC, 0, "Bizarro bcctrx");
/*
fastway = false;
MOV(32, M(&PC), Imm32(js.compilerPC+4));
MOV(32, R(EAX), M(&CR));
XOR(32, R(ECX), R(ECX));
AND(32, R(EAX), Imm32(0x80000000 >> inst.BI));
CCFlags branch;
if(inst.BO & 8)
branch = CC_NZ;
else
branch = CC_Z;
*/
// TODO(ector): Why is this commented out?
//SETcc(branch, R(ECX));
// check for EBX
//TEST(32, R(ECX), R(ECX));
//linkEnd = J_CC(branch);
}
// NPC = CTR & 0xfffffffc;
MOV(32, R(EAX), M(&CTR));
if (inst.LK)
MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4;
AND(32, R(EAX), Imm32(0xFFFFFFFC));
WriteExitDestInEAX(0);
}
void Jit64::bclrx(UGeckoInstruction inst)
{
if (inst.hex == 0x4e800020) {
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
return;
}
Default(inst);
return;
}

View file

@ -0,0 +1,224 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
// Sources are distinct from d, so we can use a fairly quick path
fpr.LoadToX64(d, !dupe);
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, !dupe);
MOVSD(XMM0, fpr.R(b));
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else // Other combo, must use two temps :(
{
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM1, fpr.R(b));
fpr.LoadToX64(d, !dupe);
(this->*op)(XMM0, Gen::R(XMM1));
MOVSD(fpr.RX(d), Gen::R(XMM0));
}
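// The single-precision forms round the result and replicate it into both
// halves of the paired single, hence the MOVDDUP below.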
if (dupe) {
ForceSinglePrecisionS(fpr.RX(d));
MOVDDUP(fpr.RX(d), fpr.R(d));
}
fpr.UnlockAll();
}
void Jit64::fp_arith_s(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
case 23: //sel
Default(inst);
break;
case 24: //res
Default(inst);
break;
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
}
void Jit64::fmaddXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
bool single_precision = inst.OPCD == 59;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
fpr.Lock(a, b, c, d);
MOVSD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 28: //msub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
break;
case 29: //madd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
case 31: //nmadd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
}
fpr.LoadToX64(d, false);
//YES it is necessary to dupe the result :(
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
if (single_precision) {
ForceSinglePrecisionS(XMM0);
MOVDDUP(fpr.RX(d), R(XMM0));
} else {
MOVSD(fpr.RX(d), R(XMM0));
}
fpr.UnlockAll();
}
void Jit64::fmrx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.LoadToX64(d, true); // we don't want to destroy the high bit
MOVSD(fpr.RX(d), fpr.R(b));
}
void Jit64::fcmpx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (jo.fpAccurateFlags)
{
Default(inst);
return;
}
bool ordered = inst.SUBOP10 == 32;
/*
double fa = rPS0(_inst.FA);
double fb = rPS0(_inst.FB);
u32 compareResult;
if(IsNAN(fa) || IsNAN(fb)) compareResult = 1;
else if(fa < fb) compareResult = 8;
else if(fa > fb) compareResult = 4;
else compareResult = 2;
FPSCR.FPRF = compareResult;
CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4));
*/
int a = inst.FA;
int b = inst.FB;
int crf = inst.CRFD;
int shift = crf * 4;
//FPSCR
//XOR(32,R(EAX),R(EAX));
fpr.Lock(a,b);
if (a != b)
fpr.LoadToX64(a, true);
// USES_CR
if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else
UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A);
// fa == fb
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J();
// fa > fb
SetJumpTarget(pGreater);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J();
// fa < fb
SetJumpTarget(pLesser);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1);
SetJumpTarget(continue2);
fpr.UnlockAll();
}

View file

@ -0,0 +1,520 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "../../Core.h" // include "Common.h", "CoreParameter.h", SCoreStartupParameter
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
#include "JitAsm.h"
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
static void ComputeRC(IREmitter::IRBuilder& ibuild,
IREmitter::InstLoc val) {
IREmitter::InstLoc res =
ibuild.EmitICmpCRSigned(val, ibuild.EmitIntConst(0));
ibuild.EmitStoreCR(res, 0);
}
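// Every Rc-form integer op below funnels its result through ComputeRC: a
// signed compare of the value against zero whose CR-style result is stored
// into CR0.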
void Jit64::reg_imm(UGeckoInstruction inst)
{
int d = inst.RD, a = inst.RA, s = inst.RS;
IREmitter::InstLoc val, test, c;
switch (inst.OPCD)
{
case 14: //addi
val = ibuild.EmitIntConst(inst.SIMM_16);
if (a)
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), val);
ibuild.EmitStoreGReg(val, d);
break;
case 15: //addis
val = ibuild.EmitIntConst(inst.SIMM_16 << 16);
if (a)
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), val);
ibuild.EmitStoreGReg(val, d);
break;
case 24: //ori
val = ibuild.EmitIntConst(inst.UIMM);
val = ibuild.EmitOr(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 25: //oris
val = ibuild.EmitIntConst(inst.UIMM << 16);
val = ibuild.EmitOr(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 28: //andi
val = ibuild.EmitIntConst(inst.UIMM);
val = ibuild.EmitAnd(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
ComputeRC(ibuild, val);
break;
case 29: //andis
val = ibuild.EmitIntConst(inst.UIMM << 16);
val = ibuild.EmitAnd(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
ComputeRC(ibuild, val);
break;
case 26: //xori
val = ibuild.EmitIntConst(inst.UIMM);
val = ibuild.EmitXor(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 27: //xoris
val = ibuild.EmitIntConst(inst.UIMM << 16);
val = ibuild.EmitXor(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 12: //addic
case 13: //addic_rc
c = ibuild.EmitIntConst(inst.SIMM_16);
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), c);
ibuild.EmitStoreGReg(val, d);
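// A carry out of the 32-bit add occurred iff the unsigned result wrapped,
// i.e. val < c; ICmpUgt(c, val) captures exactly that.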
test = ibuild.EmitICmpUgt(c, val);
ibuild.EmitStoreCarry(test);
if (inst.OPCD == 13)
ComputeRC(ibuild, val);
break;
default:
Default(inst);
break;
}
}
void Jit64::cmpXX(UGeckoInstruction inst)
{
IREmitter::InstLoc lhs, rhs, res;
lhs = ibuild.EmitLoadGReg(inst.RA);
if (inst.OPCD == 31) {
rhs = ibuild.EmitLoadGReg(inst.RB);
if (inst.SUBOP10 == 32) {
res = ibuild.EmitICmpCRUnsigned(lhs, rhs);
} else {
res = ibuild.EmitICmpCRSigned(lhs, rhs);
}
} else if (inst.OPCD == 10) {
rhs = ibuild.EmitIntConst(inst.UIMM);
res = ibuild.EmitICmpCRUnsigned(lhs, rhs);
} else { // inst.OPCD == 11
rhs = ibuild.EmitIntConst(inst.SIMM_16);
res = ibuild.EmitICmpCRSigned(lhs, rhs);
}
ibuild.EmitStoreCR(res, inst.CRFD);
}
void Jit64::orx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitOr(ibuild.EmitLoadGReg(inst.RS), val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
// m_GPR[_inst.RA] = m_GPR[_inst.RS] ^ m_GPR[_inst.RB];
void Jit64::xorx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitXor(ibuild.EmitLoadGReg(inst.RS), val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::andx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitAnd(ibuild.EmitLoadGReg(inst.RS), val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::extsbx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitSExt8(val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::extshx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitSExt16(val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::subfic(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, d = inst.RD;
gpr.FlushLockX(ECX);
gpr.Lock(a, d);
gpr.LoadToX64(d, a == d, true);
int imm = inst.SIMM_16;
MOV(32, R(EAX), gpr.R(a));
NOT(32, R(EAX));
ADD(32, R(EAX), Imm32(imm + 1));
MOV(32, gpr.R(d), R(EAX));
//GenerateCarry(ECX);
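// FIXME: subfic is defined to update CA, but the carry computation is
// commented out above, so XER[CA] is left stale here.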
gpr.UnlockAll();
gpr.UnlockAllX();
// This instruction has no RC flag
}
void Jit64::subfcx(UGeckoInstruction inst)
{
INSTRUCTION_START;
Default(inst);
return;
/*
u32 a = m_GPR[_inst.RA];
u32 b = m_GPR[_inst.RB];
m_GPR[_inst.RD] = b - a;
SetCarry(a == 0 || Helper_Carry(b, 0-a));
if (_inst.OE) PanicAlert("OE: subfcx");
if (_inst.Rc) Helper_UpdateCR0(m_GPR[_inst.RD]);
*/
}
void Jit64::subfex(UGeckoInstruction inst)
{
INSTRUCTION_START;
Default(inst);
return;
/*
u32 a = m_GPR[_inst.RA];
u32 b = m_GPR[_inst.RB];
int carry = GetCarry();
m_GPR[_inst.RD] = (~a) + b + carry;
SetCarry(Helper_Carry(~a, b) || Helper_Carry((~a) + b, carry));
if (_inst.OE) PanicAlert("OE: subfcx");
if (_inst.Rc) Helper_UpdateCR0(m_GPR[_inst.RD]);
*/
}
void Jit64::subfx(UGeckoInstruction inst)
{
if (inst.OE) PanicAlert("OE: subfx");
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitSub(val, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::mulli(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RA);
val = ibuild.EmitMul(val, ibuild.EmitIntConst(inst.SIMM_16));
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::mullwx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitMul(ibuild.EmitLoadGReg(inst.RA), val);
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::mulhwux(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
if (d != a && d != b) {
gpr.LoadToX64(d, false, true);
} else {
gpr.LoadToX64(d, true, true);
}
if (gpr.RX(d) == EDX)
PanicAlert("mulhwux : WTF");
MOV(32, R(EAX), gpr.R(a));
gpr.KillImmediate(b);
MUL(32, gpr.R(b));
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc) {
MOV(32, R(EAX), R(EDX)); // computeRc expects its input in EAX
MOV(32, gpr.R(d), R(EDX)); // the high half of the product is the result
CALL((u8*)asm_routines.computeRc);
} else {
MOV(32, gpr.R(d), R(EDX));
}
}
// skipped some of the special handling in here - if we get crashes, let the interpreter handle this op
void Jit64::divwux(UGeckoInstruction inst) {
Default(inst); return;
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
if (d != a && d != b) {
gpr.LoadToX64(d, false, true);
} else {
gpr.LoadToX64(d, true, true);
}
MOV(32, R(EAX), gpr.R(a));
XOR(32, R(EDX), R(EDX));
gpr.KillImmediate(b);
DIV(32, gpr.R(b));
MOV(32, gpr.R(d), R(EAX));
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc) {
CALL((u8*)asm_routines.computeRc);
}
}
u32 Helper_Mask(u8 mb, u8 me)
{
u32 mask = ((u32)-1 >> mb) ^ ((me >= 31) ? 0 : ((u32)-1 >> (me + 1)));
return (mb > me) ? ~mask : mask;
}
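// Example (PowerPC MSB-0 bit numbering): Helper_Mask(0, 21) sets bits 0..21,
// giving 0xFFFFFC00; a wrapped range like Helper_Mask(25, 2) sets bits
// 25..31 and 0..2, giving 0xE000007F.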
void Jit64::addx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(inst.RA), val);
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
// This can be optimized
void Jit64::addex(UGeckoInstruction inst)
{
Default(inst); return;
// USES_XER
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.FlushLockX(ECX);
gpr.Lock(a, b, d);
if (d != a && d != b)
gpr.LoadToX64(d, false);
else
gpr.LoadToX64(d, true);
MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER]));
SHR(32, R(EAX), Imm8(30)); // shift the carry flag out into the x86 carry flag
MOV(32, R(EAX), gpr.R(a));
ADC(32, R(EAX), gpr.R(b));
MOV(32, gpr.R(d), R(EAX));
//GenerateCarry(ECX);
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc)
{
CALL((u8*)asm_routines.computeRc);
}
}
void Jit64::rlwinmx(UGeckoInstruction inst)
{
unsigned mask = Helper_Mask(inst.MB, inst.ME);
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitRol(val, ibuild.EmitIntConst(inst.SH));
val = ibuild.EmitAnd(val, ibuild.EmitIntConst(mask));
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::rlwimix(UGeckoInstruction inst)
{
unsigned mask = Helper_Mask(inst.MB, inst.ME);
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitRol(val, ibuild.EmitIntConst(inst.SH));
val = ibuild.EmitAnd(val, ibuild.EmitIntConst(mask));
IREmitter::InstLoc ival = ibuild.EmitLoadGReg(inst.RA);
ival = ibuild.EmitAnd(ival, ibuild.EmitIntConst(~mask));
val = ibuild.EmitOr(ival, val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::rlwnmx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, b = inst.RB, s = inst.RS;
if (gpr.R(a).IsImm())
{
Default(inst);
return;
}
u32 mask = Helper_Mask(inst.MB, inst.ME);
gpr.FlushLockX(ECX);
gpr.Lock(a, b, s);
MOV(32, R(EAX), gpr.R(s));
MOV(32, R(ECX), gpr.R(b));
AND(32, R(ECX), Imm32(0x1f));
ROL(32, R(EAX), R(ECX));
AND(32, R(EAX), Imm32(mask));
MOV(32, gpr.R(a), R(EAX));
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)asm_routines.computeRc);
}
}
void Jit64::negx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RA);
val = ibuild.EmitSub(ibuild.EmitIntConst(0), val);
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::srwx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS),
samt = ibuild.EmitLoadGReg(inst.RB),
corr;
// FIXME: We can do better with a cmov
// FIXME: We can do better on 64-bit
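// PPC srw yields 0 for shift amounts of 32..63, but x86 masks shift counts
// to 5 bits. Build a correction mask: shifting the amount left by 26 puts
// its bit 5 into the sign bit, the arithmetic shift by 31 smears it
// (all-ones iff samt & 32), and the XOR with -1 inverts it, so the final
// AND forces the result to zero for large shifts.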
val = ibuild.EmitShrl(val, samt);
corr = ibuild.EmitShl(samt, ibuild.EmitIntConst(26));
corr = ibuild.EmitSarl(corr, ibuild.EmitIntConst(31));
corr = ibuild.EmitXor(corr, ibuild.EmitIntConst(-1));
val = ibuild.EmitAnd(corr, val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::slwx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS),
samt = ibuild.EmitLoadGReg(inst.RB),
corr;
// FIXME: We can do better with a cmov
// FIXME: We can do better on 64-bit
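// Same >= 32 correction trick as in srwx above.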
val = ibuild.EmitShl(val, samt);
corr = ibuild.EmitShl(samt, ibuild.EmitIntConst(26));
corr = ibuild.EmitSarl(corr, ibuild.EmitIntConst(31));
corr = ibuild.EmitXor(corr, ibuild.EmitIntConst(-1));
val = ibuild.EmitAnd(corr, val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::srawx(UGeckoInstruction inst)
{
// FIXME: We can do a lot better on 64-bit
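// FIXME: the carry test at the bottom inspects the already-shifted value;
// for shift amounts 1..31 it should probably test the original RS value
// instead (see srawix below).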
IREmitter::InstLoc val, samt, mask, mask2, test;
val = ibuild.EmitLoadGReg(inst.RS);
samt = ibuild.EmitLoadGReg(inst.RB);
mask = ibuild.EmitIntConst(-1);
val = ibuild.EmitSarl(val, samt);
mask = ibuild.EmitShl(mask, samt);
samt = ibuild.EmitShl(samt, ibuild.EmitIntConst(26));
samt = ibuild.EmitSarl(samt, ibuild.EmitIntConst(31));
samt = ibuild.EmitAnd(samt, ibuild.EmitIntConst(31));
val = ibuild.EmitSarl(val, samt);
ibuild.EmitStoreGReg(val, inst.RA);
mask = ibuild.EmitShl(mask, samt);
mask2 = ibuild.EmitAnd(mask, ibuild.EmitIntConst(0x7FFFFFFF));
test = ibuild.EmitOr(val, mask2);
test = ibuild.EmitICmpUgt(test, mask);
ibuild.EmitStoreCarry(test);
}
void Jit64::srawix(UGeckoInstruction inst)
{
IREmitter::InstLoc orig = ibuild.EmitLoadGReg(inst.RS), val, test;
val = ibuild.EmitSarl(orig, ibuild.EmitIntConst(inst.SH));
ibuild.EmitStoreGReg(val, inst.RA);
unsigned mask = -1u << inst.SH;
// CA is set iff the source is negative and a nonzero bit was shifted out:
// (orig | (mask & 0x7FFFFFFF)) >u mask holds exactly when orig is negative
// and (orig & ~mask) != 0, so the test must use the original (unshifted) value.
test = ibuild.EmitOr(orig, ibuild.EmitIntConst(mask & 0x7FFFFFFF));
test = ibuild.EmitICmpUgt(test, ibuild.EmitIntConst(mask));
ibuild.EmitStoreCarry(test);
if (inst.Rc)
ComputeRC(ibuild, val);
}
// count leading zeroes
void Jit64::cntlzwx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA;
int s = inst.RS;
if (gpr.R(a).IsImm() || gpr.R(s).IsImm() || s == a)
{
Default(inst);
return;
}
gpr.Lock(a,s);
gpr.LoadToX64(a,false);
BSR(32, gpr.R(a).GetSimpleReg(), gpr.R(s));
FixupBranch gotone = J_CC(CC_NZ);
MOV(32, gpr.R(a), Imm32(63));
SetJumpTarget(gotone);
XOR(32, gpr.R(a), Imm8(0x1f)); // flip order
gpr.UnlockAll();
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)asm_routines.computeRc);
// TODO: Check PPC manual too
}
}

View file

@ -0,0 +1,198 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// TODO(ector): Tons of pshufb optimizations for the loads/stores, SSSE3+ (possibly SSE4) only.
// Should give a very noticeable speed boost to paired-single-heavy code.
#include "Common.h"
#include "Thunk.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
void Jit64::lbzx(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(ibuild.EmitLoad8(addr), inst.RD);
}
void Jit64::lwzx(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(ibuild.EmitLoad32(addr), inst.RD);
}
void Jit64::lhax(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
IREmitter::InstLoc val = ibuild.EmitLoad16(addr);
val = ibuild.EmitSExt16(val);
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::lXz(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
IREmitter::InstLoc val;
switch (inst.OPCD)
{
case 32: val = ibuild.EmitLoad32(addr); break; //lwz
case 40: val = ibuild.EmitLoad16(addr); break; //lhz
case 34: val = ibuild.EmitLoad8(addr); break; //lbz
default: PanicAlert("lXz: invalid access size"); return;
}
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::lha(UGeckoInstruction inst)
{
IREmitter::InstLoc addr =
ibuild.EmitIntConst((s32)(s16)inst.SIMM_16);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
IREmitter::InstLoc val = ibuild.EmitLoad16(addr);
val = ibuild.EmitSExt16(val);
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::lwzux(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA) {
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(addr, inst.RA);
}
ibuild.EmitStoreGReg(ibuild.EmitLoad32(addr), inst.RD);
}
// Zero cache line.
void Jit64::dcbz(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
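// dcbz zeroes one 32-byte cache line: align the effective address down to
// 32 bytes and clear it with two 16-byte SSE stores.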
MOV(32, R(EAX), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
AND(32, R(EAX), Imm32(~31));
XORPD(XMM0, R(XMM0));
#ifdef _M_X64
MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
}
void Jit64::stX(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16),
value = ibuild.EmitLoadGReg(inst.RS);
if (inst.RA)
addr = ibuild.EmitAdd(ibuild.EmitLoadGReg(inst.RA), addr);
if (inst.OPCD & 1)
ibuild.EmitStoreGReg(addr, inst.RA);
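// Masking off the low OPCD bit folds the update forms (stwu=37, sthu=45,
// stbu=39) onto their base opcodes below.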
switch (inst.OPCD & ~1)
{
case 36: ibuild.EmitStore32(value, addr); break; //stw
case 44: ibuild.EmitStore16(value, addr); break; //sth
case 38: ibuild.EmitStore8(value, addr); break; //stb
default: _assert_msg_(DYNA_REC, 0, "stX: invalid store size"); return;
}
}
void Jit64::stXx(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB),
value = ibuild.EmitLoadGReg(inst.RS);
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
if (inst.SUBOP10 & 32)
ibuild.EmitStoreGReg(addr, inst.RA);
switch (inst.SUBOP10 & ~32)
{
case 151: ibuild.EmitStore32(value, addr); break; //stw
case 407: ibuild.EmitStore16(value, addr); break; //sth
case 215: ibuild.EmitStore8(value, addr); break; //stb
default: _assert_msg_(DYNA_REC, 0, "stXx: invalid store size"); return;
}
}
// A few games use these heavily in video codecs.
void Jit64::lmw(UGeckoInstruction inst)
{
#ifdef _M_IX86
Default(inst); return;
#else
gpr.FlushLockX(ECX);
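// EBX/RBX holds the base of the emulated address space on x64; copy words
// for registers RD..r31, byteswapping each through ECX.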
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
BSWAP(32, ECX);
gpr.LoadToX64(i, false, true);
MOV(32, gpr.R(i), R(ECX));
}
gpr.UnlockAllX();
#endif
}
void Jit64::stmw(UGeckoInstruction inst)
{
#ifdef _M_IX86
Default(inst); return;
#else
gpr.FlushLockX(ECX);
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
MOV(32, R(ECX), gpr.R(i));
BSWAP(32, ECX);
MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX));
}
gpr.UnlockAllX();
#endif
}

View file

@ -0,0 +1,322 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// TODO(ector): Tons of pshufb optimizations for the loads/stores, SSSE3+ (possibly SSE4) only.
// Should give a very noticeable speed boost to paired-single-heavy code.
#include "Common.h"
#include "../PowerPC.h"
#include "../../Core.h" // include "Common.h", "CoreParameter.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "CPUDetect.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
// pshufb todo: MOVQ
const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
namespace {
u64 GC_ALIGNED16(temp64);
u32 GC_ALIGNED16(temp32);
}
// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
// and pshufb could help a lot.
// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
void Jit64::lfs(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
if (jo.assumeFPLoadFromMem)
{
UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false);
}
else
{
SafeLoadRegToEAX(ABI_PARAM1, 32, offset);
}
MOV(32, M(&temp32), R(EAX));
fpr.Lock(d);
fpr.LoadToX64(d, false);
CVTSS2SD(fpr.RX(d), M(&temp32));
MOVDDUP(fpr.RX(d), fpr.R(d));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::lfd(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
fpr.LoadToX64(d, true);
fpr.Lock(d);
X64Reg xd = fpr.RX(d);
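// With SSSE3, a single PSHUFB byteswaps the loaded qword in-register (the
// Dupe shuffle also mirrors it into the high half); the non-SSSE3 path goes
// through temp64 with integer BSWAPs instead.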
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
MOVSD(xd, R(XMM0));
} else {
#ifdef _M_X64
MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
BSWAP(64, EAX);
MOV(64, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
BSWAP(32, EAX);
MOV(32, M((void*)((u32)&temp64+4)), R(EAX));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#if 0
// Alternate implementation; possibly faster
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSRLW(XMM0, 8);
MOVSD(xd, R(XMM0));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSLLW(XMM0, 8);
POR(xd, R(XMM0));
#endif
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfd(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int s = inst.RS;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
fpr.Lock(s);
MOV(32, R(ABI_PARAM1), gpr.R(a));
#ifdef _M_IX86
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
#endif
if (cpu_info.bSSSE3) {
MOVAPD(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x8));
#ifdef _M_X64
MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0);
#else
MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0);
#endif
} else {
#ifdef _M_X64
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(64, R(EAX), M(&temp64));
BSWAP(64, EAX);
MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX));
#else
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(32, R(EAX), M(&temp64));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX));
MOV(32, R(EAX), M((void*)((u32)&temp64 + 4)));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX));
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfs(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
bool update = inst.OPCD & 1;
int s = inst.RS;
int a = inst.RA;
s32 offset = (s32)(s16)inst.SIMM_16;
if (!a || update) {
Default(inst);
return;
}
if (gpr.R(a).IsImm())
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (Memory::IsRAMAddress(addr))
{
if (cpu_info.bSSSE3) {
CVTSD2SS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
WriteFloatToConstRamAddress(XMM0, addr);
return;
}
}
else if (addr == 0xCC008000)
{
// Float directly to write gather pipe! Fun!
CVTSD2SS(XMM0, fpr.R(s));
CALL((void*)asm_routines.fifoDirectWriteFloat);
// TODO
js.fifoBytesThisBlock += 4;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
MOV(32, R(ABI_PARAM2), gpr.R(a));
ADD(32, R(ABI_PARAM2), Imm32(offset));
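// Note: update is always false here (the update form bailed to the
// interpreter above), so the conditional writeback below is dead code.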
if (update && offset)
{
MOV(32, gpr.R(a), R(ABI_PARAM2));
}
CVTSD2SS(XMM0, fpr.R(s));
MOVSS(M(&temp32), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp32));
SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfsx(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
gpr.FlushLockX(ABI_PARAM1);
fpr.Lock(inst.RS);
MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
CVTSD2SS(XMM0, fpr.R(inst.RS));
MOVD_xmm(R(EAX), XMM0);
UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::lfsx(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
fpr.Lock(inst.RS);
fpr.LoadToX64(inst.RS, false, true);
MOV(32, R(EAX), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
if (cpu_info.bSSSE3) {
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
#else
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
#endif
PSHUFB(r, M((void *)bswapShuffle1x4));
CVTSS2SD(r, R(r));
MOVDDUP(r, R(r));
} else {
UnsafeLoadRegToReg(EAX, EAX, 32, 0, false);
MOV(32, M(&temp32), R(EAX));
CVTSS2SD(XMM0, M(&temp32));
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
}
fpr.UnlockAll();
}

View file

@ -0,0 +1,458 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// TODO(ector): Tons of pshufb optimizations for the loads/stores, SSSE3+ (possibly SSE4) only.
// Should give a very noticeable speed boost to paired-single-heavy code.
#include "Common.h"
#include "Thunk.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "CPUDetect.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
static u64 GC_ALIGNED16(temp64);
// TODO(ector): Improve 64-bit version
static void WriteDual32(u64 value, u32 address)
{
Memory::Write_U32((u32)(value >> 32), address);
Memory::Write_U32((u32)value, address + 4);
}
const double GC_ALIGNED16(m_quantizeTableD[]) =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
const double GC_ALIGNED16(m_dequantizeTableD[]) =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
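// The 6-bit scale field indexes these 64-entry tables directly: entries
// 0..31 hold the non-negative scales and entries 32..63 cover the negative
// scales via two's-complement wraparound of the field, so no sign handling
// is needed at lookup time.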
// The big problem is likely instructions that set the quantizers in the same block.
// We will have to break the block after the quantizers are written to.
void Jit64::psq_st(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
if (!inst.RA)
{
// This really should never happen. Unless we change this to also support stwux
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
int stScale = gqr.ST_SCALE;
bool update = inst.OPCD == 61;
int offset = inst.SIMM_12;
int a = inst.RA;
int s = inst.RS; // Fp numbers
if (inst.W) {
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
// It's fairly common for games to write to the FIFO pipe using this form, and in
// practice it's almost always floats, so that's the only case we handle here.
switch (stType)
{
case QUANTIZE_FLOAT:
{
// This one has quite a bit of optimization potential.
if (gpr.R(a).IsImm())
{
PanicAlert("Imm: %08x", gpr.R(a).offset);
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
return;
}
default:
Default(inst);
return;
}
return;
}
if (stType == QUANTIZE_FLOAT)
{
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (addr == 0xCC008000) {
// Writing to the GP FIFO gather pipe; use the fast direct-write path.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)asm_routines.fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTPD2PS(XMM0, fpr.R(s));
SHUFPS(XMM0, R(XMM0), 1);
MOVQ_xmm(M(&temp64), XMM0);
#ifdef _M_X64
MOV(64, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(64, ABI_PARAM1);
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
#else
FixupBranch argh = J_CC(CC_NZ);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
BSWAP(32, ABI_PARAM1);
AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1));
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
ADD(32, R(ABI_PARAM2), Imm32(4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
#endif
SetJumpTarget(arg2);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_U8)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(16, R(ABI_PARAM1), M(&temp64));
#ifdef _M_X64
MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_S16)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
SHUFPD(XMM0, R(XMM0), 1);
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else {
// Other quantize types; Dodger and Mario Tennis hit this path.
//PanicAlert("st %i:%i", stType, inst.W);
Default(inst);
}
}
void Jit64::psq_l(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
int ldScale = gqr.LD_SCALE;
bool update = inst.OPCD == 57;
if (!inst.RA || inst.W)
{
// Observed case: ldType 0 with W = 1 during a load.
//PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
int offset = inst.SIMM_12;
switch (ldType) {
case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
{
#ifdef _M_X64
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
if (cpu_info.bSSSE3) {
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
BSWAP(64, RAX);
MOV(64, M(&psTemp[0]), R(RAX));
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
SHUFPD(r, R(r), 1);
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#else
if (cpu_info.bSSSE3) {
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOV(32, R(EAX), gpr.R(inst.RA));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
gpr.FlushLockX(ECX);
gpr.LoadToX64(inst.RA, true, update);
// This can probably be optimized somewhat.
LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, EAX);
MOV(32, M(&psTemp[0]), R(EAX));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
BSWAP(32, EAX);
MOV(32, M(((float *)&psTemp[0]) + 1), R(EAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
gpr.UnlockAllX();
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#endif
}
case QUANTIZE_U8:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
#endif
MOV(32, M(&temp64), R(EAX));
MOVD_xmm(XMM0, M(&temp64));
// SSE4 optimization opportunity here.
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PD(XMM0, R(XMM0));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
case QUANTIZE_S16:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base));
#endif
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVD_xmm(XMM0, M(&temp64));
PUNPCKLWD(XMM0, R(XMM0)); // unpack into the high word of each dword...
PSRAD(XMM0, 16); // ...then sign-extend with an arithmetic shift. Clever, eh? :P
CVTDQ2PD(XMM0, R(XMM0));
MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
SHUFPD(r, R(r), 1);
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
/*
Dynamic quantizer. TODO once we have a test set:
MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte.
AND(32, R(EAX), Imm8(0x3F));
MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD));
MOVDDUP(r, MComplex(RCX, EAX, 8, 0));
*/
default:
// Unhandled cases observed in games: ldType 4, 5, and 6 with W = 0
// (6 is from Power Tennis).
// PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
//u32 EA = _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12;
}
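// For reference, a scalar sketch of the QUANTIZE_FLOAT load handled above
// (illustrative only: assumes interpreter-style rPS0/rPS1 accessors and skips
// the exception checks a real implementation needs).
static void psq_l_FloatSketch(UGeckoInstruction inst)
{
	const u32 EA = (inst.RA ? PowerPC::ppcState.gpr[inst.RA] : 0) + inst.SIMM_12;
	const u32 w0 = Memory::Read_U32(EA);     // ps0, stored big-endian
	const u32 w1 = Memory::Read_U32(EA + 4); // ps1
	float f0, f1;
	memcpy(&f0, &w0, sizeof(f0));
	memcpy(&f1, &w1, sizeof(f1));
	rPS0(inst.RS) = f0; // paired singles are kept widened to double
	rPS1(inst.RS) = f1;
}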


@ -0,0 +1,407 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "../../HW/GPFifo.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
// TODO
// ps_madds0
// ps_muls0
// ps_madds1
// ps_sel
// cmppd, andpd, andnpd, orpd
// lfsx, ps_merge01, etc.
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne[2]) = {1.0, 1.0};
const double GC_ALIGNED16(psZeroZero[2]) = {0.0, 0.0};
void Jit64::ps_mr(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
if (d == b)
return;
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(b));
}
void Jit64::ps_sel(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
Default(inst);
return;
if (inst.Rc) {
Default(inst); return;
}
// GRR, can't get this to work 100%; getting artifacts in the D.O.N. intro.
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
fpr.FlushLockX(XMM7);
fpr.FlushLockX(XMM6);
fpr.Lock(a, b, c, d);
fpr.LoadToX64(a, true, false);
fpr.LoadToX64(d, false, true);
// BLENDPD would have been nice...
MOVAPD(XMM7, fpr.R(a));
CMPPD(XMM7, M((void*)psZeroZero), 1); // predicate 1 = LT: lanes with a < 0 become all-ones masks
MOVAPD(XMM6, R(XMM7));
ANDPD(XMM7, fpr.R(b)); // lanes where a < 0 take b, per fsel semantics
ANDNPD(XMM6, fpr.R(c));
MOVAPD(fpr.RX(d), R(XMM7));
ORPD(fpr.RX(d), R(XMM6));
fpr.UnlockAll();
fpr.UnlockAllX();
}
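// A hedged sketch of how ps_sel could look if we required SSE4.1, where
// BLENDVPD selects per lane on the sign bit of XMM0. This assumes a BLENDVPD
// wrapper in the emitter (which doesn't exist yet), and a sign-bit test treats
// -0.0 as negative, unlike the >= 0.0 comparison the instruction specifies.
/*
	MOVAPD(XMM0, fpr.R(a));   // mask = sign bits of a
	MOVAPD(XMM1, fpr.R(c));
	BLENDVPD(XMM1, fpr.R(b)); // lane = (a < 0) ? b : c
	MOVAPD(fpr.RX(d), R(XMM1));
*/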
void Jit64::ps_sign(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.Lock(d, b);
if (d != b)
{
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(b));
}
else
{
fpr.LoadToX64(d, true);
}
switch (inst.SUBOP10)
{
case 40: //neg
XORPD(fpr.RX(d), M((void*)&psSignBits));
break;
case 136: //nabs
ORPD(fpr.RX(d), M((void*)&psSignBits));
break;
case 264: //abs
ANDPD(fpr.RX(d), M((void*)&psAbsMask));
break;
}
fpr.UnlockAll();
}
void Jit64::ps_rsqrte(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.Lock(d, b);
fpr.LoadToX64(d, false, true); // make sure d is in a register and marked dirty
// Full-precision 1.0 / sqrt(b); more accurate than the hardware estimate.
SQRTPD(XMM0, fpr.R(b));
MOVAPD(XMM1, M((void*)&psOneOne));
DIVPD(XMM1, R(XMM0));
MOVAPD(fpr.R(d), XMM1);
fpr.UnlockAll();
}
// tri_op maps PPC three-operand arithmetic onto x86's two-operand form:
//   add d, a, b   ->   mov d, a ; add d, b
// The awkward case is d aliasing a source, e.g. add a, b, a.
// Still TODO: psq_l / psq_stu, and there's a little more optimization
// that can be squeezed out of this.
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
//sources different from d, can use rather quick solution
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, false);
MOVAPD(XMM0, fpr.R(b));
MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else //Other combo, must use two temps :(
{
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
fpr.LoadToX64(d, false);
(this->*op)(XMM0, Gen::R(XMM1));
MOVAPD(fpr.RX(d), Gen::R(XMM0));
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
void Jit64::ps_arith(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
switch (inst.SUBOP5)
{
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add
case 23://sel
Default(inst);
break;
case 24://res
Default(inst);
break;
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
}
}
void Jit64::ps_sum(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
fpr.Lock(a,b,c,d);
fpr.LoadToX64(d, d == a || d == b || d == c, true);
switch (inst.SUBOP5)
{
case 10:
// Do the sum in upper subregisters, merge uppers
MOVDDUP(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
ADDPD(XMM0, R(XMM1));
UNPCKHPD(XMM0, fpr.R(c)); //merge
MOVAPD(fpr.R(d), XMM0);
break;
case 11:
// Do the sum in lower subregisters, merge lowers
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower
ADDPD(XMM0, R(XMM1)); // sum lowers
MOVAPD(XMM1, fpr.R(c));
UNPCKLPD(XMM1, R(XMM0)); // merge
MOVAPD(fpr.R(d), XMM1);
break;
default:
PanicAlert("ps_sum WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
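// Scalar reference for the two forms above (illustrative only; assumes
// interpreter-style rPS0/rPS1 accessors):
//   ps_sum0 (10): d.ps0 = a.ps0 + b.ps1;  d.ps1 = c.ps1
//   ps_sum1 (11): d.ps1 = a.ps0 + b.ps1;  d.ps0 = c.ps0
static void ps_sum0_Sketch(int d, int a, int b, int c)
{
	const double sum = rPS0(a) + rPS1(b);
	rPS1(d) = rPS1(c);
	rPS0(d) = sum;
}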
void Jit64::ps_muls(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int c = inst.FC;
fpr.Lock(a, c, d);
fpr.LoadToX64(d, d == a || d == c, true);
switch (inst.SUBOP5)
{
case 12:
// ps_muls0: multiply both lanes by c.ps0 (the low double)
// TODO - faster version for when regs are different
MOVAPD(XMM0, fpr.R(a));
MOVDDUP(XMM1, fpr.R(c));
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break;
case 13:
// ps_muls1: multiply both lanes by c.ps1 (copied down from the high double)
// TODO - faster version for when regs are different
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break;
default:
PanicAlert("ps_muls WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
//TODO: find easy cases and optimize them, do a breakout like ps_arith
void Jit64::ps_mergeXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
fpr.Lock(a,b,d);
MOVAPD(XMM0, fpr.R(a));
switch (inst.SUBOP10)
{
case 528:
UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
break; //00
case 560:
SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
break; //01
case 592:
SHUFPD(XMM0, fpr.R(b), 1);
break; //10
case 624:
UNPCKHPD(XMM0, fpr.R(b));
break; //11
default:
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
}
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0));
fpr.UnlockAll();
}
//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
fpr.Lock(a,b,c,d);
MOVAPD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 14: //madds0
MOVDDUP(XMM1, fpr.R(c));
MULPD(XMM0, R(XMM1));
ADDPD(XMM0, fpr.R(b));
break;
case 15: //madds1
MOVAPD(XMM1, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
MULPD(XMM0, R(XMM1));
ADDPD(XMM0, fpr.R(b));
break;
case 28: //msub
MULPD(XMM0, fpr.R(c));
SUBPD(XMM0, fpr.R(b));
break;
case 29: //madd
MULPD(XMM0, fpr.R(c));
ADDPD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULPD(XMM0, fpr.R(c));
SUBPD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits));
break;
case 31: //nmadd
MULPD(XMM0, fpr.R(c));
ADDPD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits));
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
//Default(inst);
//fpr.UnlockAll();
return;
}
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0));
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}


@ -0,0 +1,149 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "../../Core.h"
#include "../../CoreTiming.h"
#include "../../HW/SystemTimers.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Thunk.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
void Jit64::mtspr(UGeckoInstruction inst)
{
u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
switch(iIndex) {
case SPR_LR:
ibuild.EmitStoreLink(ibuild.EmitLoadGReg(inst.RD));
return;
case SPR_CTR:
ibuild.EmitStoreCTR(ibuild.EmitLoadGReg(inst.RD));
return;
default:
printf("mtspr case %d", iIndex);
Default(inst);
return;
}
}
void Jit64::mfspr(UGeckoInstruction inst)
{
u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
switch (iIndex)
{
case SPR_LR:
ibuild.EmitStoreGReg(ibuild.EmitLoadLink(), inst.RD);
return;
case SPR_CTR:
ibuild.EmitStoreGReg(ibuild.EmitLoadCTR(), inst.RD);
return;
default:
printf("mfspr case %d", iIndex);
Default(inst);
return;
}
}
// =======================================================================================
// Don't interpret this; if we do, we get thrown out of the JIT block.
// --------------
void Jit64::mtmsr(UGeckoInstruction inst)
{
ibuild.EmitStoreMSR(ibuild.EmitLoadGReg(inst.RS));
ibuild.EmitBranchUncond(ibuild.EmitIntConst(js.compilerPC + 4));
}
// ==============
void Jit64::mfmsr(UGeckoInstruction inst)
{
ibuild.EmitStoreGReg(ibuild.EmitLoadMSR(), inst.RD);
}
void Jit64::mftb(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
mfspr(inst);
}
void Jit64::mfcr(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
// USES_CR
int d = inst.RD;
gpr.LoadToX64(d, false, true);
MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0]));
SHL(32, R(EAX), Imm8(4));
for (int i = 1; i < 7; i++) {
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i]));
SHL(32, R(EAX), Imm8(4));
}
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7]));
MOV(32, gpr.R(d), R(EAX));
}
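// Scalar reference for the packing above (illustrative only): each 4-bit
// cr_fast field lands in its big-endian nibble of the architectural CR,
// with cr_fast[0] in the top nibble.
static u32 PackCRSketch()
{
	u32 cr = 0;
	for (int i = 0; i < 8; i++)
		cr = (cr << 4) | (PowerPC::ppcState.cr_fast[i] & 0xF);
	return cr;
}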
void Jit64::mtcrf(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
// USES_CR
u32 mask = 0;
u32 crm = inst.CRM;
if (crm == 0xFF) {
gpr.FlushLockX(ECX);
MOV(32, R(EAX), gpr.R(inst.RS));
for (int i = 0; i < 8; i++) {
MOV(32, R(ECX), R(EAX));
SHR(32, R(ECX), Imm8(28 - (i * 4)));
AND(32, R(ECX), Imm32(0xF));
MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX));
}
gpr.UnlockAllX();
} else {
Default(inst);
return;
// The code below is currently unreachable; TODO: translate it to the new CR model.
for (int i = 0; i < 8; i++) {
if (crm & (1 << i))
mask |= 0xF << (i*4);
}
MOV(32, R(EAX), gpr.R(inst.RS));
MOV(32, R(ECX), M(&PowerPC::ppcState.cr));
AND(32, R(EAX), Imm32(mask));
AND(32, R(ECX), Imm32(~mask));
OR(32, R(EAX), R(ECX));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
}
}
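// Scalar reference for the crm == 0xFF path above (illustrative only):
static void UnpackCRSketch(u32 value)
{
	for (int i = 0; i < 8; i++)
		PowerPC::ppcState.cr_fast[i] = (value >> (28 - i * 4)) & 0xF;
}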


@ -0,0 +1,161 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "Thunk.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
void Jit64::JitClearCA()
{
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
}
void Jit64::JitSetCA()
{
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
}
void Jit64::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
{
#ifdef _M_IX86
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, accessSize, reg_value, MDisp(reg_addr, (u32)Memory::base + offset));
#else
MOVZX(32, accessSize, reg_value, MComplex(RBX, reg_addr, SCALE_1, offset));
#endif
if (accessSize == 32)
{
BSWAP(32, reg_value);
}
else if (accessSize == 16)
{
BSWAP(32, reg_value);
if (signExtend)
SAR(32, R(reg_value), Imm8(16));
else
SHR(32, R(reg_value), Imm8(16));
} else if (signExtend) {
// TODO: bake 8-bit into the original load.
MOVSX(32, accessSize, reg_value, R(reg_value));
}
}
void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signExtend)
{
if (offset)
ADD(32, R(reg), Imm32((u32)offset));
TEST(32, R(reg), Imm32(0x0C000000));
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
}
if (signExtend && accessSize < 32) {
// Need to sign extend values coming from the Read_U* functions.
MOVSX(32, accessSize, EAX, R(EAX));
}
FixupBranch arg2 = J();
SetJumpTarget(argh);
UnsafeLoadRegToReg(reg, EAX, accessSize, 0, signExtend);
SetJumpTarget(arg2);
}
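// The TEST/J_CC pair above implements this predicate (illustrative only):
// addresses with none of the 0x0C000000 bits set are plain RAM and can be
// accessed through the fast path; anything else (MMIO, uncached mirrors)
// is routed through the Memory::Read_* / Write_* functions.
static inline bool IsFastMemAddressSketch(u32 address)
{
	return (address & 0x0C000000) == 0;
}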
void Jit64::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset)
{
if (accessSize == 8 && reg_value >= 4) {
PanicAlert("WARNING: likely incorrect use of UnsafeWriteRegToReg!");
}
BSWAP(accessSize, reg_value);
#ifdef _M_IX86
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
MOV(accessSize, MDisp(reg_addr, (u32)Memory::base + offset), R(reg_value));
#else
MOV(accessSize, MComplex(RBX, reg_addr, SCALE_1, offset), R(reg_value));
#endif
}
// Destroys both arg registers
void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset)
{
if (offset)
ADD(32, R(reg_addr), Imm32(offset));
TEST(32, R(reg_addr), Imm32(0x0C000000));
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), reg_value, reg_addr); break;
case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), reg_value, reg_addr); break;
case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), reg_value, reg_addr); break;
}
FixupBranch arg2 = J();
SetJumpTarget(argh);
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0);
SetJumpTarget(arg2);
}
void Jit64::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
{
#ifdef _M_X64
MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
#else
MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
#endif
}
void Jit64::WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address)
{
#ifdef _M_X64
MOV(32, R(RAX), Imm32(address));
MOVSS(MComplex(RBX, RAX, 1, 0), xmm_reg);
#else
MOVSS(M((void*)((u32)Memory::base + (address & Memory::MEMVIEW32_MASK))), xmm_reg);
#endif
}
void Jit64::ForceSinglePrecisionS(X64Reg xmm) {
// Most games don't need this. Zelda requires it, though; some of its platforms get stuck without it.
if (jo.accurateSinglePrecision)
{
CVTSD2SS(xmm, R(xmm));
CVTSS2SD(xmm, R(xmm));
}
}
void Jit64::ForceSinglePrecisionP(X64Reg xmm) {
// Most games don't need this. Zelda requires it, though; some of its platforms get stuck without it.
if (jo.accurateSinglePrecision)
{
CVTPD2PS(xmm, R(xmm));
CVTPS2PD(xmm, R(xmm));
}
}
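// Host-side illustration (not emitted code) of what the round-trip does:
// real hardware rounds paired-single results to 32-bit floats, so staying
// in double precision can make guest values drift from hardware behavior.
static inline double RoundToSingleSketch(double x)
{
	return (double)(float)x; // same rounding as CVTSD2SS followed by CVTSS2SD
}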