Some WIP work on the JIT... only marginally usable at the moment, but I
wanted to back this up somewhere, and the people familiar with the JIT
might have comments. There's a big comment in Jit64IL/IR.cpp with a
high-level overview of what this is.



git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1724 8ced0084-cf51-0410-be5f-012b33b47a6e
magumagu9 committed on 2008-12-31 01:39:35 +00:00
commit 68c451f008 (parent 1d0d106736)
20 changed files with 6470 additions and 0 deletions

[File diff suppressed because it is too large]


@@ -0,0 +1,322 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef IR_H
#define IR_H
#include "x64Emitter.h"
#include <vector>
namespace IREmitter {
enum Opcode {
Nop = 0,
// "Zero-operand" operators
// Register load operators
LoadGReg,
LoadLink,
LoadCR,
LoadCarry,
LoadCTR,
LoadMSR,
// Unary operators
// Integer unary operators
SExt8,
SExt16,
BSwap32,
BSwap16,
Load8, // These loads zext
Load16,
Load32,
// Branches
BranchUncond,
// Register store operators
StoreGReg,
StoreCR,
StoreLink,
StoreCarry,
StoreCTR,
StoreMSR,
// Arbitrary interpreter instruction
InterpreterFallback,
// Binary operators
// Commutative integer operators
Add,
Mul,
And,
Or,
Xor,
// Non-commutative integer operators
Sub,
Shl, // Note that shifts ignore bits above the bottom 5
Shrl,
Sarl,
Rol,
ICmpCRSigned, // CR for signed int compare
ICmpCRUnsigned, // CR for unsigned int compare
ICmpEq, // One if equal, zero otherwise
ICmpUgt, // One if op1 > op2, zero otherwise
// Memory store operators
Store8,
Store16,
Store32,
BranchCond,
// "Trinary" operators
// FIXME: Need to change representation!
//Select, // Equivalent to C "Op1 ? Op2 : Op3"
// Integer constants
CInt16,
CInt32,
// "Opcode" representing a register too far away to
// reference directly; this is a size optimization
Tramp,
// "Opcode"s representing the start and end
BlockStart, BlockEnd
};
typedef unsigned Inst;
typedef Inst* InstLoc;
unsigned inline getOpcode(Inst i) {
return i & 255;
}
unsigned inline isImm(Inst i) {
return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32;
}
unsigned inline isUnary(Inst i) {
return getOpcode(i) >= SExt8 && getOpcode(i) <= BSwap16;
}
unsigned inline isBinary(Inst i) {
return getOpcode(i) >= Add && getOpcode(i) <= ICmpCRUnsigned;
}
unsigned inline isMemLoad(Inst i) {
return getOpcode(i) >= Load8 && getOpcode(i) <= Load32;
}
unsigned inline isMemStore(Inst i) {
return getOpcode(i) >= Store8 && getOpcode(i) <= Store32;
}
unsigned inline isRegLoad(Inst i) {
return getOpcode(i) >= LoadGReg && getOpcode(i) <= LoadCR;
}
unsigned inline isRegStore(Inst i) {
return getOpcode(i) >= StoreGReg && getOpcode(i) <= StoreLink;
}
unsigned inline isBranch(Inst i) {
return getOpcode(i) >= BranchUncond &&
getOpcode(i) <= BranchCond;
}
unsigned inline isInterpreterFallback(Inst i) {
return getOpcode(i) == InterpreterFallback;
}
InstLoc inline getOp1(InstLoc i) {
return i - 1 - ((*i >> 8) & 255);
}
InstLoc inline getOp2(InstLoc i) {
return i - 1 - ((*i >> 16) & 255);
}
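// Illustrative sketch, not part of this commit: the operand fields above are
// 8 bits wide, so getOp1/getOp2 can only reach back 256 slots. Any operand
// further away must be bridged by a Tramp emitted near the user. The
// reachability condition falls out of the decoding:
unsigned inline isOperandReachable(InstLoc User, InstLoc Operand) {
// encoded distance is (User - 1 - Operand), which must fit in 8 bits
return User - Operand >= 1 && User - Operand <= 256;
}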
class IRBuilder {
InstLoc EmitZeroOp(unsigned Opcode, unsigned extra);
InstLoc EmitUOp(unsigned OpCode, InstLoc Op1,
unsigned extra = 0);
InstLoc EmitBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2);
InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
InstLoc FoldShl(InstLoc Op1, InstLoc Op2);
InstLoc FoldShrl(InstLoc Op1, InstLoc Op2);
InstLoc FoldXor(InstLoc Op1, InstLoc Op2);
InstLoc FoldInterpreterFallback(InstLoc Op1, InstLoc Op2);
InstLoc FoldZeroOp(unsigned Opcode, unsigned extra);
InstLoc FoldUOp(unsigned OpCode, InstLoc Op1,
unsigned extra = 0);
InstLoc FoldBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2);
public:
InstLoc EmitIntConst(unsigned value);
InstLoc EmitStoreLink(InstLoc val) {
return FoldUOp(StoreLink, val);
}
InstLoc EmitBranchUncond(InstLoc val) {
return FoldUOp(BranchUncond, val);
}
InstLoc EmitBranchCond(InstLoc check, InstLoc dest) {
return FoldBiOp(BranchCond, check, dest);
}
InstLoc EmitLoadCR(unsigned crreg) {
return FoldZeroOp(LoadCR, crreg);
}
InstLoc EmitStoreCR(InstLoc value, unsigned crreg) {
return FoldUOp(StoreCR, value, crreg);
}
InstLoc EmitLoadLink() {
return FoldZeroOp(LoadLink, 0);
}
InstLoc EmitLoadMSR() {
return FoldZeroOp(LoadMSR, 0);
}
InstLoc EmitStoreMSR(InstLoc val) {
return FoldUOp(StoreMSR, val);
}
InstLoc EmitLoadGReg(unsigned reg) {
return FoldZeroOp(LoadGReg, reg);
}
InstLoc EmitStoreGReg(InstLoc value, unsigned reg) {
return FoldUOp(StoreGReg, value, reg);
}
InstLoc EmitAnd(InstLoc op1, InstLoc op2) {
return FoldBiOp(And, op1, op2);
}
InstLoc EmitXor(InstLoc op1, InstLoc op2) {
return FoldBiOp(Xor, op1, op2);
}
InstLoc EmitSub(InstLoc op1, InstLoc op2) {
return FoldBiOp(Sub, op1, op2);
}
InstLoc EmitOr(InstLoc op1, InstLoc op2) {
return FoldBiOp(Or, op1, op2);
}
InstLoc EmitAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(Add, op1, op2);
}
InstLoc EmitMul(InstLoc op1, InstLoc op2) {
return FoldBiOp(Mul, op1, op2);
}
InstLoc EmitRol(InstLoc op1, InstLoc op2) {
return FoldBiOp(Rol, op1, op2);
}
InstLoc EmitShl(InstLoc op1, InstLoc op2) {
return FoldBiOp(Shl, op1, op2);
}
InstLoc EmitShrl(InstLoc op1, InstLoc op2) {
return FoldBiOp(Shrl, op1, op2);
}
InstLoc EmitSarl(InstLoc op1, InstLoc op2) {
return FoldBiOp(Sarl, op1, op2);
}
InstLoc EmitLoadCTR() {
return FoldZeroOp(LoadCTR, 0);
}
InstLoc EmitStoreCTR(InstLoc op1) {
return FoldUOp(StoreCTR, op1);
}
InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpEq, op1, op2);
}
InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpUgt, op1, op2);
}
InstLoc EmitLoad8(InstLoc op1) {
return FoldUOp(Load8, op1);
}
InstLoc EmitLoad16(InstLoc op1) {
return FoldUOp(Load16, op1);
}
InstLoc EmitLoad32(InstLoc op1) {
return FoldUOp(Load32, op1);
}
InstLoc EmitStore8(InstLoc op1, InstLoc op2) {
return FoldBiOp(Store8, op1, op2);
}
InstLoc EmitStore16(InstLoc op1, InstLoc op2) {
return FoldBiOp(Store16, op1, op2);
}
InstLoc EmitStore32(InstLoc op1, InstLoc op2) {
return FoldBiOp(Store32, op1, op2);
}
InstLoc EmitSExt16(InstLoc op1) {
return FoldUOp(SExt16, op1);
}
InstLoc EmitSExt8(InstLoc op1) {
return FoldUOp(SExt8, op1);
}
InstLoc EmitICmpCRSigned(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpCRSigned, op1, op2);
}
InstLoc EmitICmpCRUnsigned(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpCRUnsigned, op1, op2);
}
InstLoc EmitInterpreterFallback(InstLoc op1, InstLoc op2) {
return FoldBiOp(InterpreterFallback, op1, op2);
}
InstLoc EmitStoreCarry(InstLoc op1) {
return FoldUOp(StoreCarry, op1);
}
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
void StartForwardPass() { curReadPtr = &InstList[0]; }
InstLoc ReadForward() { return curReadPtr++; }
InstLoc ReadBackward() { return --curReadPtr; }
InstLoc getFirstInst() { return &InstList[0]; }
unsigned getNumInsts() { return InstList.size(); }
unsigned ReadInst(InstLoc I) { return *I; }
unsigned GetImmValue(InstLoc I);
void Reset() {
InstList.clear();
InstList.reserve(100000);
for (unsigned i = 0; i < 32; i++) {
GRegCache[i] = 0;
GRegCacheStore[i] = 0;
}
CarryCache = 0;
CarryCacheStore = 0;
for (unsigned i = 0; i < 8; i++) {
CRCache[i] = 0;
CRCacheStore[i] = 0;
}
}
IRBuilder() { Reset(); }
private:
std::vector<Inst> InstList; // FIXME: We must ensure this is
// contiguous!
std::vector<unsigned> ConstList;
InstLoc curReadPtr;
InstLoc GRegCache[32];
InstLoc GRegCacheStore[32];
InstLoc CarryCache;
InstLoc CarryCacheStore;
InstLoc CRCache[8];
InstLoc CRCacheStore[8];
};
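// Illustrative sketch, not part of this commit: a translator might drive the
// builder for a simple PPC instruction such as addi rD, rA, SIMM like so
// (the UGeckoInstruction field names here are assumptions):
//
// InstLoc a = ibuild.EmitLoadGReg(inst.RA);
// InstLoc c = ibuild.EmitIntConst((s32)(s16)inst.SIMM_16);
// ibuild.EmitStoreGReg(ibuild.EmitAdd(a, c), inst.RD);
//
// The Fold* routines behind the Emit* wrappers get a chance to simplify each
// node (e.g. constant-fold the Add) as it is emitted.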
};
#endif


@@ -0,0 +1,528 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include <map>
#include "Common.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Thunk.h"
#include "../../HLE/HLE.h"
#include "../../Core.h"
#include "../../PatchEngine.h"
#include "../../CoreTiming.h"
#include "../../Debugger/Debugger_BreakPoints.h"
#include "../PowerPC.h"
#include "../Profiler.h"
#include "../PPCTables.h"
#include "../PPCAnalyst.h"
#include "../../HW/Memmap.h"
#include "../../HW/GPFifo.h"
#include "Jit.h"
#include "JitAsm.h"
#include "JitCache.h"
#include "JitRegCache.h"
using namespace Gen;
using namespace PowerPC;
extern int blocksExecuted;
// Dolphin's PowerPC->x86 JIT dynamic recompiler
// (Nearly) all code by ector (hrydgard)
// Features:
// * x86 & x64 support, lots of shared code.
// * Basic block linking
// * Fast dispatcher
// Unfeatures:
// * Does not recompile all instructions - sometimes falls back to inserting a CALL to the corresponding interpreter function.
// Various notes below
// Register allocation
// RAX - Generic quicktemp register
// RBX - point to base of memory map
// RSI RDI R12 R13 R14 R15 - free for allocation
// RCX RDX R8 R9 R10 R11 - allocate in emergencies. These need to be flushed before functions are called.
// RSP - stack pointer, do not generally use, very dangerous
// RBP - ?
// IMPORTANT:
// Make sure that all generated code and all emulator state sits under the 2GB boundary so that
// RIP addressing can be used easily. Windows will always allocate static code under the 2GB boundary.
// Also make sure to use VirtualAlloc and specify EXECUTE permission.
// Open questions
// * Should there be any statically allocated registers? r3, r4, r5, r8, r0 come to mind.. maybe sp
// * Does it make sense to finish off the remaining non-jitted instructions? Seems we are hitting diminishing returns.
// * Why is the FPU exception handling not working 100%? Several games still get corrupted floating point state.
// This can even be seen in one homebrew Wii demo - RayTracer.elf
// Other considerations
//
// Many instructions have shorter forms for EAX. However, I believe their performance boost
// will be too small to be worth dirtying up the code with. AMD recommends them in their
// optimization manuals, though.
//
// We support block linking. Reserve space at the exits of every block for a full 5-byte jmp. Save 16-bit offsets
// from the starts of each block, marking the exits so that they can be nicely patched at any time.
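// Concretely (see WriteExit and JitBlockCache::LinkBlockExits): an unlinked exit is
// emitted as MOV dword [&PC], dest / JMP dispatcher, and linking overwrites that
// site in place with a direct JMP to the destination block's checkedEntry.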
//
// Blocks do NOT use call/ret, they only jmp to each other and to the dispatcher when necessary.
//
// All blocks that can be precompiled will be precompiled. Code will be memory protected - any write will mark
// the region as non-compilable, and all links to the page will be torn out and replaced with dispatcher jmps.
//
// Alternatively, icbi instruction SHOULD mark where we can't compile
//
// Seldom-happening events are handled by adding a decrement of a counter to all blr instructions (which are
// expensive anyway since we need to return to dispatcher, except when they can be predicted).
// TODO: SERIOUS synchronization problem with the video plugin setting tokens and breakpoints in dual core mode!!!
// Somewhat fixed by disabling idle skipping when certain interrupts are enabled
// This is not a permanent, reliable fix
// TODO: Zeldas go whacko when you hang the gfx thread
// Idea - Accurate exception handling
// Compute register state at a certain instruction by running the JIT in "dry mode", and stopping at the right place.
// Not likely to be done :P
// Optimization Ideas -
/*
* Assume SP is in main RAM (in Wii mode too?) - partly done
* Assume all floating point loads and double precision loads+stores are to/from main ram
(single precision can be used in write gather pipe, specialized fast check added)
* AMD only - use movaps instead of movapd when loading ps from memory?
* HLE functions like floorf, sin, memcpy, etc - they can be much faster
* ABI optimizations - drop F0-F13 on blr, for example. Watch out for context switching.
CR2-CR4 are non-volatile, rest of CR is volatile -> dropped on blr.
R5-R12 are volatile -> dropped on blr.
* classic inlining across calls.
Low hanging fruit:
stfd -- guaranteed in memory
cmpl
mulli
stfs
stwu
lb/stzx
bcx - optimize!
bcctr
stfs
psq_st
addx
orx
rlwimix
fcmpo
DSP_UpdateARAMDMA
lfd
stwu
cntlzwx
bcctrx
WriteBigEData
TODO
lha
srawx
addic_rc
addex
subfcx
subfex
fmaddx
fmulx
faddx
fnegx
frspx
frsqrtex
ps_sum0
ps_muls0
ps_adds1
*/
Jit64 jit;
int CODE_SIZE = 1024*1024*16;
namespace CPUCompare
{
extern u32 m_BlockStart;
}
void Jit(u32 em_address)
{
jit.Jit(em_address);
}
void Jit64::Init()
{
asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
CODE_SIZE = 1024*1024*8*8;
jo.optimizeStack = true;
jo.enableBlocklink = true; // Speed boost, but not 100% safe
#ifdef _M_X64
jo.enableFastMem = Core::GetStartupParameter().bUseFastMem;
#else
jo.enableFastMem = false;
#endif
jo.assumeFPLoadFromMem = true;
jo.fpAccurateFlags = true;
jo.optimizeGatherPipe = true;
jo.fastInterrupts = false;
jo.accurateSinglePrecision = false;
gpr.SetEmitter(this);
fpr.SetEmitter(this);
trampolines.Init();
AllocCodeSpace(CODE_SIZE);
blocks.Init();
asm_routines.Init();
}
void Jit64::Shutdown()
{
FreeCodeSpace();
blocks.Shutdown();
trampolines.Shutdown();
asm_routines.Shutdown();
}
void Jit64::WriteCallInterpreter(UGeckoInstruction inst)
{
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
if (js.isLastInstruction)
{
MOV(32, M(&PC), Imm32(js.compilerPC));
MOV(32, M(&NPC), Imm32(js.compilerPC + 4));
}
Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst);
ABI_CallFunctionC((void*)instr, inst.hex);
if (js.isLastInstruction)
{
MOV(32, R(EAX), M(&NPC));
WriteRfiExitDestInEAX();
}
}
void Jit64::unknown_instruction(UGeckoInstruction inst)
{
// CCPU::Break();
PanicAlert("unknown_instruction %08x - Fix me ;)", inst.hex);
}
void Jit64::Default(UGeckoInstruction _inst)
{
ibuild.EmitInterpreterFallback(
ibuild.EmitIntConst(_inst.hex),
ibuild.EmitIntConst(js.compilerPC));
}
void Jit64::HLEFunction(UGeckoInstruction _inst)
{
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex);
MOV(32, R(EAX), M(&NPC));
WriteExitDestInEAX(0);
}
void Jit64::DoNothing(UGeckoInstruction _inst)
{
// Yup, just don't do anything.
}
void Jit64::NotifyBreakpoint(u32 em_address, bool set)
{
int block_num = blocks.GetBlockNumberFromStartAddress(em_address);
if (block_num >= 0)
{
blocks.DestroyBlock(block_num, false);
}
}
static const bool ImHereDebug = false;
static const bool ImHereLog = false;
static std::map<u32, int> been_here;
void ImHere()
{
static FILE *f = 0;
if (ImHereLog) {
if (!f)
{
#ifdef _M_X64
f = fopen("log64.txt", "w");
#else
f = fopen("log32.txt", "w");
#endif
}
fprintf(f, "%08x\n", PC);
}
if (been_here.find(PC) != been_here.end()) {
been_here.find(PC)->second++;
if ((been_here.find(PC)->second) & 1023)
return;
}
LOG(DYNA_REC, "I'm here - PC = %08x , LR = %08x", PC, LR);
printf("I'm here - PC = %08x , LR = %08x", PC, LR);
been_here[PC] = 1;
}
void Jit64::Cleanup()
{
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
}
void Jit64::WriteExit(u32 destination, int exit_num)
{
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
b->exitAddress[exit_num] = destination;
b->exitPtrs[exit_num] = GetWritableCodePtr();
// Link opportunity!
int block = blocks.GetBlockNumberFromStartAddress(destination);
if (block >= 0 && jo.enableBlocklink)
{
// It exists! Joy of joy!
JMP(blocks.GetBlock(block)->checkedEntry, true);
b->linkStatus[exit_num] = true;
}
else
{
MOV(32, M(&PC), Imm32(destination));
JMP(asm_routines.dispatcher, true);
}
}
void Jit64::WriteExitDestInEAX(int exit_num)
{
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(asm_routines.dispatcher, true);
}
void Jit64::WriteRfiExitDestInEAX()
{
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(asm_routines.testExceptions, true);
}
void Jit64::WriteExceptionExit(u32 exception)
{
Cleanup();
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception));
MOV(32, M(&PC), Imm32(js.compilerPC + 4));
JMP(asm_routines.testExceptions, true);
}
void STACKALIGN Jit64::Run()
{
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
pExecAddr();
//Will return when PowerPC::state changes
}
void Jit64::SingleStep()
{
// NOT USED, NOT TESTED, PROBABLY NOT WORKING YET
// PanicAlert("Single");
/*
JitBlock temp_block;
PPCAnalyst::CodeBuffer temp_codebuffer(1); // Only room for one instruction! Single step!
const u8 *code = DoJit(PowerPC::ppcState.pc, &temp_codebuffer, &temp_block);
CompiledCode pExecAddr = (CompiledCode)code;
pExecAddr();*/
}
void STACKALIGN Jit64::Jit(u32 em_address)
{
if (GetSpaceLeft() < 0x10000 || blocks.IsFull())
{
LOG(DYNA_REC, "JIT cache full - clearing.");
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
PanicAlert("What? JIT cache still full - clearing.");
}
ClearCache();
}
int block_num = blocks.AllocateBlock(em_address);
JitBlock *b = blocks.GetBlock(block_num);
blocks.FinalizeBlock(block_num, jo.enableBlocklink, DoJit(em_address, &code_buffer, b));
}
const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b)
{
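// WIP: force every instruction category to Default(), which emits an
// InterpreterFallback IR op, while the IR backend is being brought up.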
Core::g_CoreStartupParameter.bJITLoadStoreOff = true;
Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff = true;
Core::g_CoreStartupParameter.bJITLoadStorePairedOff = true;
Core::g_CoreStartupParameter.bJITFloatingPointOff = true;
Core::g_CoreStartupParameter.bJITIntegerOff = true;
Core::g_CoreStartupParameter.bJITPairedOff = true;
Core::g_CoreStartupParameter.bJITSystemRegistersOff = true;
Core::g_CoreStartupParameter.bJITBranchOff = true;
if (em_address == 0)
PanicAlert("ERROR : Trying to compile at 0. LR=%08x", LR);
int size;
js.isLastInstruction = false;
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.curBlock = b;
js.blockSetsQuantizers = false;
js.block_flags = 0;
js.cancel = false;
//Analyze the block, collect all instructions it is made of (including inlining,
//if that is enabled), reorder instructions for optimal performance, and join joinable instructions.
PPCAnalyst::Flatten(em_address, &size, &js.st, &js.gpa, &js.fpa, code_buffer);
PPCAnalyst::CodeOp *ops = code_buffer->codebuffer;
const u8 *start = AlignCode4(); //TODO: Test if this or AlignCode16 makes a difference from GetCodePtr
b->checkedEntry = start;
b->runCount = 0;
// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
FixupBranch skip = J_CC(CC_NBE);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming.
SetJumpTarget(skip);
const u8 *normalEntry = GetCodePtr();
js.normalEntry = (u8*)normalEntry;
if (ImHereDebug)
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (false && js.fpa.any)
{
//This block uses FPU - needs to add FP exception bailout
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
FixupBranch b1 = J_CC(CC_NZ);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(asm_routines.fpException, true);
SetJumpTarget(b1);
}
if (false && jo.fastInterrupts)
{
// This does NOT yet work.
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
FixupBranch b1 = J_CC(CC_Z);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(asm_routines.testExceptions, true);
SetJumpTarget(b1);
}
// Conditionally add profiling code.
if (Profiler::g_ProfileBlocks) {
ADD(32, M(&b->runCount), Imm8(1));
#ifdef _WIN32
b->ticCounter.QuadPart = 0;
b->ticStart.QuadPart = 0;
b->ticStop.QuadPart = 0;
#else
//TODO
#endif
// get start tic
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart);
}
//Start up the register allocators
//They use the information in gpa/fpa to preload commonly used registers.
//gpr.Start(js.gpa);
//fpr.Start(js.fpa);
ibuild.Reset();
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
js.blockSize = size;
// Translate instructions
for (int i = 0; i < (int)size; i++)
{
// gpr.Flush(FLUSH_ALL);
// if (PPCTables::UsesFPU(_inst))
// fpr.Flush(FLUSH_ALL);
js.compilerPC = ops[i].address;
js.op = &ops[i];
js.instructionNumber = i;
if (i == (int)size - 1)
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
if (Profiler::g_ProfileBlocks) {
// CAUTION!!! push on stack regs you use, do your stuff, then pop
PROFILER_VPUSH;
// get end tic
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStop);
// tic counter += (end tic - start tic)
PROFILER_ADD_DIFF_LARGE_INTEGER(&b->ticCounter, &b->ticStop, &b->ticStart);
PROFILER_VPOP;
}
}
else
{
// help peephole optimizations
js.next_inst = ops[i + 1].inst;
js.next_compilerPC = ops[i + 1].address;
}
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
{
js.fifoBytesThisBlock -= 32;
ABI_CallFunction(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
}
// If starting from the breakpointed instruction, we don't break.
if (em_address != ops[i].address && BreakPoints::IsAddressBreakPoint(ops[i].address))
{
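// TODO: breakpoint handling is not implemented in this path yet.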
}
if (!ops[i].skip)
PPCTables::CompileInstruction(ops[i].inst);
gpr.SanityCheck();
fpr.SanityCheck();
if (js.cancel)
break;
}
WriteCode();
b->flags = js.block_flags;
b->codeSize = (u32)(GetCodePtr() - normalEntry);
b->originalSize = size;
return normalEntry;
}


@@ -0,0 +1,299 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// ========================
// See comments in Jit.cpp.
// ========================
// Mystery: Capcom vs SNK 800aa278
// CR flags approach:
// * Store that "N+Z flag contains CR0" or "S+Z flag contains CR3".
// * All flag altering instructions flush this
// * A flush simply does a conditional write to the appropriate CRx.
// * If flag available, branch code can become absolutely trivial.
#ifndef _JIT_H
#define _JIT_H
#include "../PPCAnalyst.h"
#include "JitCache.h"
#include "JitRegCache.h"
#include "x64Emitter.h"
#include "x64Analyzer.h"
#include "IR.h"
#ifdef _WIN32
#include <windows.h>
#else
// A bit of a hack to get things building under linux. We manually fill in this structure as needed
// from the real context.
struct CONTEXT
{
#ifdef _M_X64
u64 Rip;
u64 Rax;
#else
u32 Eip;
u32 Eax;
#endif
};
#endif
class TrampolineCache : public Gen::XCodeBlock
{
public:
void Init();
void Shutdown();
const u8 *GetReadTrampoline(const InstructionInfo &info);
const u8 *GetWriteTrampoline(const InstructionInfo &info);
};
class Jit64 : public Gen::XCodeBlock
{
private:
struct JitState
{
u32 compilerPC;
u32 next_compilerPC;
u32 blockStart;
bool cancel;
UGeckoInstruction next_inst; // for easy peephole opt.
int blockSize;
int instructionNumber;
int downcountAmount;
int block_flags;
bool isLastInstruction;
bool blockSetsQuantizers;
bool forceUnsafeLoad;
int fifoBytesThisBlock;
PPCAnalyst::BlockStats st;
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp *op;
u8* normalEntry;
JitBlock *curBlock;
};
struct JitOptions
{
bool optimizeStack;
bool assumeFPLoadFromMem;
bool enableBlocklink;
bool fpAccurateFlags;
bool enableFastMem;
bool optimizeGatherPipe;
bool fastInterrupts;
bool accurateSinglePrecision;
};
JitBlockCache blocks;
TrampolineCache trampolines;
GPRRegCache gpr;
FPURegCache fpr;
// The default code buffer. We keep it around to not have to alloc/dealloc a
// large chunk of memory for each recompiled block.
PPCAnalyst::CodeBuffer code_buffer;
public:
Jit64() : code_buffer(32000) {}
~Jit64() {}
JitState js;
JitOptions jo;
IREmitter::IRBuilder ibuild;
// Initialization, etc
void Init();
void Shutdown();
// Jit!
void Jit(u32 em_address);
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b);
JitBlockCache *GetBlockCache() { return &blocks; }
void NotifyBreakpoint(u32 em_address, bool set);
void ClearCache()
{
blocks.Clear();
trampolines.ClearCodeSpace();
}
// Run!
void Run();
void SingleStep();
const u8 *BackPatch(u8 *codePtr, int accessType, u32 em_address, CONTEXT *ctx);
#define JIT_OPCODE 0
// Utilities for use by opcodes
void WriteExit(u32 destination, int exit_num);
void WriteExitDestInEAX(int exit_num);
void WriteExceptionExit(u32 exception);
void WriteRfiExitDestInEAX();
void WriteCallInterpreter(UGeckoInstruction _inst);
void Cleanup();
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
void UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0);
void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false);
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset);
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
void GenerateCarry(Gen::X64Reg temp_reg);
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);
void JitClearCA();
void JitSetCA();
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
void WriteCode();
// OPCODES
void unknown_instruction(UGeckoInstruction _inst);
void Default(UGeckoInstruction _inst);
void DoNothing(UGeckoInstruction _inst);
void HLEFunction(UGeckoInstruction _inst);
void DynaRunTable4(UGeckoInstruction _inst);
void DynaRunTable19(UGeckoInstruction _inst);
void DynaRunTable31(UGeckoInstruction _inst);
void DynaRunTable59(UGeckoInstruction _inst);
void DynaRunTable63(UGeckoInstruction _inst);
void addx(UGeckoInstruction inst);
void orx(UGeckoInstruction inst);
void xorx(UGeckoInstruction inst);
void andx(UGeckoInstruction inst);
void mulli(UGeckoInstruction inst);
void mulhwux(UGeckoInstruction inst);
void mullwx(UGeckoInstruction inst);
void divwux(UGeckoInstruction inst);
void srawix(UGeckoInstruction inst);
void srawx(UGeckoInstruction inst);
void addex(UGeckoInstruction inst);
void extsbx(UGeckoInstruction inst);
void extshx(UGeckoInstruction inst);
void sc(UGeckoInstruction _inst);
void rfi(UGeckoInstruction _inst);
void bx(UGeckoInstruction inst);
void bclrx(UGeckoInstruction _inst);
void bcctrx(UGeckoInstruction _inst);
void bcx(UGeckoInstruction inst);
void mtspr(UGeckoInstruction inst);
void mfspr(UGeckoInstruction inst);
void mtmsr(UGeckoInstruction inst);
void mfmsr(UGeckoInstruction inst);
void mftb(UGeckoInstruction inst);
void mtcrf(UGeckoInstruction inst);
void mfcr(UGeckoInstruction inst);
void reg_imm(UGeckoInstruction inst);
void ps_sel(UGeckoInstruction inst);
void ps_mr(UGeckoInstruction inst);
void ps_sign(UGeckoInstruction inst); //aggregate
void ps_arith(UGeckoInstruction inst); //aggregate
void ps_mergeXX(UGeckoInstruction inst);
void ps_maddXX(UGeckoInstruction inst);
void ps_rsqrte(UGeckoInstruction inst);
void ps_sum(UGeckoInstruction inst);
void ps_muls(UGeckoInstruction inst);
void fp_arith_s(UGeckoInstruction inst);
void fcmpx(UGeckoInstruction inst);
void fmrx(UGeckoInstruction inst);
void cmpXX(UGeckoInstruction inst);
void cntlzwx(UGeckoInstruction inst);
void lfs(UGeckoInstruction inst);
void lfd(UGeckoInstruction inst);
void stfd(UGeckoInstruction inst);
void stfs(UGeckoInstruction inst);
void stfsx(UGeckoInstruction inst);
void psq_l(UGeckoInstruction inst);
void psq_st(UGeckoInstruction inst);
void fmaddXX(UGeckoInstruction inst);
void stX(UGeckoInstruction inst); //stw sth stb
void lXz(UGeckoInstruction inst);
void lha(UGeckoInstruction inst);
void rlwinmx(UGeckoInstruction inst);
void rlwimix(UGeckoInstruction inst);
void rlwnmx(UGeckoInstruction inst);
void negx(UGeckoInstruction inst);
void slwx(UGeckoInstruction inst);
void srwx(UGeckoInstruction inst);
void dcbz(UGeckoInstruction inst);
void lfsx(UGeckoInstruction inst);
void subfic(UGeckoInstruction inst);
void subfcx(UGeckoInstruction inst);
void subfx(UGeckoInstruction inst);
void subfex(UGeckoInstruction inst);
void lbzx(UGeckoInstruction inst);
void lwzx(UGeckoInstruction inst);
void lhax(UGeckoInstruction inst);
void lwzux(UGeckoInstruction inst);
void stXx(UGeckoInstruction inst);
void lmw(UGeckoInstruction inst);
void stmw(UGeckoInstruction inst);
};
extern Jit64 jit;
void Jit(u32 em_address);
void ProfiledReJit();
#endif


@@ -0,0 +1,277 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "ABI.h"
#include "x64Emitter.h"
#include "../../HW/Memmap.h"
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "MemoryUtil.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "../../HW/CPUCompare.h"
#include "../../HW/GPFifo.h"
#include "../../Core.h"
#include "JitAsm.h"
using namespace Gen;
int blocksExecuted;
static int temp32;
bool compareEnabled = false;
//TODO - make an option
//#if _DEBUG
static bool enableDebug = false;
//#else
// bool enableDebug = false;
//#endif
static bool enableStatistics = false;
//GLOBAL STATIC ALLOCATIONS x86
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//GLOBAL STATIC ALLOCATIONS x64
//EAX - ubiquitous scratch register - EVERYBODY scratches this
//RBX - Base pointer of memory
//R15 - Pointer to array of block pointers
AsmRoutineManager asm_routines;
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
// At this offset - 4, there is an int specifying the block number.
void AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
ABI_PushAllCalleeSavedRegsAndAdjustStack();
#ifndef _M_IX86
// Two statically allocated registers.
MOV(64, R(RBX), Imm64((u64)Memory::base));
MOV(64, R(R15), Imm64((u64)jit.GetBlockCache()->GetCodePointers())); //It's below 2GB so 32 bits are good enough
#endif
const u8 *outerLoop = GetCodePtr();
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
FixupBranch skipToRealDispatch = J(); //skip the sync and compare first time
dispatcher = GetCodePtr();
//This is the place for CPUCompare!
//The result of slice decrementation should be in flags if somebody jumped here
FixupBranch bail = J_CC(CC_S);
SetJumpTarget(skipToRealDispatch);
dispatcherNoCheck = GetCodePtr();
MOV(32, R(EAX), M(&PowerPC::ppcState.pc));
dispatcherPcInEAX = GetCodePtr();
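// Load the 32-bit word at the emulated PC. If FinalizeBlock has replaced it,
// the TEST below sees the top 6 bits (JIT_OPCODE == 0) clear, and after the
// BSWAP the low 26 bits give the block number (PPC memory is big-endian).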
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EBX), Imm32((u32)Memory::base));
MOV(32, R(EAX), MComplex(EBX, EAX, SCALE_1, 0));
#else
MOV(32, R(EAX), MComplex(RBX, RAX, SCALE_1, 0));
#endif
TEST(32, R(EAX), Imm32(0xFC));
FixupBranch notfound = J_CC(CC_NZ);
BSWAP(32, EAX);
//IDEA - we have 26 bits, why not just use offsets from base of code?
if (enableStatistics)
{
ADD(32, M(&blocksExecuted), Imm8(1));
}
if (enableDebug)
{
ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1));
}
//grab from list and jump to it
#ifdef _M_IX86
MOV(32, R(EDX), ImmPtr(jit.GetBlockCache()->GetCodePointers()));
JMPptr(MComplex(EDX, EAX, 4, 0));
#else
JMPptr(MComplex(R15, RAX, 8, 0));
#endif
SetJumpTarget(notfound);
//Ok, no block, let's jit
#ifdef _M_IX86
ABI_AlignStack(4);
PUSH(32, M(&PowerPC::ppcState.pc));
CALL(reinterpret_cast<void *>(&Jit));
ABI_RestoreStack(4);
#else
MOV(32, R(ABI_PARAM1), M(&PowerPC::ppcState.pc));
CALL((void *)&Jit);
#endif
JMP(dispatcherNoCheck); // no point in special casing this
//FP blocks test for FPU available, jump here if false
fpException = AlignCode4();
MOV(32, R(EAX), M(&PC));
MOV(32, M(&NPC), R(EAX));
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE));
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
MOV(32, R(EAX), M(&NPC));
MOV(32, M(&PC), R(EAX));
JMP(dispatcher);
SetJumpTarget(bail);
doTiming = GetCodePtr();
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
testExceptions = GetCodePtr();
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
FixupBranch skipExceptions = J_CC(CC_Z);
MOV(32, R(EAX), M(&PC));
MOV(32, M(&NPC), R(EAX));
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
MOV(32, R(EAX), M(&NPC));
MOV(32, M(&PC), R(EAX));
SetJumpTarget(skipExceptions);
TEST(32, M((void*)&PowerPC::state), Imm32(0xFFFFFFFF));
J_CC(CC_Z, outerLoop, true);
//Landing pad for drec space
ABI_PopAllCalleeSavedRegsAndAdjustStack();
RET();
breakpointBailout = GetCodePtr();
//Landing pad for drec space
ABI_PopAllCalleeSavedRegsAndAdjustStack();
RET();
GenerateCommon();
}
void AsmRoutineManager::GenFifoWrite(int size)
{
// Assume value in ABI_PARAM1
PUSH(ESI);
if (size != 32)
PUSH(EDX);
BSWAP(size, ABI_PARAM1);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
if (size != 32) {
MOV(32, R(EDX), R(ABI_PARAM1));
MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX));
} else {
MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1));
}
ADD(32, R(ESI), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
if (size != 32)
POP(EDX);
POP(ESI);
RET();
}
void AsmRoutineManager::GenFifoFloatWrite()
{
// Assume value in XMM0
PUSH(ESI);
PUSH(EDX);
MOVSS(M(&temp32), XMM0);
MOV(32, R(EDX), M(&temp32));
BSWAP(32, EDX);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX));
ADD(32, R(ESI), Imm8(4));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(EDX);
POP(ESI);
RET();
}
void AsmRoutineManager::GenFifoXmm64Write()
{
// Assume value in XMM0. Assume pre-byteswapped (unlike the others here!)
PUSH(ESI);
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
MOVQ_xmm(MComplex(RAX, RSI, 1, 0), XMM0);
ADD(32, R(ESI), Imm8(8));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(ESI);
RET();
}
void AsmRoutineManager::GenerateCommon()
{
// USES_CR
computeRc = AlignCode16();
CMP(32, R(EAX), Imm8(0));
FixupBranch pLesser = J_CC(CC_L);
FixupBranch pGreater = J_CC(CC_G);
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
RET();
SetJumpTarget(pGreater);
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
RET();
SetJumpTarget(pLesser);
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
RET();
fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();
GenFifoWrite(16);
fifoDirectWrite32 = AlignCode4();
GenFifoWrite(32);
fifoDirectWriteFloat = AlignCode4();
GenFifoFloatWrite();
fifoDirectWriteXmm64 = AlignCode4();
GenFifoXmm64Write();
doReJit = AlignCode4();
ABI_AlignStack(0);
CALL(reinterpret_cast<void *>(&ProfiledReJit));
ABI_RestoreStack(0);
SUB(32, M(&CoreTiming::downcount), Imm8(0));
JMP(dispatcher, true);
computeRcFp = AlignCode16();
//CMPSD(R(XMM0), M(&zero),
// TODO
// Fast write routines - special case the most common hardware write
// TODO: use this.
// Even in x86, the param values will be in the right registers.
/*
const u8 *fastMemWrite8 = AlignCode16();
CMP(32, R(ABI_PARAM2), Imm32(0xCC008000));
FixupBranch skip_fast_write = J_CC(CC_NE, false);
MOV(32, EAX, M(&m_gatherPipeCount));
MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1);
ADD(32, 1, M(&m_gatherPipeCount));
RET();
SetJumpTarget(skip_fast_write);
CALL((void *)&Memory::Write_U8);*/
}


@@ -0,0 +1,88 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITASM_H
#define _JITASM_H
#include "x64Emitter.h"
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write
// protect the memory, essentially making it work just like precompiled code.
// There are some advantages to this approach:
// 1) No need to setup an external assembler in the build.
// 2) Cross platform, as long as it's x86/x64.
// 3) Can optimize code at runtime for the specific CPU model.
// There aren't really any disadvantages other than having to maintain an x86 emitter,
// which we have to do anyway :)
//
// To add a new asm routine, just add another const here, and add the code to Generate.
// Also, possibly increase the size of the code buffer.
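// For instance, a hypothetical routine (name illustrative only) would be
// declared below as
//   const u8 *myRoutine;
// and emitted inside Generate() with
//   myRoutine = AlignCode4();
//   /* ... emit body ... */
//   RET();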
class AsmRoutineManager : public Gen::XCodeBlock
{
private:
void Generate();
void GenerateCommon();
void GenFifoWrite(int size);
void GenFifoFloatWrite();
void GenFifoXmm64Write();
public:
void Init() {
AllocCodeSpace(8192);
Generate();
WriteProtect();
}
void Shutdown() {
FreeCodeSpace();
}
// Public generated functions. Just CALL(M((void*)func)) them.
const u8 *enterCode;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInEAX;
const u8 *fpException;
const u8 *computeRc;
const u8 *computeRcFp;
const u8 *testExceptions;
const u8 *dispatchPcInEAX;
const u8 *doTiming;
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
const u8 *breakpointBailout;
const u8 *doReJit;
bool compareEnabled;
};
extern AsmRoutineManager asm_routines;
#endif


@@ -0,0 +1,215 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include <string>
#include "Common.h"
#include "disasm.h"
#include "JitAsm.h"
#include "../../HW/Memmap.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Thunk.h"
#include "x64Analyzer.h"
#include "StringUtil.h"
#include "Jit.h"
using namespace Gen;
extern u8 *trampolineCodePtr;
void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
u64 code_addr = (u64)codePtr;
disassembler disasm;
char disbuf[256];
memset(disbuf, 0, 256);
#ifdef _M_IX86
disasm.disasm32(0, code_addr, codePtr, disbuf);
#else
disasm.disasm64(0, code_addr, codePtr, disbuf);
#endif
PanicAlert("%s\n\n"
"Error encountered accessing emulated address %08x.\n"
"Culprit instruction: \n%s\nat %08x%08x",
text.c_str(), emAddress, disbuf, (u32)(code_addr >> 32), (u32)code_addr);
return;
}
void TrampolineCache::Init()
{
AllocCodeSpace(1024 * 1024);
}
void TrampolineCache::Shutdown()
{
FreeCodeSpace();
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a read. Easy.
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
if (info.displacement) {
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
MOV(32, R(dataReg), R(EAX));
RET();
#endif
return trampoline;
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
if (dataReg != EAX)
PanicAlert("Backpatch write - not through EAX");
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
// hardware access - we can take shortcuts.
//if (emAddress == 0xCC008000)
// PanicAlert("caught a fifo write");
CMP(32, R(addrReg), Imm32(0xCC008000));
FixupBranch skip_fast = J_CC(CC_NE, false);
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
CALL((void*)asm_routines.fifoDirectWrite32);
RET();
SetJumpTarget(skip_fast);
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1) {
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
} else {
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
}
if (info.displacement) {
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
RET();
#endif
return trampoline;
}
// This generates some fairly heavy trampolines, but:
// 1) It's really necessary. We don't know anything about the context.
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
// that many of them in a typical program/game.
const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
{
#ifdef _M_X64
if (!jit.IsInCodeSpace(codePtr))
return 0; // this will become a regular crash real soon after this
InstructionInfo info;
if (!DisassembleMov(codePtr, info, accessType)) {
BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress);
}
/*
if (info.isMemoryWrite) {
if (!Memory::IsRAMAddress(emAddress, true)) {
PanicAlert("Exception: Caught write to invalid address %08x", emAddress);
return;
}
BackPatchError("BackPatch - determined that MOV is write, not yet supported and should have been caught before",
codePtr, emAddress);
}*/
if (info.operandSize != 4) {
BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress);
}
if (info.otherReg != RBX)
PanicAlert("BackPatch : Base reg not RBX."
"\n\nAttempted to access %08x.", emAddress);
if (accessType == OP_ACCESS_WRITE)
PanicAlert("BackPatch : Currently only supporting reads."
"\n\nAttempted to write to %08x.", emAddress);
// In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads.
if (accessType == 0)
{
XEmitter emitter(codePtr);
int bswapNopCount;
// Check the following BSWAP for REX byte
if ((codePtr[info.instructionSize] & 0xF0) == 0x40)
bswapNopCount = 3;
else
bswapNopCount = 2;
const u8 *trampoline = trampolines.GetReadTrampoline(info);
emitter.CALL((void *)trampoline);
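// A CALL rel32 is 5 bytes; NOP out the remainder of the original MOV + BSWAP.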
emitter.NOP((int)info.instructionSize + bswapNopCount - 5);
return codePtr;
}
else if (accessType == 1)
{
// TODO: special case FIFO writes. Also, support 32-bit mode.
// Also, debug this so that it actually works correctly :P
XEmitter emitter(codePtr - 2);
// We know it's EAX so the BSWAP before will be two byte. Overwrite it.
const u8 *trampoline = trampolines.GetWriteTrampoline(info);
emitter.CALL((void *)trampoline);
emitter.NOP((int)info.instructionSize - 3);
if (info.instructionSize < 3)
PanicAlert("instruction too small");
// We entered here with a BSWAP-ed EAX. We'll have to swap it back.
ctx->Rax = Common::swap32((u32)ctx->Rax);
return codePtr - 2;
}
return 0;
#else
return 0;
#endif
}


@@ -0,0 +1,346 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// Enable define below to enable oprofile integration. For this to work,
// it requires at least oprofile version 0.9.4, and changing the build
// system to link the Dolphin executable against libopagent. Since the
// dependency is a little inconvenient and this is possibly a slight
// performance hit, it's not enabled by default, but it's useful for
// locating performance issues.
//#define OPROFILE_REPORT
#include "Common.h"
#include "../../Core.h"
#include "MemoryUtil.h"
#include "../../HW/Memmap.h"
#include "../../CoreTiming.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "../PPCAnalyst.h"
#include "x64Emitter.h"
#include "x64Analyzer.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "disasm.h"
#ifdef OPROFILE_REPORT
#include <opagent.h>
#endif
#ifdef OPROFILE_REPORT
op_agent_t agent;
#endif
using namespace Gen;
#define INVALID_EXIT 0xFFFFFFFF
bool JitBlock::ContainsAddress(u32 em_address)
{
// WARNING - THIS DOES NOT WORK WITH INLINING ENABLED.
return (em_address >= originalAddress && em_address < originalAddress + originalSize);
}
bool JitBlockCache::IsFull() const
{
return GetNumBlocks() >= MAX_NUM_BLOCKS - 1;
}
void JitBlockCache::Init()
{
MAX_NUM_BLOCKS = 65536*2;
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
MAX_NUM_BLOCKS = 65536*8;
}
#ifdef OPROFILE_REPORT
agent = op_open_agent();
#endif
blocks = new JitBlock[MAX_NUM_BLOCKS];
blockCodePointers = new const u8*[MAX_NUM_BLOCKS];
Clear();
}
void JitBlockCache::Shutdown()
{
delete [] blocks;
delete [] blockCodePointers;
blocks = 0;
blockCodePointers = 0;
num_blocks = 0;
#ifdef OPROFILE_REPORT
op_close_agent(agent);
#endif
}
// This clears the JIT cache. It's called from Jit.cpp when the JIT cache
// is full and when saving and loading states.
void JitBlockCache::Clear()
{
Core::DisplayMessage("Cleared code cache.", 3000);
// Is destroying the blocks really necessary?
for (int i = 0; i < num_blocks; i++)
{
DestroyBlock(i, false);
}
links_to.clear();
num_blocks = 0;
memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS);
}
void JitBlockCache::DestroyBlocksWithFlag(BlockFlag death_flag)
{
for (int i = 0; i < num_blocks; i++)
{
if (blocks[i].flags & death_flag)
{
DestroyBlock(i, false);
}
}
}
void JitBlockCache::Reset()
{
Shutdown();
Init();
}
JitBlock *JitBlockCache::GetBlock(int no)
{
return &blocks[no];
}
int JitBlockCache::GetNumBlocks() const
{
return num_blocks;
}
bool JitBlockCache::RangeIntersect(int s1, int e1, int s2, int e2) const
{
// check if any endpoint is inside the other range
if ((s1 >= s2 && s1 <= e2) ||
(e1 >= s2 && e1 <= e2) ||
(s2 >= s1 && s2 <= e1) ||
(e2 >= s1 && e2 <= e1))
return true;
else
return false;
}
int JitBlockCache::AllocateBlock(u32 em_address)
{
JitBlock &b = blocks[num_blocks];
b.invalid = false;
b.originalAddress = em_address;
b.originalFirstOpcode = Memory::ReadFast32(em_address);
b.exitAddress[0] = INVALID_EXIT;
b.exitAddress[1] = INVALID_EXIT;
b.exitPtrs[0] = 0;
b.exitPtrs[1] = 0;
b.linkStatus[0] = false;
b.linkStatus[1] = false;
num_blocks++; //commit the current block
return num_blocks - 1;
}
void JitBlockCache::FinalizeBlock(int block_num, bool block_link, const u8 *code_ptr)
{
blockCodePointers[block_num] = code_ptr;
JitBlock &b = blocks[block_num];
Memory::WriteUnchecked_U32((JIT_OPCODE << 26) | block_num, blocks[block_num].originalAddress);
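// The dispatcher (JitAsm.cpp) reads this word back out of PPC memory: the top
// 6 bits must equal JIT_OPCODE and the low 26 bits index blockCodePointers.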
if (block_link)
{
for (int i = 0; i < 2; i++)
{
if (b.exitAddress[i] != INVALID_EXIT)
links_to.insert(std::pair<u32, int>(b.exitAddress[i], block_num));
}
LinkBlock(block_num);
LinkBlockExits(block_num);
}
#ifdef OPROFILE_REPORT
char buf[100];
sprintf(buf, "EmuCode%x", b.originalAddress);
const u8* blockStart = blockCodePointers[block_num];
op_write_native_code(agent, buf, (uint64_t)blockStart,
blockStart, b.codeSize);
#endif
}
const u8 **JitBlockCache::GetCodePointers()
{
return blockCodePointers;
}
int JitBlockCache::GetBlockNumberFromStartAddress(u32 addr)
{
if (!blocks)
return -1;
u32 code = Memory::ReadFast32(addr);
if ((code >> 26) == JIT_OPCODE)
{
// Jitted code.
unsigned int block = code & 0x03FFFFFF;
if (block >= (unsigned int)num_blocks) {
return -1;
}
if (blocks[block].originalAddress != addr)
{
//_assert_msg_(DYNA_REC, 0, "GetBlockFromAddress %08x - No match - This is BAD", addr);
return -1;
}
return block;
}
else
{
return -1;
}
}
void JitBlockCache::GetBlockNumbersFromAddress(u32 em_address, std::vector<int> *block_numbers)
{
for (int i = 0; i < num_blocks; i++)
if (blocks[i].ContainsAddress(em_address))
block_numbers->push_back(i);
}
u32 JitBlockCache::GetOriginalCode(u32 address)
{
int num = GetBlockNumberFromStartAddress(address);
if (num == -1)
return Memory::ReadUnchecked_U32(address);
else
return blocks[num].originalFirstOpcode;
}
CompiledCode JitBlockCache::GetCompiledCodeFromBlock(int blockNumber)
{
return (CompiledCode)blockCodePointers[blockNumber];
}
//Block linker
//Make sure to have as many blocks as possible compiled before calling this
//It's O(N), so it's fast :)
//Can be faster by doing a queue for blocks to link up, and only process those
//Should probably be done
void JitBlockCache::LinkBlockExits(int i)
{
JitBlock &b = blocks[i];
if (b.invalid)
{
// This block is dead. Don't relink it.
return;
}
for (int e = 0; e < 2; e++)
{
if (b.exitAddress[e] != INVALID_EXIT && !b.linkStatus[e])
{
int destinationBlock = GetBlockNumberFromStartAddress(b.exitAddress[e]);
if (destinationBlock != -1)
{
XEmitter emit(b.exitPtrs[e]);
emit.JMP(blocks[destinationBlock].checkedEntry, true);
b.linkStatus[e] = true;
}
}
}
}
using namespace std;
void JitBlockCache::LinkBlock(int i)
{
LinkBlockExits(i);
JitBlock &b = blocks[i];
std::map<u32, int>::iterator iter;
pair<multimap<u32, int>::iterator, multimap<u32, int>::iterator> ppp;
// equal_range(b) returns pair<iterator,iterator> representing the range
// of element with key b
ppp = links_to.equal_range(b.originalAddress);
if (ppp.first == ppp.second)
return;
for (multimap<u32, int>::iterator iter2 = ppp.first; iter2 != ppp.second; ++iter2) {
// PanicAlert("Linking block %i to block %i", iter2->second, i);
LinkBlockExits(iter2->second);
}
}
void JitBlockCache::DestroyBlock(int blocknum, bool invalidate)
{
u32 codebytes = (JIT_OPCODE << 26) | blocknum; //generate from i
JitBlock &b = blocks[blocknum];
b.invalid = 1;
if (codebytes == Memory::ReadFast32(b.originalAddress))
{
//nobody has changed it, good
Memory::WriteUnchecked_U32(b.originalFirstOpcode, b.originalAddress);
}
else if (!invalidate)
{
//PanicAlert("Detected code overwrite");
//else, we may be in trouble, since we apparently know of this block but it's been
//overwritten. We should have thrown it out before, on instruction cache invalidate or something.
//Not necessarily bad though, if a game has simply thrown away a lot of code and is now using the space
//for something else, then it's fine.
LOG(MASTER_LOG, "WARNING - ClearCache detected code overwrite @ %08x", blocks[blocknum].originalAddress);
}
// We don't unlink blocks, we just send anyone who tries to run them back to the dispatcher.
// Not entirely ideal, but .. pretty good.
// TODO - make sure that the below stuff really is safe.
// Spurious entrances from previously linked blocks can only come through checkedEntry
XEmitter emit((u8 *)b.checkedEntry);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
emit.SetCodePtr((u8 *)blockCodePointers[blocknum]);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
}
void JitBlockCache::InvalidateCodeRange(u32 address, u32 length)
{
if (!jit.jo.enableBlocklink)
return;
return; // FIXME: this early return disables the invalidation loop below
//This is slow but should be safe (zelda needs it for block linking)
for (int i = 0; i < num_blocks; i++)
{
if (RangeIntersect(blocks[i].originalAddress, blocks[i].originalAddress + blocks[i].originalSize,
address, address + length))
{
DestroyBlock(i, true);
}
}
}


@@ -0,0 +1,116 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITCACHE_H
#define _JITCACHE_H
#include <map>
#include <vector>
#include "../Gekko.h"
#include "../PPCAnalyst.h"
#ifdef _WIN32
#include <windows.h>
#endif
enum BlockFlag
{
BLOCK_USE_GQR0 = 0x1, BLOCK_USE_GQR1 = 0x2, BLOCK_USE_GQR2 = 0x4, BLOCK_USE_GQR3 = 0x8,
BLOCK_USE_GQR4 = 0x10, BLOCK_USE_GQR5 = 0x20, BLOCK_USE_GQR6 = 0x40, BLOCK_USE_GQR7 = 0x80,
};
// TODO(ector) - optimize this struct for size
struct JitBlock
{
u32 exitAddress[2]; // 0xFFFFFFFF == unknown
u8 *exitPtrs[2]; // to be able to rewrite the exit jump
bool linkStatus[2];
u32 originalAddress;
u32 originalFirstOpcode; //to be able to restore
u32 codeSize;
u32 originalSize;
int runCount; // for profiling.
#ifdef _WIN32
// we don't really need to save start and stop
// TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;)
LARGE_INTEGER ticStart; // for profiling - time.
LARGE_INTEGER ticStop; // for profiling - time.
LARGE_INTEGER ticCounter; // for profiling - time.
#endif
const u8 *checkedEntry;
bool invalid;
int flags;
bool ContainsAddress(u32 em_address);
};
typedef void (*CompiledCode)();
class JitBlockCache
{
const u8 **blockCodePointers;
JitBlock *blocks;
int num_blocks;
std::multimap<u32, int> links_to;
int MAX_NUM_BLOCKS;
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
void LinkBlockExits(int i);
void LinkBlock(int i);
public:
JitBlockCache() {}
int AllocateBlock(u32 em_address);
void FinalizeBlock(int block_num, bool block_link, const u8 *code_ptr);
void Clear();
void Init();
void Shutdown();
void Reset();
bool IsFull() const;
// Code Cache
JitBlock *GetBlock(int block_num);
int GetNumBlocks() const;
const u8 **GetCodePointers();
// Fast way to get a block. Only works on the first ppc instruction of a block.
int GetBlockNumberFromStartAddress(u32 em_address);
// slower, but can get numbers from within blocks, not just the first instruction.
// WARNING! WILL NOT WORK WITH INLINING ENABLED (not yet a feature but will be soon)
// Returns a list of block numbers - only one block can start at a particular address, but they CAN overlap.
// This one is slow so should only be used for one-shots from the debugger UI, not for anything during runtime.
void GetBlockNumbersFromAddress(u32 em_address, std::vector<int> *block_numbers);
u32 GetOriginalCode(u32 address);
CompiledCode GetCompiledCodeFromBlock(int blockNumber);
// DOES NOT WORK CORRECTLY WITH INLINING
void InvalidateCodeRange(u32 em_address, u32 length);
void DestroyBlock(int blocknum, bool invalidate);
// Not currently used
void DestroyBlocksWithFlag(BlockFlag death_flag);
};
#endif

View file

@ -0,0 +1,395 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "../PPCAnalyst.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
using namespace Gen;
using namespace PowerPC;
void RegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
for (int i = 0; i < NUMXREGS; i++)
{
xregs[i].free = true;
xregs[i].dirty = false;
xlocks[i] = false;
}
for (int i = 0; i < 32; i++)
{
regs[i].location = GetDefaultLocation(i);
regs[i].away = false;
}
// todo: sort to find the most popular regs
/*
int maxPreload = 2;
for (int i = 0; i < 32; i++)
{
if (stats.numReads[i] > 2 || stats.numWrites[i] >= 2)
{
LoadToX64(i, true, false); //stats.firstRead[i] <= stats.firstWrite[i], false);
maxPreload--;
if (!maxPreload)
break;
}
}*/
//Find top regs - preload them (load bursts ain't bad)
//But only preload IF written OR reads >= 3
}
// these are powerpc reg indices
void RegCache::Lock(int p1, int p2, int p3, int p4)
{
locks[p1] = true;
if (p2 != 0xFF) locks[p2] = true;
if (p3 != 0xFF) locks[p3] = true;
if (p4 != 0xFF) locks[p4] = true;
}
// these are x64 reg indices
void RegCache::LockX(int x1, int x2, int x3, int x4)
{
if (xlocks[x1]) {
PanicAlert("RegCache: x %i already locked!");
}
xlocks[x1] = true;
if (x2 != 0xFF) xlocks[x2] = true;
if (x3 != 0xFF) xlocks[x3] = true;
if (x4 != 0xFF) xlocks[x4] = true;
}
bool RegCache::IsFreeX(int xreg) const
{
return xregs[xreg].free && !xlocks[xreg];
}
void RegCache::UnlockAll()
{
for (int i = 0; i < 32; i++)
locks[i] = false;
}
void RegCache::UnlockAllX()
{
for (int i = 0; i < NUMXREGS; i++)
xlocks[i] = false;
}
X64Reg RegCache::GetFreeXReg()
{
int aCount;
const int *aOrder = GetAllocationOrder(aCount);
for (int i = 0; i < aCount; i++)
{
X64Reg xr = (X64Reg)aOrder[i];
if (!xlocks[xr] && xregs[xr].free)
{
return (X64Reg)xr;
}
}
//Okay, not found :( Force grab one
//TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions
for (int i = 0; i < aCount; i++)
{
X64Reg xr = (X64Reg)aOrder[i];
if (xlocks[xr])
continue;
int preg = xregs[xr].ppcReg;
if (!locks[preg])
{
StoreFromX64(preg);
return xr;
}
}
//Still no dice? Die!
_assert_msg_(DYNA_REC, 0, "Regcache ran out of regs");
return (X64Reg) -1;
}
void RegCache::SaveState()
{
memcpy(saved_locks, locks, sizeof(locks));
memcpy(saved_xlocks, xlocks, sizeof(xlocks));
memcpy(saved_regs, regs, sizeof(regs));
memcpy(saved_xregs, xregs, sizeof(xregs));
}
void RegCache::LoadState()
{
memcpy(xlocks, saved_xlocks, sizeof(xlocks));
memcpy(locks, saved_locks, sizeof(locks));
memcpy(regs, saved_regs, sizeof(regs));
memcpy(xregs, saved_xregs, sizeof(xregs));
}
void RegCache::FlushR(X64Reg reg)
{
if (reg >= NUMXREGS)
PanicAlert("Flushing non existent reg");
if (!xregs[reg].free)
{
StoreFromX64(xregs[reg].ppcReg);
}
}
void RegCache::SanityCheck() const
{
for (int i = 0; i < 32; i++) {
if (regs[i].away) {
if (regs[i].location.IsSimpleReg()) {
Gen::X64Reg simple = regs[i].location.GetSimpleReg();
if (xlocks[simple]) {
PanicAlert("%08x : PPC Reg %i is in locked x64 register %i", /*js.compilerPC*/ 0, i, regs[i].location.GetSimpleReg());
}
if (xregs[simple].ppcReg != i) {
PanicAlert("%08x : Xreg/ppcreg mismatch");
}
}
}
}
}
void RegCache::DiscardRegContentsIfCached(int preg)
{
if (regs[preg].away && regs[preg].location.IsSimpleReg())
{
xregs[regs[preg].location.GetSimpleReg()].free = true;
xregs[regs[preg].location.GetSimpleReg()].dirty = false;
regs[preg].away = false;
}
}
void GPRRegCache::SetImmediate32(int preg, u32 immValue)
{
DiscardRegContentsIfCached(preg);
regs[preg].away = true;
regs[preg].location = Imm32(immValue);
}
void GPRRegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
RegCache::Start(stats);
}
void FPURegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
RegCache::Start(stats);
}
const int *GPRRegCache::GetAllocationOrder(int &count)
{
static const int allocationOrder[] =
{
#ifdef _M_X64
#ifdef _WIN32
RSI, RDI, R12, R13, R14, R8, R9, R10, R11 //, RCX
#else
RBP, R12, R13, R14, R8, R9, R10, R11, //, RCX
#endif
#elif _M_IX86
ESI, EDI, EBX, EBP, EDX, ECX,
#endif
};
count = sizeof(allocationOrder) / sizeof(int);
return allocationOrder;
}
const int *FPURegCache::GetAllocationOrder(int &count)
{
static const int allocationOrder[] =
{
#ifdef _M_X64
XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5
#elif _M_IX86
XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
#endif
};
count = sizeof(allocationOrder) / sizeof(int);
return allocationOrder;
}
OpArg GPRRegCache::GetDefaultLocation(int reg) const
{
return M(&ppcState.gpr[reg]);
}
OpArg FPURegCache::GetDefaultLocation(int reg) const
{
return M(&ppcState.ps[reg][0]);
}
void RegCache::KillImmediate(int preg)
{
if (regs[preg].away && regs[preg].location.IsImm())
{
LoadToX64(preg, true, true);
}
}
void GPRRegCache::LoadToX64(int i, bool doLoad, bool makeDirty)
{
PanicAlert("BADNESS!");
if (!regs[i].away && regs[i].location.IsImm())
PanicAlert("Bad immedaite");
if (!regs[i].away || (regs[i].away && regs[i].location.IsImm()))
{
X64Reg xr = GetFreeXReg();
if (xregs[xr].dirty) PanicAlert("Xreg already dirty");
if (xlocks[xr]) PanicAlert("GetFreeXReg returned locked register");
xregs[xr].free = false;
xregs[xr].ppcReg = i;
xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
OpArg newloc = ::Gen::R(xr);
if (doLoad)
emit->MOV(32, newloc, regs[i].location);
for (int j = 0; j < 32; j++)
{
if (i != j && regs[j].location.IsSimpleReg() && regs[j].location.GetSimpleReg() == xr)
{
Crash();
}
}
regs[i].away = true;
regs[i].location = newloc;
}
else
{
// reg location must be simplereg; memory locations
// and immediates are taken care of above.
xregs[RX(i)].dirty |= makeDirty;
}
if (xlocks[RX(i)]) {
PanicAlert("Seriously WTF, this reg should have been flushed");
}
}
void GPRRegCache::StoreFromX64(int i)
{
if (regs[i].away)
{
bool doStore;
if (regs[i].location.IsSimpleReg())
{
X64Reg xr = RX(i);
xregs[xr].free = true;
xregs[xr].ppcReg = -1;
doStore = xregs[xr].dirty;
xregs[xr].dirty = false;
}
else
{
//must be an immediate - there is no x64 reg to free, but the value still has to be written back
doStore = true;
}
OpArg newLoc = GetDefaultLocation(i);
// if (doStore) //<-- Breaks JIT compilation
emit->MOV(32, newLoc, regs[i].location);
regs[i].location = newLoc;
regs[i].away = false;
}
}
void FPURegCache::LoadToX64(int i, bool doLoad, bool makeDirty)
{
_assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "WTF - load - imm");
if (!regs[i].away)
{
// Reg is at home in the memory register file. Let's pull it out.
X64Reg xr = GetFreeXReg();
_assert_msg_(DYNA_REC, xr >= 0 && xr < NUMXREGS, "WTF - load - invalid reg");
xregs[xr].ppcReg = i;
xregs[xr].free = false;
xregs[xr].dirty = makeDirty;
OpArg newloc = ::Gen::R(xr);
if (doLoad)
{
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF))
{
PanicAlert("WARNING - misaligned fp register location %i", i);
}
emit->MOVAPD(xr, regs[i].location);
}
regs[i].location = newloc;
regs[i].away = true;
} else {
// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
xregs[RX(i)].dirty |= makeDirty;
}
}
void FPURegCache::StoreFromX64(int i)
{
_assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "WTF - store - imm");
if (regs[i].away)
{
X64Reg xr = regs[i].location.GetSimpleReg();
_assert_msg_(DYNA_REC, xr >= 0 && xr < NUMXREGS, "WTF - store - invalid reg");
xregs[xr].free = true;
xregs[xr].dirty = false;
xregs[xr].ppcReg = -1;
OpArg newLoc = GetDefaultLocation(i);
emit->MOVAPD(newLoc, xr);
regs[i].location = newLoc;
regs[i].away = false;
}
else
{
// _assert_msg_(DYNA_REC,0,"already stored");
}
}
void RegCache::Flush(FlushMode mode)
{
for (int i = 0; i < NUMXREGS; i++) {
if (xlocks[i])
PanicAlert("Somone forgot to unlock X64 reg %i.", i);
}
for (int i = 0; i < 32; i++)
{
if (locks[i])
{
PanicAlert("Somebody forgot to unlock PPC reg %i.", i);
}
if (regs[i].away)
{
if (regs[i].location.IsSimpleReg())
{
X64Reg xr = RX(i);
StoreFromX64(i);
xregs[xr].dirty = false;
}
else if (regs[i].location.IsImm())
{
StoreFromX64(i);
}
else
{
_assert_msg_(DYNA_REC,0,"Jit64 - Flush unhandled case, reg %i", i);
}
}
}
}

View file

@ -0,0 +1,150 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITREGCACHE_H
#define _JITREGCACHE_H
#include "x64Emitter.h"
using namespace Gen;
enum FlushMode
{
FLUSH_ALL
};
enum GrabMode
{
M_READ = 1,
M_WRITE = 2,
M_READWRITE = 3,
};
struct PPCCachedReg
{
OpArg location;
bool away;  // value is not at its home location in ppcState (it's in an x64 reg or an immediate)
};
struct X64CachedReg
{
int ppcReg;
bool dirty;
bool free;
};
typedef int XReg;
typedef int PReg;
#ifdef _M_X64
#define NUMXREGS 16
#elif _M_IX86
#define NUMXREGS 8
#endif
class RegCache
{
private:
bool locks[32];
bool saved_locks[32];
bool saved_xlocks[NUMXREGS];
protected:
bool xlocks[NUMXREGS];
PPCCachedReg regs[32];
X64CachedReg xregs[NUMXREGS];
PPCCachedReg saved_regs[32];
X64CachedReg saved_xregs[NUMXREGS];
void DiscardRegContentsIfCached(int preg);
virtual const int *GetAllocationOrder(int &count) = 0;
XEmitter *emit;
public:
virtual ~RegCache() {}
virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0;
void SetEmitter(XEmitter *emitter) {emit = emitter;}
void FlushR(X64Reg reg);
void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);}
void FlushLockX(X64Reg reg) {
FlushR(reg);
LockX(reg);
}
void FlushLockX(X64Reg reg1, X64Reg reg2) {
FlushR(reg1); FlushR(reg2);
LockX(reg1); LockX(reg2);
}
virtual void Flush(FlushMode mode);
virtual void Flush(PPCAnalyst::CodeOp *op) {Flush(FLUSH_ALL);}
void SanityCheck() const;
void KillImmediate(int preg);
//TODO - instead of doload, use "read", "write"
//read only will not set dirty flag
virtual void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true) = 0;
virtual void StoreFromX64(int preg) = 0;
const OpArg &R(int preg) const {return regs[preg].location;}
X64Reg RX(int preg) const
{
if (regs[preg].away && regs[preg].location.IsSimpleReg())
return regs[preg].location.GetSimpleReg();
PanicAlert("Not so simple - %i", preg);
return (X64Reg)-1;
}
virtual OpArg GetDefaultLocation(int reg) const = 0;
// Register locking.
void Lock(int p1, int p2=0xff, int p3=0xff, int p4=0xff);
void LockX(int x1, int x2=0xff, int x3=0xff, int x4=0xff);
void UnlockAll();
void UnlockAllX();
bool IsFreeX(int xreg) const;
X64Reg GetFreeXReg();
void SaveState();
void LoadState();
};
class GPRRegCache : public RegCache
{
public:
void Start(PPCAnalyst::BlockRegStats &stats);
void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true);
void StoreFromX64(int preg);
OpArg GetDefaultLocation(int reg) const;
const int *GetAllocationOrder(int &count);
void SetImmediate32(int preg, u32 immValue);
};
class FPURegCache : public RegCache
{
public:
void Start(PPCAnalyst::BlockRegStats &stats);
void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true);
void StoreFromX64(int preg);
const int *GetAllocationOrder(int &count);
OpArg GetDefaultLocation(int reg) const;
};
#endif

View file

@ -0,0 +1,200 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "Thunk.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "Jit.h"
#include "JitRegCache.h"
#include "JitCache.h"
#include "JitAsm.h"
// The branches are known good, or at least reasonably good.
// No need for a disable-mechanism.
// If defined, clears CR0 at blr and bl instructions. If the assumption that
// flags never carry over between functions holds, then the task for
// an optimizer becomes much easier.
// #define ACID_TEST
// Zelda and many more games seem to pass the Acid Test.
using namespace Gen;
void Jit64::sc(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff)
{Default(inst); return;} // turn off from debugger
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
WriteExceptionExit(EXCEPTION_SYSCALL);
}
void Jit64::rfi(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff)
{Default(inst); return;} // turn off from debugger
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
//Bits SRR1[0, 5-9, 16-23, 25-27, 30-31] are placed into the corresponding bits of the MSR.
//MSR[13] is set to 0.
const u32 mask = 0x87C0FF73;
// MSR = (MSR & ~mask) | (SRR1 & mask);
MOV(32, R(EAX), M(&MSR));
MOV(32, R(ECX), M(&SRR1));
AND(32, R(EAX), Imm32(~mask));
AND(32, R(ECX), Imm32(mask));
OR(32, R(EAX), R(ECX));
// MSR &= 0xFFFDFFFF; //TODO: VERIFY
AND(32, R(EAX), Imm32(0xFFFDFFFF));
MOV(32, M(&MSR), R(EAX));
// NPC = SRR0;
MOV(32, R(EAX), M(&SRR0));
WriteRfiExitDestInEAX();
}
void Jit64::bx(UGeckoInstruction inst)
{
if (inst.LK)
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = js.compilerPC + SignExt26(inst.LI << 2);
ibuild.EmitBranchUncond(ibuild.EmitIntConst(destination));
}
// TODO - optimize to hell and beyond
// TODO - make nice easy to optimize special cases for the most common
// variants of this instruction.
void Jit64::bcx(UGeckoInstruction inst)
{
if (inst.LK)
ibuild.EmitStoreLink(
ibuild.EmitIntConst(js.compilerPC + 4));
IREmitter::InstLoc CRTest = 0, CTRTest = 0;
if ((inst.BO & 16) == 0) // Test a CR bit
{
IREmitter::InstLoc CRReg = ibuild.EmitLoadCR(inst.BI >> 2);
IREmitter::InstLoc CRCmp = ibuild.EmitIntConst(8 >> (inst.BI & 3));
CRTest = ibuild.EmitAnd(CRReg, CRCmp);
if (inst.BO & 8)
CRTest = ibuild.EmitXor(CRTest, CRCmp);
}
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) {
IREmitter::InstLoc c = ibuild.EmitLoadCTR();
c = ibuild.EmitSub(c, ibuild.EmitIntConst(1));
ibuild.EmitStoreCTR(c);
}
if ((inst.BO & 4) == 0) {
IREmitter::InstLoc c = ibuild.EmitLoadCTR();
if (!(inst.BO & 2)) {
CTRTest = ibuild.EmitICmpEq(c,
ibuild.EmitIntConst(0));
} else {
CTRTest = c;
}
}
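// Both tests use an inverted convention: a non-zero value means "don't take
// the branch". EmitBranchCond is therefore assumed to branch only when its
// operand is zero, which is why the CR and CTR failure conditions can be
// combined with a simple OR below.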
IREmitter::InstLoc Test = CRTest;
if (CTRTest) {
if (Test)
Test = ibuild.EmitOr(Test, CTRTest);
else
Test = CTRTest;
}
if (!Test) {
PanicAlert("Unconditional conditional branch?!");
}
u32 destination;
if(inst.AA)
destination = SignExt16(inst.BD << 2);
else
destination = js.compilerPC + SignExt16(inst.BD << 2);
ibuild.EmitBranchCond(Test, ibuild.EmitIntConst(destination));
ibuild.EmitBranchUncond(ibuild.EmitIntConst(js.compilerPC + 4));
}
void Jit64::bcctrx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff)
{Default(inst); return;} // turn off from debugger
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
// bool fastway = true;
if ((inst.BO & 16) == 0)
{
PanicAlert("Bizarro bcctrx %08x, not supported.", inst.hex);
_assert_msg_(DYNA_REC, 0, "Bizarro bcctrx");
/*
fastway = false;
MOV(32, M(&PC), Imm32(js.compilerPC+4));
MOV(32, R(EAX), M(&CR));
XOR(32, R(ECX), R(ECX));
AND(32, R(EAX), Imm32(0x80000000 >> inst.BI));
CCFlags branch;
if(inst.BO & 8)
branch = CC_NZ;
else
branch = CC_Z;
*/
// TODO(ector): Why is this commented out?
//SETcc(branch, R(ECX));
// check for EBX
//TEST(32, R(ECX), R(ECX));
//linkEnd = J_CC(branch);
}
// NPC = CTR & 0xfffffffc;
MOV(32, R(EAX), M(&CTR));
if (inst.LK)
MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4;
AND(32, R(EAX), Imm32(0xFFFFFFFC));
WriteExitDestInEAX(0);
}
void Jit64::bclrx(UGeckoInstruction inst)
{
if (inst.hex == 0x4e800020) {
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
return;
}
Default(inst);
return;
}

View file

@ -0,0 +1,224 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
// Sources are distinct from d, so we can use a fairly quick path
fpr.LoadToX64(d, !dupe);
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, !dupe);
MOVSD(XMM0, fpr.R(b));
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else // Other combo, must use two temps :(
{
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM1, fpr.R(b));
fpr.LoadToX64(d, !dupe);
(this->*op)(XMM0, Gen::R(XMM1));
MOVSD(fpr.RX(d), Gen::R(XMM0));
}
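// The single-precision forms round the result and replicate it into both
// halves of the paired single, hence the MOVDDUP below.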
if (dupe) {
ForceSinglePrecisionS(fpr.RX(d));
MOVDDUP(fpr.RX(d), fpr.R(d));
}
fpr.UnlockAll();
}
void Jit64::fp_arith_s(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
case 23: //sel
Default(inst);
break;
case 24: //res
Default(inst);
break;
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
}
void Jit64::fmaddXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
bool single_precision = inst.OPCD == 59;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
fpr.Lock(a, b, c, d);
MOVSD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 28: //msub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
break;
case 29: //madd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
case 31: //nmadd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
}
fpr.LoadToX64(d, false);
//YES it is necessary to dupe the result :(
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
if (single_precision) {
ForceSinglePrecisionS(XMM0);
MOVDDUP(fpr.RX(d), R(XMM0));
} else {
MOVSD(fpr.RX(d), R(XMM0));
}
fpr.UnlockAll();
}
void Jit64::fmrx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.LoadToX64(d, true); // we don't want to destroy the high bit
MOVSD(fpr.RX(d), fpr.R(b));
}
void Jit64::fcmpx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (jo.fpAccurateFlags)
{
Default(inst);
return;
}
bool ordered = inst.SUBOP10 == 32;
/*
double fa = rPS0(_inst.FA);
double fb = rPS0(_inst.FB);
u32 compareResult;
if(IsNAN(fa) || IsNAN(fb)) compareResult = 1;
else if(fa < fb) compareResult = 8;
else if(fa > fb) compareResult = 4;
else compareResult = 2;
FPSCR.FPRF = compareResult;
CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4));
*/
int a = inst.FA;
int b = inst.FB;
int crf = inst.CRFD;
int shift = crf * 4;
//FPSCR
//XOR(32,R(EAX),R(EAX));
fpr.Lock(a,b);
if (a != b)
fpr.LoadToX64(a, true);
// USES_CR
if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else
UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A);
// fa == fb
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J();
// fa > fb
SetJumpTarget(pGreater);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J();
// fa < fb
SetJumpTarget(pLesser);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1);
SetJumpTarget(continue2);
fpr.UnlockAll();
}

View file

@ -0,0 +1,520 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "../../Core.h" // include "Common.h", "CoreParameter.h", SCoreStartupParameter
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
#include "JitAsm.h"
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
static void ComputeRC(IREmitter::IRBuilder& ibuild,
IREmitter::InstLoc val) {
IREmitter::InstLoc res =
ibuild.EmitICmpCRSigned(val, ibuild.EmitIntConst(0));
ibuild.EmitStoreCR(res, 0);
}
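// Every Rc-form integer op below funnels its result through ComputeRC: a
// signed compare of the value against zero whose CR-style result is stored
// into CR0.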
void Jit64::reg_imm(UGeckoInstruction inst)
{
int d = inst.RD, a = inst.RA, s = inst.RS;
IREmitter::InstLoc val, test, c;
switch (inst.OPCD)
{
case 14: //addi
val = ibuild.EmitIntConst(inst.SIMM_16);
if (a)
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), val);
ibuild.EmitStoreGReg(val, d);
break;
case 15: //addis
val = ibuild.EmitIntConst(inst.SIMM_16 << 16);
if (a)
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), val);
ibuild.EmitStoreGReg(val, d);
break;
case 24: //ori
val = ibuild.EmitIntConst(inst.UIMM);
val = ibuild.EmitOr(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 25: //oris
val = ibuild.EmitIntConst(inst.UIMM << 16);
val = ibuild.EmitOr(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 28: //andi
val = ibuild.EmitIntConst(inst.UIMM);
val = ibuild.EmitAnd(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
ComputeRC(ibuild, val);
break;
case 29: //andis
val = ibuild.EmitIntConst(inst.UIMM << 16);
val = ibuild.EmitAnd(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
ComputeRC(ibuild, val);
break;
case 26: //xori
val = ibuild.EmitIntConst(inst.UIMM);
val = ibuild.EmitXor(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 27: //xoris
val = ibuild.EmitIntConst(inst.UIMM << 16);
val = ibuild.EmitXor(ibuild.EmitLoadGReg(s), val);
ibuild.EmitStoreGReg(val, a);
break;
case 12: //addic
case 13: //addic_rc
c = ibuild.EmitIntConst(inst.SIMM_16);
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), c);
ibuild.EmitStoreGReg(val, d);
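// A carry out of the 32-bit add occurred iff the unsigned result wrapped,
// i.e. val < c; ICmpUgt(c, val) captures exactly that.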
test = ibuild.EmitICmpUgt(c, val);
ibuild.EmitStoreCarry(test);
if (inst.OPCD == 13)
ComputeRC(ibuild, val);
break;
default:
Default(inst);
break;
}
}
void Jit64::cmpXX(UGeckoInstruction inst)
{
IREmitter::InstLoc lhs, rhs, res;
lhs = ibuild.EmitLoadGReg(inst.RA);
if (inst.OPCD == 31) {
rhs = ibuild.EmitLoadGReg(inst.RB);
if (inst.SUBOP10 == 32) {
res = ibuild.EmitICmpCRUnsigned(lhs, rhs);
} else {
res = ibuild.EmitICmpCRSigned(lhs, rhs);
}
} else if (inst.OPCD == 10) {
rhs = ibuild.EmitIntConst(inst.UIMM);
res = ibuild.EmitICmpCRUnsigned(lhs, rhs);
} else { // inst.OPCD == 11
rhs = ibuild.EmitIntConst(inst.SIMM_16);
res = ibuild.EmitICmpCRSigned(lhs, rhs);
}
ibuild.EmitStoreCR(res, inst.CRFD);
}
void Jit64::orx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitOr(ibuild.EmitLoadGReg(inst.RS), val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
// m_GPR[_inst.RA] = m_GPR[_inst.RS] ^ m_GPR[_inst.RB];
void Jit64::xorx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitXor(ibuild.EmitLoadGReg(inst.RS), val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::andx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitAnd(ibuild.EmitLoadGReg(inst.RS), val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::extsbx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitSExt8(val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::extshx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitSExt16(val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::subfic(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, d = inst.RD;
gpr.FlushLockX(ECX);
gpr.Lock(a, d);
gpr.LoadToX64(d, a == d, true);
int imm = inst.SIMM_16;
MOV(32, R(EAX), gpr.R(a));
NOT(32, R(EAX));
ADD(32, R(EAX), Imm32(imm + 1));
MOV(32, gpr.R(d), R(EAX));
//GenerateCarry(ECX);
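// FIXME: subfic is defined to update CA, but the carry computation is
// commented out above, so XER[CA] is left stale here.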
gpr.UnlockAll();
gpr.UnlockAllX();
// This instruction has no RC flag
}
void Jit64::subfcx(UGeckoInstruction inst)
{
INSTRUCTION_START;
Default(inst);
return;
/*
u32 a = m_GPR[_inst.RA];
u32 b = m_GPR[_inst.RB];
m_GPR[_inst.RD] = b - a;
SetCarry(a == 0 || Helper_Carry(b, 0-a));
if (_inst.OE) PanicAlert("OE: subfcx");
if (_inst.Rc) Helper_UpdateCR0(m_GPR[_inst.RD]);
*/
}
void Jit64::subfex(UGeckoInstruction inst)
{
INSTRUCTION_START;
Default(inst);
return;
/*
u32 a = m_GPR[_inst.RA];
u32 b = m_GPR[_inst.RB];
int carry = GetCarry();
m_GPR[_inst.RD] = (~a) + b + carry;
SetCarry(Helper_Carry(~a, b) || Helper_Carry((~a) + b, carry));
if (_inst.OE) PanicAlert("OE: subfcx");
if (_inst.Rc) Helper_UpdateCR0(m_GPR[_inst.RD]);
*/
}
void Jit64::subfx(UGeckoInstruction inst)
{
if (inst.OE) PanicAlert("OE: subfx");
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitSub(val, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::mulli(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RA);
val = ibuild.EmitMul(val, ibuild.EmitIntConst(inst.SIMM_16));
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::mullwx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitMul(ibuild.EmitLoadGReg(inst.RA), val);
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::mulhwux(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
if (d != a && d != b) {
gpr.LoadToX64(d, false, true);
} else {
gpr.LoadToX64(d, true, true);
}
if (gpr.RX(d) == EDX)
PanicAlert("mulhwux : WTF");
MOV(32, R(EAX), gpr.R(a));
gpr.KillImmediate(b);
MUL(32, gpr.R(b));
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc) {
MOV(32, R(EAX), R(EDX)); // computeRc expects its input in EAX
MOV(32, gpr.R(d), R(EDX)); // the high half of the product is the result
CALL((u8*)asm_routines.computeRc);
} else {
MOV(32, gpr.R(d), R(EDX));
}
}
// skipped some of the special handling in here - if we get crashes, let the interpreter handle this op
void Jit64::divwux(UGeckoInstruction inst) {
Default(inst); return;
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.FlushLockX(EDX);
gpr.Lock(a, b, d);
if (d != a && d != b) {
gpr.LoadToX64(d, false, true);
} else {
gpr.LoadToX64(d, true, true);
}
MOV(32, R(EAX), gpr.R(a));
XOR(32, R(EDX), R(EDX));
gpr.KillImmediate(b);
DIV(32, gpr.R(b));
MOV(32, gpr.R(d), R(EAX));
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc) {
CALL((u8*)asm_routines.computeRc);
}
}
u32 Helper_Mask(u8 mb, u8 me)
{
u32 mask = ((u32)-1 >> mb) ^ ((me >= 31) ? 0 : ((u32)-1 >> (me + 1)));
return (mb > me) ? ~mask : mask;
}
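// Example (PowerPC MSB-0 bit numbering): Helper_Mask(0, 21) sets bits 0..21,
// giving 0xFFFFFC00; a wrapped range like Helper_Mask(25, 2) sets bits
// 25..31 and 0..2, giving 0xE000007F.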
void Jit64::addx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB);
val = ibuild.EmitAdd(ibuild.EmitLoadGReg(inst.RA), val);
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
// This can be optimized
void Jit64::addex(UGeckoInstruction inst)
{
Default(inst); return;
// USES_XER
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.FlushLockX(ECX);
gpr.Lock(a, b, d);
if (d != a && d != b)
gpr.LoadToX64(d, false);
else
gpr.LoadToX64(d, true);
MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER]));
SHR(32, R(EAX), Imm8(30)); // shift the carry flag out into the x86 carry flag
MOV(32, R(EAX), gpr.R(a));
ADC(32, R(EAX), gpr.R(b));
MOV(32, gpr.R(d), R(EAX));
//GenerateCarry(ECX);
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc)
{
CALL((u8*)asm_routines.computeRc);
}
}
void Jit64::rlwinmx(UGeckoInstruction inst)
{
unsigned mask = Helper_Mask(inst.MB, inst.ME);
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitRol(val, ibuild.EmitIntConst(inst.SH));
val = ibuild.EmitAnd(val, ibuild.EmitIntConst(mask));
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::rlwimix(UGeckoInstruction inst)
{
unsigned mask = Helper_Mask(inst.MB, inst.ME);
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS);
val = ibuild.EmitRol(val, ibuild.EmitIntConst(inst.SH));
val = ibuild.EmitAnd(val, ibuild.EmitIntConst(mask));
IREmitter::InstLoc ival = ibuild.EmitLoadGReg(inst.RA);
ival = ibuild.EmitAnd(ival, ibuild.EmitIntConst(~mask));
val = ibuild.EmitOr(ival, val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::rlwnmx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA, b = inst.RB, s = inst.RS;
if (gpr.R(a).IsImm())
{
Default(inst);
return;
}
u32 mask = Helper_Mask(inst.MB, inst.ME);
gpr.FlushLockX(ECX);
gpr.Lock(a, b, s);
MOV(32, R(EAX), gpr.R(s));
MOV(32, R(ECX), gpr.R(b));
AND(32, R(ECX), Imm32(0x1f));
ROL(32, R(EAX), R(ECX));
AND(32, R(EAX), Imm32(mask));
MOV(32, gpr.R(a), R(EAX));
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)asm_routines.computeRc);
}
}
void Jit64::negx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RA);
val = ibuild.EmitSub(ibuild.EmitIntConst(0), val);
ibuild.EmitStoreGReg(val, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::srwx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS),
samt = ibuild.EmitLoadGReg(inst.RB),
corr;
// FIXME: We can do better with a cmov
// FIXME: We can do better on 64-bit
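// PPC srw yields 0 for shift amounts of 32..63, but x86 masks shift counts
// to 5 bits. Build a correction mask: shifting the amount left by 26 puts
// its bit 5 into the sign bit, the arithmetic shift by 31 smears it
// (all-ones iff samt & 32), and the XOR with -1 inverts it, so the final
// AND forces the result to zero for large shifts.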
val = ibuild.EmitShrl(val, samt);
corr = ibuild.EmitShl(samt, ibuild.EmitIntConst(26));
corr = ibuild.EmitSarl(corr, ibuild.EmitIntConst(31));
corr = ibuild.EmitXor(corr, ibuild.EmitIntConst(-1));
val = ibuild.EmitAnd(corr, val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::slwx(UGeckoInstruction inst)
{
IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS),
samt = ibuild.EmitLoadGReg(inst.RB),
corr;
// FIXME: We can do better with a cmov
// FIXME: We can do better on 64-bit
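// Same >= 32 correction trick as in srwx above.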
val = ibuild.EmitShl(val, samt);
corr = ibuild.EmitShl(samt, ibuild.EmitIntConst(26));
corr = ibuild.EmitSarl(corr, ibuild.EmitIntConst(31));
corr = ibuild.EmitXor(corr, ibuild.EmitIntConst(-1));
val = ibuild.EmitAnd(corr, val);
ibuild.EmitStoreGReg(val, inst.RA);
if (inst.Rc)
ComputeRC(ibuild, val);
}
void Jit64::srawx(UGeckoInstruction inst)
{
// FIXME: We can do a lot better on 64-bit
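// FIXME: the carry test at the bottom inspects the already-shifted value;
// for shift amounts 1..31 it should probably test the original RS value
// instead (see srawix below).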
IREmitter::InstLoc val, samt, mask, mask2, test;
val = ibuild.EmitLoadGReg(inst.RS);
samt = ibuild.EmitLoadGReg(inst.RB);
mask = ibuild.EmitIntConst(-1);
val = ibuild.EmitSarl(val, samt);
mask = ibuild.EmitShl(mask, samt);
samt = ibuild.EmitShl(samt, ibuild.EmitIntConst(26));
samt = ibuild.EmitSarl(samt, ibuild.EmitIntConst(31));
samt = ibuild.EmitAnd(samt, ibuild.EmitIntConst(31));
val = ibuild.EmitSarl(val, samt);
ibuild.EmitStoreGReg(val, inst.RA);
mask = ibuild.EmitShl(mask, samt);
mask2 = ibuild.EmitAnd(mask, ibuild.EmitIntConst(0x7FFFFFFF));
test = ibuild.EmitOr(val, mask2);
test = ibuild.EmitICmpUgt(test, mask);
ibuild.EmitStoreCarry(test);
}
void Jit64::srawix(UGeckoInstruction inst)
{
IREmitter::InstLoc orig = ibuild.EmitLoadGReg(inst.RS), val, test;
val = ibuild.EmitSarl(orig, ibuild.EmitIntConst(inst.SH));
ibuild.EmitStoreGReg(val, inst.RA);
unsigned mask = -1u << inst.SH;
// CA is set iff the source is negative and a nonzero bit was shifted out:
// (orig | (mask & 0x7FFFFFFF)) >u mask holds exactly when orig is negative
// and (orig & ~mask) != 0, so the test must use the original (unshifted) value.
test = ibuild.EmitOr(orig, ibuild.EmitIntConst(mask & 0x7FFFFFFF));
test = ibuild.EmitICmpUgt(test, ibuild.EmitIntConst(mask));
ibuild.EmitStoreCarry(test);
if (inst.Rc)
ComputeRC(ibuild, val);
}
// count leading zeroes
void Jit64::cntlzwx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int a = inst.RA;
int s = inst.RS;
if (gpr.R(a).IsImm() || gpr.R(s).IsImm() || s == a)
{
Default(inst);
return;
}
gpr.Lock(a,s);
gpr.LoadToX64(a,false);
BSR(32, gpr.R(a).GetSimpleReg(), gpr.R(s));
FixupBranch gotone = J_CC(CC_NZ);
MOV(32, gpr.R(a), Imm32(63));
SetJumpTarget(gotone);
XOR(32, gpr.R(a), Imm8(0x1f)); // flip order
gpr.UnlockAll();
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)asm_routines.computeRc);
// TODO: Check PPC manual too
}
}

View file

@ -0,0 +1,198 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// TODO(ector): Tons of pshufb optimizations for the loads/stores, SSSE3+ (possibly SSE4) only.
// Should give a very noticeable speed boost to paired-single-heavy code.
#include "Common.h"
#include "Thunk.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
void Jit64::lbzx(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(ibuild.EmitLoad8(addr), inst.RD);
}
void Jit64::lwzx(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(ibuild.EmitLoad32(addr), inst.RD);
}
void Jit64::lhax(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
IREmitter::InstLoc val = ibuild.EmitLoad16(addr);
val = ibuild.EmitSExt16(val);
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::lXz(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
IREmitter::InstLoc val;
switch (inst.OPCD)
{
case 32: val = ibuild.EmitLoad32(addr); break; //lwz
case 40: val = ibuild.EmitLoad16(addr); break; //lhz
case 34: val = ibuild.EmitLoad8(addr); break; //lbz
default: PanicAlert("lXz: invalid access size"); return;
}
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::lha(UGeckoInstruction inst)
{
IREmitter::InstLoc addr =
ibuild.EmitIntConst((s32)(s16)inst.SIMM_16);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
IREmitter::InstLoc val = ibuild.EmitLoad16(addr);
val = ibuild.EmitSExt16(val);
ibuild.EmitStoreGReg(val, inst.RD);
}
void Jit64::lwzux(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB);
if (inst.RA) {
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
ibuild.EmitStoreGReg(addr, inst.RA);
}
ibuild.EmitStoreGReg(ibuild.EmitLoad32(addr), inst.RD);
}
// Zero cache line.
void Jit64::dcbz(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
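// dcbz zeroes one 32-byte cache line: align the effective address down to
// 32 bytes and clear it with two 16-byte SSE stores.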
MOV(32, R(EAX), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
AND(32, R(EAX), Imm32(~31));
XORPD(XMM0, R(XMM0));
#ifdef _M_X64
MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
}
void Jit64::stX(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16),
value = ibuild.EmitLoadGReg(inst.RS);
if (inst.RA)
addr = ibuild.EmitAdd(ibuild.EmitLoadGReg(inst.RA), addr);
if (inst.OPCD & 1)
ibuild.EmitStoreGReg(addr, inst.RA);
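// Masking off the low OPCD bit folds the update forms (stwu=37, sthu=45,
// stbu=39) onto their base opcodes below.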
switch (inst.OPCD & ~1)
{
case 36: ibuild.EmitStore32(value, addr); break; //stw
case 44: ibuild.EmitStore16(value, addr); break; //sth
case 38: ibuild.EmitStore8(value, addr); break; //stb
default: _assert_msg_(DYNA_REC, 0, "stX: invalid store size"); return;
}
}
void Jit64::stXx(UGeckoInstruction inst)
{
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB),
value = ibuild.EmitLoadGReg(inst.RS);
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
if (inst.SUBOP10 & 32)
ibuild.EmitStoreGReg(addr, inst.RA);
switch (inst.SUBOP10 & ~32)
{
case 151: ibuild.EmitStore32(value, addr); break; //stw
case 407: ibuild.EmitStore16(value, addr); break; //sth
case 215: ibuild.EmitStore8(value, addr); break; //stb
default: _assert_msg_(DYNA_REC, 0, "stXx: invalid store size"); return;
}
}
// A few games use these heavily in video codecs.
void Jit64::lmw(UGeckoInstruction inst)
{
#ifdef _M_IX86
Default(inst); return;
#else
gpr.FlushLockX(ECX);
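// EBX/RBX holds the base of the emulated address space on x64; copy words
// for registers RD..r31, byteswapping each through ECX.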
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
BSWAP(32, ECX);
gpr.LoadToX64(i, false, true);
MOV(32, gpr.R(i), R(ECX));
}
gpr.UnlockAllX();
#endif
}
void Jit64::stmw(UGeckoInstruction inst)
{
#ifdef _M_IX86
Default(inst); return;
#else
gpr.FlushLockX(ECX);
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
MOV(32, R(ECX), gpr.R(i));
BSWAP(32, ECX);
MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX));
}
gpr.UnlockAllX();
#endif
}

View file

@ -0,0 +1,322 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// TODO(ector): Tons of pshufb optimizations for the loads/stores, SSSE3+ (possibly SSE4) only.
// Should give a very noticeable speed boost to paired-single-heavy code.
#include "Common.h"
#include "../PowerPC.h"
#include "../../Core.h" // include "Common.h", "CoreParameter.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "CPUDetect.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
// pshufb todo: MOVQ
const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
namespace {
u64 GC_ALIGNED16(temp64);
u32 GC_ALIGNED16(temp32);
}
// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
// and pshufb could help a lot.
// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
void Jit64::lfs(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
if (jo.assumeFPLoadFromMem)
{
UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false);
}
else
{
SafeLoadRegToEAX(ABI_PARAM1, 32, offset);
}
MOV(32, M(&temp32), R(EAX));
fpr.Lock(d);
fpr.LoadToX64(d, false);
CVTSS2SD(fpr.RX(d), M(&temp32));
MOVDDUP(fpr.RX(d), fpr.R(d));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::lfd(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
fpr.LoadToX64(d, true);
fpr.Lock(d);
X64Reg xd = fpr.RX(d);
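// With SSSE3, a single PSHUFB byteswaps the loaded qword in-register (the
// Dupe shuffle also mirrors it into the high half); the non-SSSE3 path goes
// through temp64 with integer BSWAPs instead.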
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
MOVSD(xd, R(XMM0));
} else {
#ifdef _M_X64
MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
BSWAP(64, EAX);
MOV(64, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
BSWAP(32, EAX);
MOV(32, M((void*)((u32)&temp64+4)), R(EAX));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#if 0
// Alternate implementation; possibly faster
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSRLW(XMM0, 8);
MOVSD(xd, R(XMM0));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSLLW(XMM0, 8);
POR(xd, R(XMM0));
#endif
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfd(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int s = inst.RS;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
fpr.Lock(s);
MOV(32, R(ABI_PARAM1), gpr.R(a));
#ifdef _M_IX86
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
#endif
if (cpu_info.bSSSE3) {
MOVAPD(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x8));
#ifdef _M_X64
MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0);
#else
MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0);
#endif
} else {
#ifdef _M_X64
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(64, R(EAX), M(&temp64));
BSWAP(64, EAX);
MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX));
#else
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(32, R(EAX), M(&temp64));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX));
MOV(32, R(EAX), M((void*)((u32)&temp64 + 4)));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX));
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfs(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
bool update = inst.OPCD & 1;
int s = inst.RS;
int a = inst.RA;
s32 offset = (s32)(s16)inst.SIMM_16;
if (!a || update) {
Default(inst);
return;
}
if (gpr.R(a).IsImm())
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (Memory::IsRAMAddress(addr))
{
if (cpu_info.bSSSE3) {
CVTSD2SS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
WriteFloatToConstRamAddress(XMM0, addr);
return;
}
}
else if (addr == 0xCC008000)
{
// Float directly to write gather pipe! Fun!
CVTSD2SS(XMM0, fpr.R(s));
CALL((void*)asm_routines.fifoDirectWriteFloat);
// TODO
js.fifoBytesThisBlock += 4;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
MOV(32, R(ABI_PARAM2), gpr.R(a));
ADD(32, R(ABI_PARAM2), Imm32(offset));
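// Note: update is always false here (the update form bailed to the
// interpreter above), so the conditional writeback below is dead code.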
if (update && offset)
{
MOV(32, gpr.R(a), R(ABI_PARAM2));
}
CVTSD2SS(XMM0, fpr.R(s));
MOVSS(M(&temp32), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp32));
SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfsx(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
gpr.FlushLockX(ABI_PARAM1);
fpr.Lock(inst.RS);
MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
CVTSD2SS(XMM0, fpr.R(inst.RS));
MOVD_xmm(R(EAX), XMM0);
UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::lfsx(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
fpr.Lock(inst.RS);
fpr.LoadToX64(inst.RS, false, true);
MOV(32, R(EAX), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
if (cpu_info.bSSSE3) {
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
#else
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
#endif
PSHUFB(r, M((void *)bswapShuffle1x4));
CVTSS2SD(r, R(r));
MOVDDUP(r, R(r));
} else {
UnsafeLoadRegToReg(EAX, EAX, 32, 0, false);
MOV(32, M(&temp32), R(EAX));
CVTSS2SD(XMM0, M(&temp32));
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
}
fpr.UnlockAll();
}

View file

@ -0,0 +1,458 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
// TODO(ector): Tons of pshufb optimizations for the loads/stores, SSSE3+ (possibly SSE4) only.
// Should give a very noticeable speed boost to paired-single-heavy code.
#include "Common.h"
#include "Thunk.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "CPUDetect.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
static u64 GC_ALIGNED16(temp64);
// TODO(ector): Improve 64-bit version
static void WriteDual32(u64 value, u32 address)
{
Memory::Write_U32((u32)(value >> 32), address);
Memory::Write_U32((u32)value, address + 4);
}
const double GC_ALIGNED16(m_quantizeTableD[]) =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
const double GC_ALIGNED16(m_dequantizeTableD[]) =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
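// The 6-bit scale field indexes these 64-entry tables directly: entries
// 0..31 hold the non-negative scales and entries 32..63 cover the negative
// scales via two's-complement wraparound of the field, so no sign handling
// is needed at lookup time.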
// The big problem is likely instructions that set the quantizers in the same block.
// We will have to break the block after the quantizers are written to.
void Jit64::psq_st(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
if (!inst.RA)
{
// This really should never happen. Unless we change this to also support stwux
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
int stScale = gqr.ST_SCALE;
bool update = inst.OPCD == 61;
int offset = inst.SIMM_12;
int a = inst.RA;
int s = inst.RS; // Fp numbers
if (inst.W) {
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
// It's fairly common for games to write to the FIFO pipe using this form, and in
// practice it's almost always floats, so that's the only case we handle here.
switch (stType)
{
case QUANTIZE_FLOAT:
{
// This one has quite a bit of optimization potential.
if (gpr.R(a).IsImm())
{
PanicAlert("Imm: %08x", gpr.R(a).offset);
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
return;
}
default:
Default(inst);
return;
}
return;
}
if (stType == QUANTIZE_FLOAT)
{
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (addr == 0xCC008000) {
// Writing to the GP FIFO gather pipe; use the fast direct-write path.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)asm_routines.fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTPD2PS(XMM0, fpr.R(s));
SHUFPS(XMM0, R(XMM0), 1);
MOVQ_xmm(M(&temp64), XMM0);
#ifdef _M_X64
MOV(64, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(64, ABI_PARAM1);
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
#else
FixupBranch argh = J_CC(CC_NZ);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
BSWAP(32, ABI_PARAM1);
AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1));
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
ADD(32, R(ABI_PARAM2), Imm32(4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
#endif
SetJumpTarget(arg2);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_U8)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(16, R(ABI_PARAM1), M(&temp64));
#ifdef _M_X64
MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_S16)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
SHUFPD(XMM0, R(XMM0), 1);
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else {
// Other quantize types; Dodger and Mario Tennis hit this path.
//PanicAlert("st %i:%i", stType, inst.W);
Default(inst);
}
}
void Jit64::psq_l(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
int ldScale = gqr.LD_SCALE;
bool update = inst.OPCD == 57;
if (!inst.RA || inst.W)
{
// Observed case: ldType 0 with W = 1 during a load.
//PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
int offset = inst.SIMM_12;
switch (ldType) {
case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
{
#ifdef _M_X64
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
if (cpu_info.bSSSE3) {
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
BSWAP(64, RAX);
MOV(64, M(&psTemp[0]), R(RAX));
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
SHUFPD(r, R(r), 1);
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#else
if (cpu_info.bSSSE3) {
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOV(32, R(EAX), gpr.R(inst.RA));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
gpr.FlushLockX(ECX);
gpr.LoadToX64(inst.RA, true, update);
// This can probably be optimized somewhat.
LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, EAX);
MOV(32, M(&psTemp[0]), R(EAX));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
BSWAP(32, EAX);
MOV(32, M(((float *)&psTemp[0]) + 1), R(EAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
gpr.UnlockAllX();
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#endif
}
case QUANTIZE_U8:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
#endif
MOV(32, M(&temp64), R(EAX));
MOVD_xmm(XMM0, M(&temp64));
// SSE4 optimization opportunity here.
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PD(XMM0, R(XMM0));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
case QUANTIZE_S16:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base));
#endif
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVD_xmm(XMM0, M(&temp64));
PUNPCKLWD(XMM0, R(XMM0)); // unpack into the high word of each dword...
PSRAD(XMM0, 16); // ...then sign-extend with an arithmetic shift. Clever, eh? :P
CVTDQ2PD(XMM0, R(XMM0));
MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
SHUFPD(r, R(r), 1);
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
/*
Dynamic quantizer. TODO once we have a test set:
MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte.
AND(32, R(EAX), Imm8(0x3F));
MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD));
MOVDDUP(r, MComplex(RCX, EAX, 8, 0));
*/
default:
// Unhandled cases observed in games: ldType 4, 5, and 6 with W = 0
// (6 is from Power Tennis).
// PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
//u32 EA = _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12;
}
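// For reference, a scalar sketch of the QUANTIZE_FLOAT load handled above
// (illustrative only: assumes interpreter-style rPS0/rPS1 accessors and skips
// the exception checks a real implementation needs).
static void psq_l_FloatSketch(UGeckoInstruction inst)
{
	const u32 EA = (inst.RA ? PowerPC::ppcState.gpr[inst.RA] : 0) + inst.SIMM_12;
	const u32 w0 = Memory::Read_U32(EA);     // ps0, stored big-endian
	const u32 w1 = Memory::Read_U32(EA + 4); // ps1
	float f0, f1;
	memcpy(&f0, &w0, sizeof(f0));
	memcpy(&f1, &w1, sizeof(f1));
	rPS0(inst.RS) = f0; // paired singles are kept widened to double
	rPS1(inst.RS) = f1;
}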


@ -0,0 +1,407 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "../../HW/GPFifo.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
// TODO
// ps_madds0
// ps_muls0
// ps_madds1
// ps_sel
// cmppd, andpd, andnpd, orpd
// lfsx, ps_merge01, etc.
// #define INSTRUCTION_START Default(inst); return;
#define INSTRUCTION_START
const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne[2]) = {1.0, 1.0};
const double GC_ALIGNED16(psZeroZero[2]) = {0.0, 0.0};
void Jit64::ps_mr(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
if (d == b)
return;
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(b));
}
void Jit64::ps_sel(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
Default(inst);
return;
if (inst.Rc) {
Default(inst); return;
}
// GRR, can't get this to work 100%; getting artifacts in the D.O.N. intro.
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
fpr.FlushLockX(XMM7);
fpr.FlushLockX(XMM6);
fpr.Lock(a, b, c, d);
fpr.LoadToX64(a, true, false);
fpr.LoadToX64(d, false, true);
// BLENDPD would have been nice...
MOVAPD(XMM7, fpr.R(a));
CMPPD(XMM7, M((void*)psZeroZero), 1); // predicate 1 = LT: lanes with a < 0 become all-ones masks
MOVAPD(XMM6, R(XMM7));
ANDPD(XMM7, fpr.R(b)); // lanes where a < 0 take b, per fsel semantics
ANDNPD(XMM6, fpr.R(c));
MOVAPD(fpr.RX(d), R(XMM7));
ORPD(fpr.RX(d), R(XMM6));
fpr.UnlockAll();
fpr.UnlockAllX();
}
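// A hedged sketch of how ps_sel could look if we required SSE4.1, where
// BLENDVPD selects per lane on the sign bit of XMM0. This assumes a BLENDVPD
// wrapper in the emitter (which doesn't exist yet), and a sign-bit test treats
// -0.0 as negative, unlike the >= 0.0 comparison the instruction specifies.
/*
	MOVAPD(XMM0, fpr.R(a));   // mask = sign bits of a
	MOVAPD(XMM1, fpr.R(c));
	BLENDVPD(XMM1, fpr.R(b)); // lane = (a < 0) ? b : c
	MOVAPD(fpr.RX(d), R(XMM1));
*/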
void Jit64::ps_sign(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.Lock(d, b);
if (d != b)
{
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(b));
}
else
{
fpr.LoadToX64(d, true);
}
switch (inst.SUBOP10)
{
case 40: //neg
XORPD(fpr.RX(d), M((void*)&psSignBits));
break;
case 136: //nabs
ORPD(fpr.RX(d), M((void*)&psSignBits));
break;
case 264: //abs
ANDPD(fpr.RX(d), M((void*)&psAbsMask));
break;
}
fpr.UnlockAll();
}
void Jit64::ps_rsqrte(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.Lock(d, b);
fpr.LoadToX64(d, false, true); // make sure d is in a register and marked dirty
// Full-precision 1.0 / sqrt(b); more accurate than the hardware estimate.
SQRTPD(XMM0, fpr.R(b));
MOVAPD(XMM1, M((void*)&psOneOne));
DIVPD(XMM1, R(XMM0));
MOVAPD(fpr.R(d), XMM1);
fpr.UnlockAll();
}
// tri_op maps PPC three-operand arithmetic onto x86's two-operand form:
//   add d, a, b   ->   mov d, a ; add d, b
// The awkward case is d aliasing a source, e.g. add a, b, a.
// Still TODO: psq_l / psq_stu, and there's a little more optimization
// that can be squeezed out of this.
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
//sources different from d, can use rather quick solution
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, false);
MOVAPD(XMM0, fpr.R(b));
MOVAPD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else //Other combo, must use two temps :(
{
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
fpr.LoadToX64(d, false);
(this->*op)(XMM0, Gen::R(XMM1));
MOVAPD(fpr.RX(d), Gen::R(XMM0));
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
void Jit64::ps_arith(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
switch (inst.SUBOP5)
{
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add
case 23://sel
Default(inst);
break;
case 24://res
Default(inst);
break;
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
}
}
void Jit64::ps_sum(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
fpr.Lock(a,b,c,d);
fpr.LoadToX64(d, d == a || d == b || d == c, true);
switch (inst.SUBOP5)
{
case 10:
// Do the sum in upper subregisters, merge uppers
MOVDDUP(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
ADDPD(XMM0, R(XMM1));
UNPCKHPD(XMM0, fpr.R(c)); //merge
MOVAPD(fpr.R(d), XMM0);
break;
case 11:
// Do the sum in lower subregisters, merge lowers
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower
ADDPD(XMM0, R(XMM1)); // sum lowers
MOVAPD(XMM1, fpr.R(c));
UNPCKLPD(XMM1, R(XMM0)); // merge
MOVAPD(fpr.R(d), XMM1);
break;
default:
PanicAlert("ps_sum WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
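// Scalar reference for the two forms above (illustrative only; assumes
// interpreter-style rPS0/rPS1 accessors):
//   ps_sum0 (10): d.ps0 = a.ps0 + b.ps1;  d.ps1 = c.ps1
//   ps_sum1 (11): d.ps1 = a.ps0 + b.ps1;  d.ps0 = c.ps0
static void ps_sum0_Sketch(int d, int a, int b, int c)
{
	const double sum = rPS0(a) + rPS1(b);
	rPS1(d) = rPS1(c);
	rPS0(d) = sum;
}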
void Jit64::ps_muls(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int c = inst.FC;
fpr.Lock(a, c, d);
fpr.LoadToX64(d, d == a || d == c, true);
switch (inst.SUBOP5)
{
case 12:
// ps_muls0: multiply both lanes by c.ps0 (the low double)
// TODO - faster version for when regs are different
MOVAPD(XMM0, fpr.R(a));
MOVDDUP(XMM1, fpr.R(c));
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break;
case 13:
// ps_muls1: multiply both lanes by c.ps1 (copied down from the high double)
// TODO - faster version for when regs are different
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break;
default:
PanicAlert("ps_muls WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
//TODO: find easy cases and optimize them, do a breakout like ps_arith
void Jit64::ps_mergeXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
fpr.Lock(a,b,d);
MOVAPD(XMM0, fpr.R(a));
switch (inst.SUBOP10)
{
case 528:
UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
break; //00
case 560:
SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
break; //01
case 592:
SHUFPD(XMM0, fpr.R(b), 1);
break; //10
case 624:
UNPCKHPD(XMM0, fpr.R(b));
break; //11
default:
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
}
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0));
fpr.UnlockAll();
}
//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
fpr.Lock(a,b,c,d);
MOVAPD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 14: //madds0
MOVDDUP(XMM1, fpr.R(c));
MULPD(XMM0, R(XMM1));
ADDPD(XMM0, fpr.R(b));
break;
case 15: //madds1
MOVAPD(XMM1, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
MULPD(XMM0, R(XMM1));
ADDPD(XMM0, fpr.R(b));
break;
case 28: //msub
MULPD(XMM0, fpr.R(c));
SUBPD(XMM0, fpr.R(b));
break;
case 29: //madd
MULPD(XMM0, fpr.R(c));
ADDPD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULPD(XMM0, fpr.R(c));
SUBPD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits));
break;
case 31: //nmadd
MULPD(XMM0, fpr.R(c));
ADDPD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits));
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
//Default(inst);
//fpr.UnlockAll();
return;
}
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0));
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}


@ -0,0 +1,149 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "../../Core.h"
#include "../../CoreTiming.h"
#include "../../HW/SystemTimers.h"
#include "../PowerPC.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Thunk.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitRegCache.h"
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
void Jit64::mtspr(UGeckoInstruction inst)
{
u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
switch(iIndex) {
case SPR_LR:
ibuild.EmitStoreLink(ibuild.EmitLoadGReg(inst.RD));
return;
case SPR_CTR:
ibuild.EmitStoreCTR(ibuild.EmitLoadGReg(inst.RD));
return;
default:
printf("mtspr case %d", iIndex);
Default(inst);
return;
}
}
void Jit64::mfspr(UGeckoInstruction inst)
{
u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
switch (iIndex)
{
case SPR_LR:
ibuild.EmitStoreGReg(ibuild.EmitLoadLink(), inst.RD);
return;
case SPR_CTR:
ibuild.EmitStoreGReg(ibuild.EmitLoadCTR(), inst.RD);
return;
default:
printf("mfspr case %d", iIndex);
Default(inst);
return;
}
}
// =======================================================================================
// Don't interpret this; if we do, we get thrown out of the JIT block.
// --------------
void Jit64::mtmsr(UGeckoInstruction inst)
{
ibuild.EmitStoreMSR(ibuild.EmitLoadGReg(inst.RS));
ibuild.EmitBranchUncond(ibuild.EmitIntConst(js.compilerPC + 4));
}
// ==============
void Jit64::mfmsr(UGeckoInstruction inst)
{
ibuild.EmitStoreGReg(ibuild.EmitLoadMSR(), inst.RD);
}
void Jit64::mftb(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
mfspr(inst);
}
void Jit64::mfcr(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
// USES_CR
int d = inst.RD;
gpr.LoadToX64(d, false, true);
MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0]));
SHL(32, R(EAX), Imm8(4));
for (int i = 1; i < 7; i++) {
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i]));
SHL(32, R(EAX), Imm8(4));
}
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7]));
MOV(32, gpr.R(d), R(EAX));
}
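// Scalar reference for the packing above (illustrative only): each 4-bit
// cr_fast field lands in its big-endian nibble of the architectural CR,
// with cr_fast[0] in the top nibble.
static u32 PackCRSketch()
{
	u32 cr = 0;
	for (int i = 0; i < 8; i++)
		cr = (cr << 4) | (PowerPC::ppcState.cr_fast[i] & 0xF);
	return cr;
}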
void Jit64::mtcrf(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
// USES_CR
u32 mask = 0;
u32 crm = inst.CRM;
if (crm == 0xFF) {
gpr.FlushLockX(ECX);
MOV(32, R(EAX), gpr.R(inst.RS));
for (int i = 0; i < 8; i++) {
MOV(32, R(ECX), R(EAX));
SHR(32, R(ECX), Imm8(28 - (i * 4)));
AND(32, R(ECX), Imm32(0xF));
MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX));
}
gpr.UnlockAllX();
} else {
Default(inst);
return;
// The code below is currently unreachable; TODO: translate it to the new CR model.
for (int i = 0; i < 8; i++) {
if (crm & (1 << i))
mask |= 0xF << (i*4);
}
MOV(32, R(EAX), gpr.R(inst.RS));
MOV(32, R(ECX), M(&PowerPC::ppcState.cr));
AND(32, R(EAX), Imm32(mask));
AND(32, R(ECX), Imm32(~mask));
OR(32, R(EAX), R(ECX));
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
}
}
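// Scalar reference for the crm == 0xFF path above (illustrative only):
static void UnpackCRSketch(u32 value)
{
	for (int i = 0; i < 8; i++)
		PowerPC::ppcState.cr_fast[i] = (value >> (28 - i * 4)) & 0xF;
}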


@ -0,0 +1,161 @@
// Copyright (C) 2003-2008 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#include "Common.h"
#include "Thunk.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
#include "../../HW/CommandProcessor.h"
#include "../../HW/PixelEngine.h"
#include "../../HW/Memmap.h"
#include "../PPCTables.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "Jit.h"
#include "JitCache.h"
#include "JitAsm.h"
#include "JitRegCache.h"
void Jit64::JitClearCA()
{
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
}
void Jit64::JitSetCA()
{
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
}
void Jit64::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
{
#ifdef _M_IX86
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, accessSize, reg_value, MDisp(reg_addr, (u32)Memory::base + offset));
#else
MOVZX(32, accessSize, reg_value, MComplex(RBX, reg_addr, SCALE_1, offset));
#endif
if (accessSize == 32)
{
BSWAP(32, reg_value);
}
else if (accessSize == 16)
{
BSWAP(32, reg_value);
if (signExtend)
SAR(32, R(reg_value), Imm8(16));
else
SHR(32, R(reg_value), Imm8(16));
} else if (signExtend) {
// TODO: bake 8-bit into the original load.
MOVSX(32, accessSize, reg_value, R(reg_value));
}
}
void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signExtend)
{
if (offset)
ADD(32, R(reg), Imm32((u32)offset));
TEST(32, R(reg), Imm32(0x0C000000));
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
}
if (signExtend && accessSize < 32) {
// Need to sign extend values coming from the Read_U* functions.
MOVSX(32, accessSize, EAX, R(EAX));
}
FixupBranch arg2 = J();
SetJumpTarget(argh);
UnsafeLoadRegToReg(reg, EAX, accessSize, 0, signExtend);
SetJumpTarget(arg2);
}
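// The TEST/J_CC pair above implements this predicate (illustrative only):
// addresses with none of the 0x0C000000 bits set are plain RAM and can be
// accessed through the fast path; anything else (MMIO, uncached mirrors)
// is routed through the Memory::Read_* / Write_* functions.
static inline bool IsFastMemAddressSketch(u32 address)
{
	return (address & 0x0C000000) == 0;
}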
void Jit64::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset)
{
if (accessSize == 8 && reg_value >= 4) {
PanicAlert("WARNING: likely incorrect use of UnsafeWriteRegToReg!");
}
BSWAP(accessSize, reg_value);
#ifdef _M_IX86
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
MOV(accessSize, MDisp(reg_addr, (u32)Memory::base + offset), R(reg_value));
#else
MOV(accessSize, MComplex(RBX, reg_addr, SCALE_1, offset), R(reg_value));
#endif
}
// Destroys both arg registers
void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset)
{
if (offset)
ADD(32, R(reg_addr), Imm32(offset));
TEST(32, R(reg_addr), Imm32(0x0C000000));
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), reg_value, reg_addr); break;
case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), reg_value, reg_addr); break;
case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), reg_value, reg_addr); break;
}
FixupBranch arg2 = J();
SetJumpTarget(argh);
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0);
SetJumpTarget(arg2);
}
void Jit64::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
{
#ifdef _M_X64
MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
#else
MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
#endif
}
void Jit64::WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address)
{
#ifdef _M_X64
MOV(32, R(RAX), Imm32(address));
MOVSS(MComplex(RBX, RAX, 1, 0), xmm_reg);
#else
MOVSS(M((void*)((u32)Memory::base + (address & Memory::MEMVIEW32_MASK))), xmm_reg);
#endif
}
void Jit64::ForceSinglePrecisionS(X64Reg xmm) {
// Most games don't need this. Zelda requires it, though; some of its platforms get stuck without it.
if (jo.accurateSinglePrecision)
{
CVTSD2SS(xmm, R(xmm));
CVTSS2SD(xmm, R(xmm));
}
}
void Jit64::ForceSinglePrecisionP(X64Reg xmm) {
// Most games don't need this. Zelda requires it, though; some of its platforms get stuck without it.
if (jo.accurateSinglePrecision)
{
CVTPD2PS(xmm, R(xmm));
CVTPS2PD(xmm, R(xmm));
}
}
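// Host-side illustration (not emitted code) of what the round-trip does:
// real hardware rounds paired-single results to 32-bit floats, so staying
// in double precision can make guest values drift from hardware behavior.
static inline double RoundToSingleSketch(double x)
{
	return (double)(float)x; // same rounding as CVTSD2SS followed by CVTSS2SD
}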