Implements block time profiling on ARMv7.

This was interesting implementing.
Our generic QueryPerformanceCounter function on ARMv7 was so slow that profiling a block was impossible.
I waited about five minutes and I couldn't even get a single frame to output.
This instead uses ARMv7's PMU to get cycle counts, which are a relatively minor performance drop in my testing.
One disadvantage of this method is that the kernel can lock us out of using these co-processor registers, but it seems to work on my Jetson board.
Another disadvantage is that we aren't having block times in "real" time but cycles instead, not too big of a deal.

This also removes instruction run counts from profiling because that's just annoying and we don't expose an interface for even getting those results
from our UI.
This commit is contained in:
Ryan Houdek 2014-11-16 09:20:01 +00:00
parent b24197f913
commit 30e1749d00
4 changed files with 90 additions and 32 deletions

View file

@ -150,6 +150,10 @@ void JitArm::WriteExitDestInR(ARMReg Reg)
STR(Reg, R9, PPCSTATE_OFF(pc));
Cleanup();
DoDownCount();
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
MOVI2R(Reg, (u32)asm_routines.dispatcher);
B(Reg);
gpr.Unlock(Reg);
@ -160,6 +164,9 @@ void JitArm::WriteRfiExitDestInR(ARMReg Reg)
Cleanup();
DoDownCount();
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
ARMReg A = gpr.GetReg(false);
LDR(A, R9, PPCSTATE_OFF(pc));
@ -177,6 +184,9 @@ void JitArm::WriteExceptionExit()
Cleanup();
DoDownCount();
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
ARMReg A = gpr.GetReg(false);
LDR(A, R9, PPCSTATE_OFF(pc));
@ -193,6 +203,10 @@ void JitArm::WriteExit(u32 destination)
Cleanup();
DoDownCount();
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
JitBlock::LinkData linkData;
@ -273,6 +287,64 @@ void JitArm::Break(UGeckoInstruction inst)
BKPT(0x4444);
}
void JitArm::BeginTimeProfile(JitBlock* b)
{
b->ticCounter = 0;
b->ticStart = 0;
b->ticStop = 0;
// Performance counters are bit finnicky on ARM
// We must first enable and program the PMU before using it
// This is a per core operation so with thread scheduling we may jump to a core we haven't enabled PMU yet
// Work around this by enabling PMU each time at the start of a block
// Some ARM CPUs are getting absurd core counts(48+!)
// We have to reset counters at the start of every block anyway, so may as well.
// One thing to note about performance counters on ARM
// The kernel can block access to these co-processor registers
// In the case that this happens, these will generate a SIGILL
// Refer to the ARM ARM about PMCR for what these do exactly
enum
{
PERF_OPTION_ENABLE = (1 << 0),
PERF_OPTION_RESET_CR = (1 << 1),
PERF_OPTION_RESET_CCR = (1 << 2),
PERF_OPTION_DIVIDER_MODE = (1 << 3),
PERF_OPTION_EXPORT_ENABLE = (1 << 4),
};
const u32 perf_options =
PERF_OPTION_ENABLE |
PERF_OPTION_RESET_CR |
PERF_OPTION_RESET_CCR |
PERF_OPTION_EXPORT_ENABLE;
MOVI2R(R0, perf_options);
// Programs the PMCR
MCR(15, 0, R0, 9, 12, 0);
MOVI2R(R0, 0x8000000F);
// Enables all counters
MCR(15, 0, R0, 9, 12, 1);
// Clears all counter overflows
MCR(15, 0, R0, 9, 12, 3);
// Gets the cycle counter
MRC(15, 0, R1, 9, 13, 0);
MOVI2R(R0, (u32)&b->ticStart);
STR(R1, R0, 0);
}
void JitArm::EndTimeProfile(JitBlock* b)
{
// Gets the cycle counter
MRC(15, 0, R1, 9, 13, 0);
MOVI2R(R0, (u32)&b->ticStop);
STR(R1, R0, 0);
MOVI2R(R0, (u32)&b->ticStart);
MOVI2R(R14, (u32)asm_routines.m_increment_profile_counter);
BL(R14);
}
const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
{
int blockSize = code_buf->GetSize();
@ -362,8 +434,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
LDR(rB, rA); // Load the actual value in to R11.
ADD(rB, rB, 1); // Add one to the value
STR(rB, rA); // Now store it back in the memory location
// get start tic
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStart);
BeginTimeProfile(b);
gpr.Unlock(rA, rB);
}
gpr.Start(js.gpa);
@ -390,16 +461,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
js.next_inst = 0;
if (Profiler::g_ProfileBlocks)
{
// CAUTION!!! push on stack regs you use, do your stuff, then pop
PROFILER_VPUSH;
// get end tic
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
// tic counter += (end tic - start tic)
PROFILER_UPDATE_TIME(&b);
PROFILER_VPOP;
}
}
else
{
@ -416,26 +477,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
POP(4, R0, R1, R2, R3);
}
if (Profiler::g_ProfileBlocks)
{
// Add run count
static const u64 One = 1;
ARMReg RA = gpr.GetReg();
ARMReg RB = gpr.GetReg();
ARMReg VA = fpr.GetReg();
ARMReg VB = fpr.GetReg();
MOVI2R(RA, (u32)&opinfo->runCount);
MOVI2R(RB, (u32)&One);
VLDR(VA, RA, 0);
VLDR(VB, RB, 0);
NEONXEmitter nemit(this);
nemit.VADD(I_64, VA, VA, VB);
VSTR(VA, RA, 0);
gpr.Unlock(RA, RB);
fpr.Unlock(VA);
fpr.Unlock(VB);
}
if (!ops[i].skip)
{
if (js.memcheck && (opinfo->flags & FL_USE_FPU))

View file

@ -58,6 +58,10 @@ private:
ArmGen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
bool BackPatch(SContext* ctx);
void BeginTimeProfile(JitBlock* b);
void EndTimeProfile(JitBlock* b);
public:
JitArm() : code_buffer(32000) {}
~JitArm() {}

View file

@ -609,4 +609,15 @@ void JitArmAsmRoutineManager::GenerateCommon()
pairedStoreQuantized[14] = storeSingleS8;
pairedStoreQuantized[15] = storeSingleS16;
m_increment_profile_counter = AlignCode16();
nemit.VLD1(I_64, D0, R0); // Start
ADD(R0, R0, 8);
nemit.VLD1(I_64, D1, R0); // End
ADD(R0, R0, 8);
nemit.VLD1(I_64, D2, R0); // Counter
nemit.VSUB(I_64, D1, D1, D0);
nemit.VADD(I_64, D2, D2, D1);
nemit.VST1(I_64, D2, R0);
MOV(_PC, _LR);
}

View file

@ -14,6 +14,8 @@ private:
void GenerateCommon();
public:
const u8* m_increment_profile_counter;
void Init()
{
AllocCodeSpace(8192);