Latte: Optimizations and tweaks (#706)

This commit is contained in:
Exzap 2023-09-19 21:17:21 +02:00 committed by GitHub
parent 323bdfa183
commit 90c56b7731
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 822 additions and 482 deletions

File diff suppressed because it is too large Load diff

View file

@ -26,6 +26,7 @@ struct OverlayStats
double fps{};
uint32 draw_calls_per_frame{};
uint32 fast_draw_calls_per_frame{};
float cpu_usage{}; // cemu cpu usage in %
std::vector<float> cpu_per_core; // global cpu usage in % per core
uint32 ram_usage{}; // ram usage in MB
@ -86,7 +87,7 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
ImGui::Text("FPS: %.2lf", g_state.fps);
if (config.overlay.drawcalls)
ImGui::Text("Draws/f: %d", g_state.draw_calls_per_frame);
ImGui::Text("Draws/f: %d (fast: %d)", g_state.draw_calls_per_frame, g_state.fast_draw_calls_per_frame);
if (config.overlay.cpu_usage)
ImGui::Text("CPU: %.2lf%%", g_state.cpu_usage);
@ -588,13 +589,14 @@ static void UpdateStats_CpuPerCore()
}
}
void LatteOverlay_updateStats(double fps, sint32 drawcalls)
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls)
{
if (GetConfig().overlay.position == ScreenPosition::kDisabled)
return;
g_state.fps = fps;
g_state.draw_calls_per_frame = drawcalls;
g_state.fast_draw_calls_per_frame = fastDrawcalls;
UpdateStats_CemuCpu();
UpdateStats_CpuPerCore();

View file

@ -2,6 +2,6 @@
void LatteOverlay_init();
void LatteOverlay_render(bool pad_view);
void LatteOverlay_updateStats(double fps, sint32 drawcalls);
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls);
void LatteOverlay_pushNotification(const std::string& text, sint32 duration);

View file

@ -38,6 +38,7 @@ void LattePerformanceMonitor_frameEnd()
uint64 indexDataCached = 0;
uint32 frameCounter = 0;
uint32 drawCallCounter = 0;
uint32 fastDrawCallCounter = 0;
uint32 shaderBindCounter = 0;
uint32 recompilerLeaveCount = 0;
uint32 threadLeaveCount = 0;
@ -53,6 +54,7 @@ void LattePerformanceMonitor_frameEnd()
indexDataCached += performanceMonitor.cycle[i].indexDataCached;
frameCounter += performanceMonitor.cycle[i].frameCounter;
drawCallCounter += performanceMonitor.cycle[i].drawCallCounter;
fastDrawCallCounter += performanceMonitor.cycle[i].fastDrawCallCounter;
shaderBindCounter += performanceMonitor.cycle[i].shaderBindCount;
recompilerLeaveCount += performanceMonitor.cycle[i].recompilerLeaveCount;
threadLeaveCount += performanceMonitor.cycle[i].threadLeaveCount;
@ -75,7 +77,6 @@ void LattePerformanceMonitor_frameEnd()
indexDataUploadPerFrame /= 1024ULL;
double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
uint32 drawCallsPerFrame = drawCallCounter / elapsedFrames;
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
passedCycles = passedCycles * 1000ULL / totalElapsedTime;
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
@ -85,6 +86,7 @@ void LattePerformanceMonitor_frameEnd()
// next counter cycle
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
performanceMonitor.cycle[nextCycleIndex].fastDrawCallCounter = 0;
performanceMonitor.cycle[nextCycleIndex].frameCounter = 0;
performanceMonitor.cycle[nextCycleIndex].shaderBindCount = 0;
performanceMonitor.cycle[nextCycleIndex].lastCycleCount = PPCInterpreter_getMainCoreCycleCounter();
@ -104,12 +106,12 @@ void LattePerformanceMonitor_frameEnd()
if (isFirstUpdate)
{
LatteOverlay_updateStats(0.0, 0);
LatteOverlay_updateStats(0.0, 0, 0);
gui_updateWindowTitles(false, false, 0.0);
}
else
{
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames);
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames, fastDrawCallCounter / elapsedFrames);
gui_updateWindowTitles(false, false, fps);
}
}

View file

@ -84,6 +84,7 @@ typedef struct
uint32 lastUpdate;
uint32 frameCounter;
uint32 drawCallCounter;
uint32 fastDrawCallCounter;
uint32 shaderBindCount;
uint64 vertexDataUploaded; // amount of vertex data uploaded to GPU (bytes)
uint64 vertexDataCached; // amount of vertex data reused from GPU cache (bytes)

View file

@ -295,6 +295,34 @@ LatteTextureView* LatteMRT::GetColorAttachmentTexture(uint32 index, bool createN
uint32 colorBufferHeight = pitchHeight / colorBufferPitch;
uint32 colorBufferWidth = colorBufferPitch;
// colorbuffer width/height has to be padded to 8/32 alignment but the actual resolution might be smaller
// use the scissor box as a clue to figure out the original resolution if possible
#if 0
uint32 scissorBoxWidth = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_X();
uint32 scissorBoxHeight = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_Y();
if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth)
colorBufferWidth = scissorBoxWidth;
if (((colorBufferHeight + 31) & ~31) == colorBufferHeight)
colorBufferHeight = scissorBoxHeight;
#endif
// log resolution changes if the above heuristic takes effect
// this is useful to find resolutions which need to be updated in gfx pack texture rules
#if 0
uint32 colorBufferHeight2 = pitchHeight / colorBufferPitch;
static std::unordered_set<uint64> s_foundColorBufferResMappings;
if (colorBufferPitch != colorBufferWidth || colorBufferHeight != colorBufferHeight2)
{
// only log unique, source and dest resolution. Encode into a key with 16 bits per component
uint64 resHash = (uint64)colorBufferWidth | ((uint64)colorBufferHeight << 16) | ((uint64)colorBufferPitch << 32) | ((uint64)colorBufferHeight2 << 48);
if( !s_foundColorBufferResMappings.contains(resHash) )
{
s_foundColorBufferResMappings.insert(resHash);
cemuLog_log(LogType::Force, "[COLORBUFFER-DBG] Using res {}x{} instead of {}x{}", colorBufferWidth, colorBufferHeight, colorBufferPitch, colorBufferHeight2);
}
}
#endif
bool colorBufferWasFound = false;
sint32 viewFirstMip = 0; // todo

View file

@ -8,10 +8,11 @@
#include "Cafe/HW/Latte/Core/LatteTexture.h"
#include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h"
// #define LOG_READBACK_TIME
//#define LOG_READBACK_TIME
// One scheduled texture readback: a texture view that has been queued but whose
// transfer has not been started yet. Entries are appended to sTextureScheduledReadbacks.
struct LatteTextureReadbackQueueEntry
{
// high-resolution timer tick captured when the entry was queued;
// in the code visible here it is only read by the LOG_READBACK_TIME diagnostics
HRTick initiateTime;
// value of LatteGPUState.drawCallCounter when the entry was queued;
// subtracted from the current counter to get the number of elapsed drawcalls
uint32 lastUpdateDrawcallIndex;
// texture view that is handed to LatteTextureReadback_StartTransfer once the transfer begins
LatteTextureView* textureView;
};
@ -22,12 +23,12 @@ std::queue<LatteTextureReadbackInfo*> sTextureActiveReadbackQueue; // readbacks
// Starts the readback transfer for the given texture view:
// logs the texture parameters, creates a readback info object through the renderer,
// appends it to sTextureActiveReadbackQueue and kicks off the transfer.
// textureView: view to read back; textureView->baseTexture is dereferenced, so both must be non-null.
void LatteTextureReadback_StartTransfer(LatteTextureView* textureView)
{
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Start] PhysAddr {:08x} Res {}x{} Fmt {} Slice {} Mip {}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->format, textureView->firstSlice, textureView->firstMip);
// sample the timer before the readback object is created and the transfer is started
HRTick currentTick = HighResolutionTimer().now().getTick();
// create info entry and append it to the active readback queue (a std::queue, so first-in first-out)
LatteTextureReadbackInfo* readbackInfo = g_renderer->texture_createReadback(textureView);
sTextureActiveReadbackQueue.push(readbackInfo);
readbackInfo->StartTransfer();
//debug_printf("[Tex-Readback] %08x %dx%d TM %d FMT %04x\n", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->tileMode, textureView->baseTexture->format);
// NOTE(review): transferStartTime is assigned twice in a row; only the second assignment
// (the tick sampled at the top of the function) takes effect. The first one looks like a
// leftover pre-change line - confirm and remove it.
readbackInfo->transferStartTime = HighResolutionTimer().now().getTick();
readbackInfo->transferStartTime = currentTick;
}
/*
@ -41,9 +42,15 @@ bool LatteTextureReadback_Update(bool forceStart)
for (size_t i = 0; i < sTextureScheduledReadbacks.size(); i++)
{
LatteTextureReadbackQueueEntry& entry = sTextureScheduledReadbacks[i];
uint32 numPassedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
if (forceStart || numPassedDrawcalls >= 5)
uint32 numElapsedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
if (forceStart || numElapsedDrawcalls >= 5)
{
#ifdef LOG_READBACK_TIME
double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick());
char initiateElapsedTimeStr[32];
sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate);
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no");
#endif
LatteTextureReadback_StartTransfer(entry.textureView);
// remove element
vectorRemoveByIndex(sTextureScheduledReadbacks, i);
@ -91,6 +98,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView)
}
// queue
LatteTextureReadbackQueueEntry queueEntry;
queueEntry.initiateTime = HighResolutionTimer().now().getTick();
queueEntry.textureView = textureView;
queueEntry.lastUpdateDrawcallIndex = LatteGPUState.drawCallCounter;
sTextureScheduledReadbacks.emplace_back(queueEntry);
@ -112,6 +120,14 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
if (!readbackInfo->IsFinished())
{
readbackInfo->waitStartTime = HighResolutionTimer().now().getTick();
#ifdef LOG_READBACK_TIME
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
{
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick());
forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
}
#endif
readbackInfo->forceFinish = true;
readbackInfo->ForceFinish();
// rerun logic since ->ForceFinish() can recursively call this function and thus modify the queue
continue;
@ -125,10 +141,13 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
}
// performance testing
#ifdef LOG_READBACK_TIME
HRTick currentTick = HighResolutionTimer().now().getTick();
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
cemuLog_log(LogType::Force, "[Texture-Readback] {:08x} Res {:4}/{:4} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, forceFinish?"yes":"no");
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
{
HRTick currentTick = HighResolutionTimer().now().getTick();
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
}
#endif
uint8* pixelData = readbackInfo->GetData();
LatteTextureLoader_writeReadbackTextureToMemory(&readbackInfo->hostTextureCopy, 0, 0, pixelData);

View file

@ -21,6 +21,7 @@ public:
HRTick transferStartTime;
HRTick waitStartTime;
bool forceFinish{ false }; // set to true if not finished in time for dependent operation
// texture info
LatteTextureDefinition hostTextureCopy{};

View file

@ -484,7 +484,7 @@ namespace Latte
SQ_TEX_RESOURCE_WORD0_N_GS = 0xE930,
SQ_TEX_RESOURCE_WORD_FIRST = SQ_TEX_RESOURCE_WORD0_N_PS,
SQ_TEX_RESOURCE_WORD_LAST = (SQ_TEX_RESOURCE_WORD0_N_GS + GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7 - 1),
// there are 54 samplers with 3 registers each. 18 per stage. For stage indices see SAMPLER_BASE_INDEX_*
// there are 54 samplers with 3 registers each. 18 (actually only 16?) per stage. For stage indices see SAMPLER_BASE_INDEX_*
SQ_TEX_SAMPLER_WORD0_0 = 0xF000,
SQ_TEX_SAMPLER_WORD1_0 = 0xF001,
SQ_TEX_SAMPLER_WORD2_0 = 0xF002,

View file

@ -2002,7 +2002,7 @@ void VulkanRenderer::SubmitCommandBuffer(VkSemaphore signalSemaphore, VkSemaphor
occlusionQuery_notifyBeginCommandBuffer();
m_recordedDrawcalls = 0;
m_submitThreshold = 500; // this used to be 750 before 1.25.5, but more frequent submission is actually better for latency
m_submitThreshold = 300;
m_submitOnIdle = false;
}