Vulkan: Add profiler for Vulkan API CPU cost

Disabled by default. Set VULKAN_API_CPU_BENCHMARK to 1 to enable
This commit is contained in:
Exzap 2023-09-27 08:11:57 +02:00
parent 3e925b7707
commit f9f6206929
5 changed files with 99 additions and 13 deletions

View file

@ -8,7 +8,7 @@
#include "Cafe/HW/Latte/Core/LatteTexture.h"
#include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h"
//#define LOG_READBACK_TIME
#define LOG_READBACK_TIME
struct LatteTextureReadbackQueueEntry
{
@ -47,9 +47,7 @@ bool LatteTextureReadback_Update(bool forceStart)
{
#ifdef LOG_READBACK_TIME
double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick());
char initiateElapsedTimeStr[32];
sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate);
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no");
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {:.4} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, elapsedSecondsSinceInitiate, forceStart?"yes":"no");
#endif
LatteTextureReadback_StartTransfer(entry.textureView);
// remove element
@ -83,7 +81,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView)
// currently we don't support readback for resized textures
if (textureView->baseTexture->overwriteInfo.hasResolutionOverwrite)
{
cemuLog_log(LogType::Force, "_initate(): Readback is not supported for textures with modified resolution");
cemuLog_log(LogType::Force, "Texture readback is not supported for textures with modified resolution. Texture: {:08x} {}x{}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height);
return;
}
// check if texture isn't already queued for transfer
@ -124,7 +122,7 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
{
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick());
forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
cemuLog_log(LogType::TextureReadback, "[Texture-Readback] Force-finish: {:08x} Res {:}/{:} TM {:} FMT {:04x} Transfer time so far: {:.4}ms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
}
#endif
readbackInfo->forceFinish = true;
@ -146,7 +144,7 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
HRTick currentTick = HighResolutionTimer().now().getTick();
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
cemuLog_log(LogType::TextureReadback, "[Texture-Readback] {:08x} Res {}/{} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
}
#endif
uint8* pixelData = readbackInfo->GetData();

View file

@ -1,13 +1,81 @@
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#define VKFUNC_DEFINE
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#include <numeric> // for std::iota
#if BOOST_OS_LINUX || BOOST_OS_MACOS
#include <dlfcn.h>
#endif
#define VULKAN_API_CPU_BENCHMARK 0 // if 1, Cemu will log the CPU time spent per Vulkan API function
bool g_vulkan_available = false;
#if VULKAN_API_CPU_BENCHMARK != 0
uint64 s_vulkanBenchmarkLastResultsTime = 0;
struct VulkanBenchmarkFuncInfo
{
std::string funcName;
uint64 cycles;
uint32 numCalls;
};
std::vector<VulkanBenchmarkFuncInfo*> s_vulkanBenchmarkFuncs;
template<typename TRet, typename... Args>
auto VkWrapperFuncGenTest(TRet (*func)(Args...), const char* name)
{
static VulkanBenchmarkFuncInfo _FuncInfo;
static auto _FuncPtrCopy = func;
TRet (*newFunc)(Args...);
if constexpr(std::is_void_v<TRet>)
{
newFunc = +[](Args... args) { uint64 t = __rdtsc(); _mm_mfence(); _FuncPtrCopy(args...); _mm_mfence(); _FuncInfo.cycles += (__rdtsc() - t); _FuncInfo.numCalls++; };
}
else
newFunc = +[](Args... args) -> TRet { uint64 t = __rdtsc(); _mm_mfence(); TRet r = _FuncPtrCopy(args...); _mm_mfence(); _FuncInfo.cycles += (__rdtsc() - t); _FuncInfo.numCalls++; return r; };
if(func && func != newFunc)
_FuncPtrCopy = func;
if(_FuncInfo.funcName.empty())
{
_FuncInfo = {.funcName = name, .cycles = 0, .numCalls = 0};
s_vulkanBenchmarkFuncs.emplace_back(&_FuncInfo);
}
return newFunc;
};
#endif
// called when a TV SwapBuffers is called
void VulkanBenchmarkPrintResults()
{
#if VULKAN_API_CPU_BENCHMARK != 0
// note: This could be done by hooking vk present functions
uint64 currentCycle = __rdtsc();
uint64 elapsedCycles = currentCycle - s_vulkanBenchmarkLastResultsTime;
s_vulkanBenchmarkLastResultsTime = currentCycle;
double elapsedCyclesDbl = (double)elapsedCycles;
cemuLog_log(LogType::Force, "--- Vulkan API CPU benchmark ---");
cemuLog_log(LogType::Force, "Elapsed cycles this frame: {:} | Current cycle {:} | NumFunc {:}", elapsedCycles, currentCycle, s_vulkanBenchmarkFuncs.size());
std::vector<sint32> sortedIndices(s_vulkanBenchmarkFuncs.size());
std::iota(sortedIndices.begin(), sortedIndices.end(), 0);
std::sort(sortedIndices.begin(), sortedIndices.end(),
[](int32_t a, int32_t b) {
return s_vulkanBenchmarkFuncs[a]->cycles > s_vulkanBenchmarkFuncs[b]->cycles;
});
for (sint32 idx : sortedIndices)
{
auto& func = s_vulkanBenchmarkFuncs[idx];
if(func->cycles == 0)
return;
cemuLog_log(LogType::Force, "{}: {} cycles ({:.4}%) {} calls", func->funcName.c_str(), func->cycles, ((double)func->cycles / elapsedCyclesDbl) * 100.0, func->numCalls);
func->cycles = 0;
func->numCalls = 0;
}
#endif
}
#if BOOST_OS_WINDOWS
bool InitializeGlobalVulkan()
@ -57,7 +125,12 @@ bool InitializeDeviceVulkan(VkDevice device)
#define VKFUNC_DEVICE_INIT
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#if VULKAN_API_CPU_BENCHMARK != 0
#define VKFUNC_DEFINE_CUSTOM(__func) __func = VkWrapperFuncGenTest(__func, #__func)
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#endif
return true;
}
@ -121,7 +194,12 @@ bool InitializeDeviceVulkan(VkDevice device)
#define VKFUNC_DEVICE_INIT
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#if VULKAN_API_CPU_BENCHMARK != 0
#define VKFUNC_DEFINE_CUSTOM(__func) __func = VkWrapperFuncGenTest(__func, #__func)
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#endif
return true;
}

View file

@ -14,7 +14,11 @@ extern bool g_vulkan_available;
#endif
#ifdef VKFUNC_DEFINE
#ifdef VKFUNC_DEFINE_CUSTOM
#define VKFUNC(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__)
#define VKFUNC_INSTANCE(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__)
#define VKFUNC_DEVICE(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__)
#elif defined(VKFUNC_DEFINE)
#define VKFUNC(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr
#define VKFUNC_INSTANCE(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr
#define VKFUNC_DEVICE(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr
@ -238,4 +242,5 @@ VKFUNC_DEVICE(vkDestroyDescriptorSetLayout);
#undef VKFUNC
#undef VKFUNC_INSTANCE
#undef VKFUNC_DEVICE
#undef VKFUNC_DEVICE
#undef VKFUNC_DEFINE_CUSTOM

View file

@ -2768,6 +2768,8 @@ void VulkanRenderer::NotifyLatteCommandProcessorIdle()
SubmitCommandBuffer();
}
void VulkanBenchmarkPrintResults();
void VulkanRenderer::SwapBuffers(bool swapTV, bool swapDRC)
{
SubmitCommandBuffer();
@ -2777,6 +2779,9 @@ void VulkanRenderer::SwapBuffers(bool swapTV, bool swapDRC)
if (swapDRC && IsSwapchainInfoValid(false))
SwapBuffer(false);
if(swapTV)
VulkanBenchmarkPrintResults();
}
void VulkanRenderer::ClearColorbuffer(bool padView)

View file

@ -1574,7 +1574,7 @@ void VulkanRenderer::draw_updateVertexBuffersDirectAccess()
uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1;
uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF;
if (bufferAddress == MPTR_NULL)
if (bufferAddress == MPTR_NULL) [[unlikely]]
{
bufferAddress = 0x10000000;
}
@ -1597,7 +1597,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader
MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0];
uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1;
if (physicalAddr == MPTR_NULL)
if (physicalAddr == MPTR_NULL) [[unlikely]]
{
cemu_assert_unimplemented();
continue;