D3D12: Implement perf query support

This commit is contained in:
Stenzek 2016-03-06 21:34:41 +10:00
parent 25d5da0ea3
commit 4269abdc3e
5 changed files with 280 additions and 25 deletions

View file

@ -274,6 +274,45 @@ void ID3D12QueuedCommandList::BackgroundThreadFunction(ID3D12QueuedCommandList*
break;
}
case D3DQueueItemType::BeginQuery:
{
	// Replay a queued BeginQuery on the real command list, then step past its payload.
	const BeginQueryArguments& args = reinterpret_cast<D3DQueueItem*>(item)->BeginQuery;
	command_list->BeginQuery(args.pQueryHeap, args.Type, args.Index);

	item += BufferOffsetForQueueItemType<BeginQueryArguments>();
	break;
}
case D3DQueueItemType::EndQuery:
{
	// Replay a queued EndQuery on the real command list, then step past its payload.
	const EndQueryArguments& args = reinterpret_cast<D3DQueueItem*>(item)->EndQuery;
	command_list->EndQuery(args.pQueryHeap, args.Type, args.Index);

	item += BufferOffsetForQueueItemType<EndQueryArguments>();
	break;
}
case D3DQueueItemType::ResolveQueryData:
{
	// Replay a queued ResolveQueryData on the real command list, then step past its payload.
	const ResolveQueryDataArguments& args = reinterpret_cast<D3DQueueItem*>(item)->ResolveQueryData;
	command_list->ResolveQueryData(
		args.pQueryHeap,
		args.Type,
		args.StartElement,
		args.ElementCount,
		args.pDestinationBuffer,
		args.AlignedDestinationBufferOffset
	);

	item += BufferOffsetForQueueItemType<ResolveQueryDataArguments>();
	break;
}
case D3DQueueItemType::CloseCommandList:
{
CheckHR(command_list->Close());
@ -916,8 +955,14 @@ void STDMETHODCALLTYPE ID3D12QueuedCommandList::BeginQuery(
_In_ UINT Index
)
{
// Function not implemented yet.
DEBUGCHECK(0, "Function not implemented yet.");
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->Type = D3DQueueItemType::BeginQuery;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->BeginQuery.pQueryHeap = pQueryHeap;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->BeginQuery.Type = Type;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->BeginQuery.Index = Index;
m_queue_array_back += BufferOffsetForQueueItemType<BeginQueryArguments>();
CheckForOverflow();
}
void STDMETHODCALLTYPE ID3D12QueuedCommandList::EndQuery(
@ -926,8 +971,14 @@ void STDMETHODCALLTYPE ID3D12QueuedCommandList::EndQuery(
_In_ UINT Index
)
{
// Function not implemented yet.
DEBUGCHECK(0, "Function not implemented yet.");
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->Type = D3DQueueItemType::EndQuery;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->EndQuery.pQueryHeap = pQueryHeap;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->EndQuery.Type = Type;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->EndQuery.Index = Index;
m_queue_array_back += BufferOffsetForQueueItemType<EndQueryArguments>();
CheckForOverflow();
}
void STDMETHODCALLTYPE ID3D12QueuedCommandList::ResolveQueryData(
@ -939,8 +990,17 @@ void STDMETHODCALLTYPE ID3D12QueuedCommandList::ResolveQueryData(
_In_ UINT64 AlignedDestinationBufferOffset
)
{
// Function not implemented yet.
DEBUGCHECK(0, "Function not implemented yet.");
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->Type = D3DQueueItemType::ResolveQueryData;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->ResolveQueryData.pQueryHeap = pQueryHeap;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->ResolveQueryData.Type = Type;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->ResolveQueryData.StartElement = StartElement;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->ResolveQueryData.ElementCount = ElementCount;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->ResolveQueryData.pDestinationBuffer = pDestinationBuffer;
reinterpret_cast<D3DQueueItem*>(m_queue_array_back)->ResolveQueryData.AlignedDestinationBufferOffset = AlignedDestinationBufferOffset;
m_queue_array_back += BufferOffsetForQueueItemType<ResolveQueryDataArguments>();
CheckForOverflow();
}
void STDMETHODCALLTYPE ID3D12QueuedCommandList::SetPredication(

View file

@ -35,6 +35,9 @@ enum D3DQueueItemType
SetDescriptorHeaps,
ResourceBarrier,
ResolveSubresource,
BeginQuery,
EndQuery,
ResolveQueryData,
ExecuteCommandList,
CloseCommandList,
Present,
@ -170,6 +173,30 @@ struct ResolveSubresourceArguments
DXGI_FORMAT Format;
};
// Payload stored in the queue for a deferred BeginQuery call. The fields are
// filled from the arguments of ID3D12QueuedCommandList::BeginQuery and are
// forwarded unchanged to the real command list when the item is replayed.
struct BeginQueryArguments
{
// Query heap that contains the query being started.
ID3D12QueryHeap* pQueryHeap;
// Type of query being started.
D3D12_QUERY_TYPE Type;
// Index value passed through to BeginQuery.
UINT Index;
};
// Payload stored in the queue for a deferred EndQuery call. The fields are
// filled from the arguments of ID3D12QueuedCommandList::EndQuery and are
// forwarded unchanged to the real command list when the item is replayed.
struct EndQueryArguments
{
// Query heap that contains the query being ended.
ID3D12QueryHeap* pQueryHeap;
// Type of query being ended.
D3D12_QUERY_TYPE Type;
// Index value passed through to EndQuery.
UINT Index;
};
// Payload stored in the queue for a deferred ResolveQueryData call. The fields
// are filled from the arguments of ID3D12QueuedCommandList::ResolveQueryData
// and are forwarded unchanged to the real command list when the item is replayed.
struct ResolveQueryDataArguments
{
// Query heap holding the queries to resolve.
ID3D12QueryHeap* pQueryHeap;
// Type of the queries being resolved.
D3D12_QUERY_TYPE Type;
// First query to resolve.
UINT StartElement;
// Number of queries to resolve.
UINT ElementCount;
// Buffer that receives the resolved data.
ID3D12Resource* pDestinationBuffer;
// Offset into pDestinationBuffer at which the resolved data is written.
UINT64 AlignedDestinationBufferOffset;
};
// Payload for a deferred CloseCommandList item. Intentionally empty: the
// replay side calls Close() without arguments.
struct CloseCommandListArguments
{
};
@ -239,6 +266,9 @@ struct D3DQueueItem
SetDescriptorHeapsArguments SetDescriptorHeaps;
ResourceBarrierArguments ResourceBarrier;
ResolveSubresourceArguments ResolveSubresource;
BeginQueryArguments BeginQuery;
EndQueryArguments EndQuery;
ResolveQueryDataArguments ResolveQueryData;
CloseCommandListArguments CloseCommandList;
ExecuteCommandListArguments ExecuteCommandList;
PresentArguments Present;

View file

@ -2,68 +2,215 @@
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <algorithm>
#include "Common/CommonFuncs.h"
#include "Common/CommonTypes.h"
#include "Common/Logging/Log.h"
#include "VideoBackends/D3D12/D3DBase.h"
#include "VideoBackends/D3D12/D3DCommandListManager.h"
#include "VideoBackends/D3D12/PerfQuery.h"
#include "VideoCommon/RenderBase.h"
//D3D12TODO: Implement PerfQuery class.
namespace DX12
{
// Creates the GPU objects backing the perf queries:
//  - an occlusion query heap with PERF_QUERY_BUFFER_SIZE slots,
//  - a readback-heap buffer of QUERY_READBACK_BUFFER_SIZE bytes that resolved
//    query results are copied into,
// and registers QueueFenceCallback with the command list manager. The value
// returned by that registration is kept as m_tracking_fence.
PerfQuery::PerfQuery()
{
	D3D12_QUERY_HEAP_DESC desc = { D3D12_QUERY_HEAP_TYPE_OCCLUSION, PERF_QUERY_BUFFER_SIZE, 0 };
	CheckHR(D3D::device12->CreateQueryHeap(&desc, IID_PPV_ARGS(&m_query_heap)));

	// Use named locals rather than `&Temporary(...)`: taking the address of an
	// rvalue is a compiler extension, not standard C++.
	const CD3DX12_HEAP_PROPERTIES readback_heap_properties(D3D12_HEAP_TYPE_READBACK);
	const CD3DX12_RESOURCE_DESC readback_buffer_desc = CD3DX12_RESOURCE_DESC::Buffer(QUERY_READBACK_BUFFER_SIZE);

	CheckHR(D3D::device12->CreateCommittedResource(
		&readback_heap_properties,
		D3D12_HEAP_FLAG_NONE,
		&readback_buffer_desc,
		D3D12_RESOURCE_STATE_COPY_DEST,
		nullptr,
		IID_PPV_ARGS(&m_query_readback_buffer)));

	m_tracking_fence = D3D::command_list_mgr->RegisterQueueFenceCallback(this, &PerfQuery::QueueFenceCallback);
}
// Unregisters the fence callback installed by the constructor, then releases
// the query heap and the readback buffer.
// NOTE(review): m_tracking_fence is not released here; presumably it is owned
// by the command list manager - confirm.
PerfQuery::~PerfQuery()
{
// Stop receiving fence notifications before the GPU objects go away.
D3D::command_list_mgr->RemoveQueueFenceCallback(this);
SAFE_RELEASE(m_query_heap);
SAFE_RELEASE(m_query_readback_buffer);
}
// Begins an occlusion query for `type` in the next free slot of the query
// ring. Before that, finished queries are retired opportunistically once the
// ring is more than half full, and the oldest query is flushed (blocking) if
// the ring is completely full. Only PQG_ZCOMP_ZCOMPLOC and PQG_ZCOMP start a
// query; other groups only trigger the flushing above.
void PerfQuery::EnableQuery(PerfQueryGroup type)
{
	const size_t capacity = m_query_buffer.size();

	// More than half of the slots in use: collect whatever has already completed.
	if (m_query_count > capacity / 2)
		WeakFlush();

	// Every slot in use: free the oldest one.
	if (m_query_count == capacity)
	{
		FlushOne();
		//WARN_LOG(VIDEO, "Flushed query buffer early!");
	}

	const bool is_zcomp_group = (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP);
	if (!is_zcomp_group)
		return;

	// The slot right after the last pending query; the same number is used as the heap index.
	const size_t slot = (m_query_read_pos + m_query_count) % capacity;
	ActiveQuery& pending = m_query_buffer[slot];

	D3D::current_command_list->BeginQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, static_cast<UINT>(slot));

	pending.query_type = type;
	// -1 converts to the largest UINT64; DisableQuery overwrites it with a real fence value.
	pending.fence_value = -1;
	++m_query_count;
}
// Ends the most recently started occlusion query, schedules its result to be
// resolved into the readback buffer, and records which fence value the result
// depends on. Does nothing for groups other than PQG_ZCOMP_ZCOMPLOC / PQG_ZCOMP.
void PerfQuery::DisableQuery(PerfQueryGroup type)
{
	if (type != PQG_ZCOMP_ZCOMPLOC && type != PQG_ZCOMP)
		return;

	// Last slot handed out by EnableQuery. The capacity is added before
	// subtracting one so the expression cannot wrap below zero.
	const size_t capacity = m_query_buffer.size();
	const size_t slot = (m_query_read_pos + m_query_count + capacity - 1) % capacity;
	const UINT heap_index = static_cast<UINT>(slot);

	D3D::current_command_list->EndQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, heap_index);

	// Each slot owns one UINT64 in the readback buffer, at slot * sizeof(UINT64).
	D3D::current_command_list->ResolveQueryData(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, heap_index, 1, m_query_readback_buffer, slot * sizeof(UINT64));

	m_query_buffer[slot].fence_value = m_next_fence_value;
}
void PerfQuery::ResetQuery()
{
//D3D12TODO: Add implementation
m_query_count = 0;
std::fill_n(m_results, ArraySize(m_results), 0);
}
// Returns the accumulated result for the requested counter.
//
// The counters map onto the accumulated groups as follows:
//  - PQ_ZCOMP_INPUT_ZCOMPLOC / PQ_ZCOMP_OUTPUT_ZCOMPLOC -> PQG_ZCOMP_ZCOMPLOC
//  - PQ_ZCOMP_INPUT / PQ_ZCOMP_OUTPUT                   -> PQG_ZCOMP
//  - PQ_BLEND_INPUT                                     -> PQG_ZCOMP + PQG_ZCOMP_ZCOMPLOC
//  - PQ_EFB_COPY_CLOCKS                                 -> PQG_EFB_COPY_CLOCKS
// Any other type yields 0.
//
// A leftover unconditional `return 0;` used to precede this logic, which made
// every counter read as zero; it has been removed.
u32 PerfQuery::GetQueryResult(PerfQueryType type)
{
	u32 result = 0;

	if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC)
		result = m_results[PQG_ZCOMP_ZCOMPLOC];
	else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT)
		result = m_results[PQG_ZCOMP];
	else if (type == PQ_BLEND_INPUT)
		result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC];
	else if (type == PQ_EFB_COPY_CLOCKS)
		result = m_results[PQG_EFB_COPY_CLOCKS];

	// NOTE(review): the division by 4 presumably converts the accumulated count
	// into the unit the emulated hardware reports - confirm against the other backends.
	return result / 4;
}
void PerfQuery::FlushOne()
{
//D3D12TODO: Add implementation
size_t index = m_query_read_pos;
ActiveQuery& entry = m_query_buffer[index];
// Has the command list been executed yet?
if (entry.fence_value == m_next_fence_value)
D3D::command_list_mgr->ExecuteQueuedWork(false);
// Block until the fence is reached
D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, entry.fence_value);
// Copy from readback buffer to local
void* readback_buffer_map;
D3D12_RANGE read_range = { sizeof(UINT64) * index, sizeof(UINT64) * (index + 1) };
CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map));
UINT64 result;
memcpy(&result, reinterpret_cast<u8*>(readback_buffer_map) + sizeof(UINT64) * index, sizeof(UINT64));
D3D12_RANGE empty_range = {};
m_query_readback_buffer->Unmap(0, &empty_range);
// NOTE: Reported pixel metrics should be referenced to native resolution
m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());
m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size();
m_query_count--;
}
// Returns the largest fence value recorded among all pending queries, or 0
// when no query is pending.
UINT64 PerfQuery::FindLastPendingFenceValue() const
{
	UINT64 highest_fence = 0;

	// Walk the ring from the oldest pending query, wrapping at the end of the buffer.
	u32 slot = m_query_read_pos;
	for (u32 remaining = m_query_count; remaining > 0; --remaining)
	{
		highest_fence = std::max(m_query_buffer[slot].fence_value, highest_fence);
		slot = (slot + 1) % m_query_buffer.size();
	}

	return highest_fence;
}
void PerfQuery::FlushResults()
{
//D3D12TODO: Add implementation
if (IsFlushed())
return;
// Find the fence value we have to wait for.
UINT64 last_fence_value = FindLastPendingFenceValue();
if (last_fence_value == m_next_fence_value)
D3D::command_list_mgr->ExecuteQueuedWork(false);
// Wait for all queries to be resolved.
D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, last_fence_value);
// Map the whole readback buffer. Shouldn't have much overhead, and saves taking the wrapped-around cases into consideration.
void* readback_buffer_map;
D3D12_RANGE read_range = { 0, QUERY_READBACK_BUFFER_SIZE };
CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map));
// Read all pending queries.
while (m_query_count > 0)
{
ActiveQuery& entry = m_query_buffer[m_query_read_pos];
UINT64 result;
memcpy(&result, reinterpret_cast<u8*>(readback_buffer_map) + sizeof(UINT64) * m_query_read_pos, sizeof(UINT64));
// NOTE: Reported pixel metrics should be referenced to native resolution
m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());
m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size();
m_query_count--;
}
D3D12_RANGE write_range = {};
m_query_readback_buffer->Unmap(0, &write_range);
}
void PerfQuery::WeakFlush()
{
//D3D12TODO: Add implementation
UINT64 completed_fence = m_tracking_fence->GetCompletedValue();
while (!IsFlushed())
{
ActiveQuery& entry = m_query_buffer[m_query_read_pos];
if (entry.fence_value > completed_fence)
break;
FlushOne();
}
}
// Returns true when no queries are pending.
//
// A leftover unconditional `return true;` used to precede this check, which
// made FlushResults() and WeakFlush() treat the ring as always empty; it has
// been removed.
bool PerfQuery::IsFlushed() const
{
	return m_query_count == 0;
}
void PerfQuery::QueueFenceCallback(void* owning_object, UINT64 fence_value)
{
PerfQuery* owning_perf_query = static_cast<PerfQuery*>(owning_object);
owning_perf_query->QueueFence(fence_value);
}
// Records the fence value that follows the one just reported. DisableQuery
// stamps this value onto queries it ends, and the flush routines compare
// against it to decide whether queued work still has to be executed.
// NOTE(review): presumably fence_value is the value just queued for signalling,
// making fence_value + 1 the value of the next submission - confirm against
// the command list manager.
void PerfQuery::QueueFence(UINT64 fence_value)
{
m_next_fence_value = fence_value + 1;
}
} // namespace

View file

@ -5,6 +5,7 @@
#pragma once
#include <array>
#include <d3d12.h>
#include "VideoCommon/PerfQueryBase.h"
@ -27,20 +28,33 @@ public:
private:
// Bookkeeping for one slot of the pending-query ring.
struct ActiveQuery
{
// Result group that this query's value is accumulated into when it is flushed.
PerfQueryGroup query_type;
// Fence value that is waited on before this query's resolved result is read back.
UINT64 fence_value;
};
void WeakFlush();
// Find the last fence value of all pending queries.
UINT64 FindLastPendingFenceValue() const;
// Only use when non-empty
void FlushOne();
// when testing in SMS: 64 was too small, 128 was ok
static const int s_perf_query_buffer_size = 512;
static void QueueFenceCallback(void* owning_object, UINT64 fence_value);
void QueueFence(UINT64 fence_value);
std::array<ActiveQuery, s_perf_query_buffer_size> m_query_buffer;
// when testing in SMS: 64 was too small, 128 was ok
static constexpr size_t PERF_QUERY_BUFFER_SIZE = 512;
static constexpr size_t QUERY_READBACK_BUFFER_SIZE = PERF_QUERY_BUFFER_SIZE * sizeof(UINT64);
std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer;
int m_query_read_pos = 0;
ID3D12QueryHeap* m_query_heap = nullptr;
ID3D12Resource* m_query_readback_buffer = nullptr;
ID3D12Fence* m_tracking_fence = nullptr;
UINT64 m_next_fence_value = 0;
};
} // namespace

View file

@ -158,7 +158,11 @@ void VertexManager::vFlush(bool use_dst_alpha)
// D3D12TODO: Decide right threshold for drawCountSinceAsyncFlush at runtime depending on
// amount of stall measured in AccessEFB.
if (D3D::command_list_mgr->m_draws_since_last_execution > 100 && D3D::command_list_mgr->m_cpu_access_last_frame)
// We can't do this with perf queries enabled since it can leave queries open.
if (D3D::command_list_mgr->m_cpu_access_last_frame &&
D3D::command_list_mgr->m_draws_since_last_execution > 100 &&
!PerfQueryBase::ShouldEmulate())
{
D3D::command_list_mgr->m_draws_since_last_execution = 0;