OpenGL: refactor all of our StreamBuffers

The old way was to use big switch/case statements based on the type of buffer.
The new way is to use inheritance.

This change prevents us from changing the buffer type at runtime, but I doubt we'll ever want to do so.
Performance should also be a bit better. It's also a nice cleanup.

Added some comments about these different kinds of buffers.
degasus 2014-01-23 00:47:49 +01:00
parent be1fee6d74
commit 128fcdac26
4 changed files with 373 additions and 256 deletions
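For illustration, here is a hedged sketch of the pattern this commit moves to (not code from the commit itself: the simplified Create() and the ExampleStrategy stub are made up, while the Map/Unmap interface matches the diff below). The upload strategy is now chosen once by a factory and baked into the object's type, so there is no per-call switch and consequently no way to change strategies at runtime.

// Minimal sketch of the new pattern (illustrative only, not part of the commit).
#include <cstddef>
#include <utility>

class StreamBuffer
{
public:
	virtual ~StreamBuffer() {}

	// Every upload strategy implements the same two calls; callers never
	// need to know which strategy was picked.
	virtual std::pair<unsigned char*, size_t> Map(size_t size, unsigned stride = 0) = 0;
	virtual void Unmap(size_t used_size) = 0;

	// Chooses a concrete subclass once, based on driver capabilities.
	static StreamBuffer* Create(unsigned buffer_type, size_t size);
};

// Stand-in for one of the real subclasses (MapAndOrphan, BufferStorage, ...).
class ExampleStrategy : public StreamBuffer
{
public:
	std::pair<unsigned char*, size_t> Map(size_t, unsigned) override { return {nullptr, 0}; }
	void Unmap(size_t) override {}
};

StreamBuffer* StreamBuffer::Create(unsigned, size_t)
{
	// The real Create() inspects g_ogl_config and DriverDetails;
	// always returning one strategy keeps this sketch self-contained.
	return new ExampleStrategy();
}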


@@ -193,18 +193,18 @@ void ProgramShaderCache::UploadConstants()
{
if(PixelShaderManager::dirty || VertexShaderManager::dirty)
{
u8* buffer = s_buffer->Map(s_ubo_buffer_size, s_ubo_align);
auto buffer = s_buffer->Map(s_ubo_buffer_size, s_ubo_align);
memcpy(buffer,
memcpy(buffer.first,
&PixelShaderManager::constants, sizeof(PixelShaderConstants));
memcpy(buffer + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align),
memcpy(buffer.first + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align),
&VertexShaderManager::constants, sizeof(VertexShaderConstants));
size_t offset = s_buffer->Unmap(s_ubo_buffer_size);
glBindBufferRange(GL_UNIFORM_BUFFER, 1, s_buffer->getBuffer(), offset,
s_buffer->Unmap(s_ubo_buffer_size);
glBindBufferRange(GL_UNIFORM_BUFFER, 1, s_buffer->m_buffer, buffer.second,
sizeof(PixelShaderConstants));
glBindBufferRange(GL_UNIFORM_BUFFER, 2, s_buffer->getBuffer(), offset + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align),
glBindBufferRange(GL_UNIFORM_BUFFER, 2, s_buffer->m_buffer, buffer.second + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align),
sizeof(VertexShaderConstants));
PixelShaderManager::dirty = false;
@@ -471,7 +471,7 @@ void ProgramShaderCache::Init(void)
// We multiply by *4*4 because we need to get down to basic machine units.
// So multiply by four to get how many floats we have from vec4s
// Then once more to get bytes
s_buffer = new StreamBuffer(GL_UNIFORM_BUFFER, UBO_LENGTH);
s_buffer = StreamBuffer::Create(GL_UNIFORM_BUFFER, UBO_LENGTH);
}
// Read our shader cache, only if supported


@@ -13,233 +13,60 @@
namespace OGL
{
static const u32 SYNC_POINTS = 16;
static const u32 ALIGN_PINNED_MEMORY = 4096;
// moved out of constructor, so m_buffer is allowed to be const
static u32 genBuffer()
{
u32 id;
glGenBuffers(1, &id);
return id;
}
StreamBuffer::StreamBuffer(u32 type, size_t size)
: m_buffertype(type), m_size(size)
{
glGenBuffers(1, &m_buffer);
bool nvidia = !strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation");
if (g_ogl_config.bSupportsGLBufferStorage &&
!(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER))
m_uploadtype = BUFFERSTORAGE;
else if(!g_ogl_config.bSupportsGLBaseVertex && !DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
m_uploadtype = BUFFERSUBDATA;
else if(!g_ogl_config.bSupportsGLBaseVertex)
m_uploadtype = BUFFERDATA;
else if(g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory &&
!(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER))
m_uploadtype = PINNED_MEMORY;
else if(nvidia)
m_uploadtype = BUFFERSUBDATA;
else if(g_ogl_config.bSupportsGLSync)
m_uploadtype = MAP_AND_SYNC;
else
m_uploadtype = MAP_AND_ORPHAN;
Init();
}
StreamBuffer::~StreamBuffer()
{
Shutdown();
glDeleteBuffers(1, &m_buffer);
}
#define SLOT(x) ((x)*SYNC_POINTS/m_size)
u8* StreamBuffer::Map ( size_t size, u32 stride )
{
if(m_iterator && stride) {
m_iterator--;
m_iterator = m_iterator - (m_iterator % stride) + stride;
}
switch(m_uploadtype) {
case MAP_AND_ORPHAN:
if(m_iterator + size >= m_size) {
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
m_iterator = 0;
}
break;
case MAP_AND_SYNC:
case PINNED_MEMORY:
case BUFFERSTORAGE:
// insert waiting slots for used memory
for (size_t i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++)
{
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
m_used_iterator = m_iterator;
// wait for new slots to end of buffer
for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(m_iterator + size) && i < SYNC_POINTS; i++)
{
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(fences[i]);
}
m_free_iterator = m_iterator + size;
// if buffer is full
if (m_iterator + size >= m_size) {
// insert waiting slots in unused space at the end of the buffer
for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++)
{
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
// move to the start
m_used_iterator = m_iterator = 0; // offset 0 is always aligned
// wait for space at the start
for (u32 i = 0; i <= SLOT(m_iterator + size); i++)
{
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(fences[i]);
}
m_free_iterator = m_iterator + size;
}
break;
case BUFFERSUBDATA:
case BUFFERDATA:
m_iterator = 0;
break;
}
// MAP_AND_* methods need to remap this buffer every time
switch(m_uploadtype) {
case MAP_AND_ORPHAN:
case MAP_AND_SYNC:
pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT) - m_iterator;
break;
case PINNED_MEMORY:
case BUFFERSTORAGE:
case BUFFERSUBDATA:
case BUFFERDATA:
break;
}
return pointer + m_iterator;
}
size_t StreamBuffer::Unmap(size_t used_size)
{
size_t ret = m_iterator;
switch(m_uploadtype) {
case MAP_AND_SYNC:
case MAP_AND_ORPHAN:
glFlushMappedBufferRange(m_buffertype, 0, used_size);
glUnmapBuffer(m_buffertype);
break;
case PINNED_MEMORY:
case BUFFERSTORAGE:
case BUFFERSUBDATA:
glBufferSubData(m_buffertype, 0, used_size, pointer);
break;
case BUFFERDATA:
glBufferData(m_buffertype, used_size, pointer, GL_STREAM_DRAW);
break;
}
m_iterator += used_size;
return ret;
}
void StreamBuffer::Init()
: m_buffer(genBuffer()), m_buffertype(type), m_size(size)
{
m_iterator = 0;
m_used_iterator = 0;
m_free_iterator = 0;
switch(m_uploadtype) {
case MAP_AND_SYNC:
fences = new GLsync[SYNC_POINTS];
for(u32 i=0; i<SYNC_POINTS; i++)
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
case MAP_AND_ORPHAN:
case BUFFERSUBDATA:
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
pointer = new u8[m_size];
break;
case PINNED_MEMORY:
glGetError(); // errors before this allocation should be ignored
fences = new GLsync[SYNC_POINTS];
for(u32 i=0; i<SYNC_POINTS; i++)
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY );
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, ROUND_UP(m_size,ALIGN_PINNED_MEMORY), pointer, GL_STREAM_COPY);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
glBindBuffer(m_buffertype, m_buffer);
// on error, switch to another backend. some old catalyst seems to have broken pinned memory support
if(glGetError() != GL_NO_ERROR) {
ERROR_LOG(VIDEO, "Pinned memory detected, but not working. Please report this: %s, %s, %s", g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, g_ogl_config.gl_version);
Shutdown();
m_uploadtype = MAP_AND_SYNC;
Init();
}
break;
case BUFFERSTORAGE:
glGetError(); // errors before this allocation should be ignored
fences = new GLsync[SYNC_POINTS];
for (u32 i = 0; i<SYNC_POINTS; i++)
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glBindBuffer(m_buffertype, m_buffer);
// PERSISTANT_BIT to make sure that the buffer can be used while mapped
// COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
// CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then server side
glBufferStorage(m_buffertype, m_size, NULL,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_CLIENT_STORAGE_BIT);
pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
if(!pointer)
ERROR_LOG(VIDEO, "Buffer allocation failed");
break;
case BUFFERDATA:
glBindBuffer(m_buffertype, m_buffer);
pointer = new u8[m_size];
break;
}
fences = nullptr;
}
void StreamBuffer::Shutdown()
StreamBuffer::~StreamBuffer()
{
switch(m_uploadtype) {
case MAP_AND_SYNC:
DeleteFences();
break;
case MAP_AND_ORPHAN:
break;
case BUFFERSUBDATA:
case BUFFERDATA:
delete [] pointer;
break;
case PINNED_MEMORY:
DeleteFences();
glBindBuffer(m_buffertype, 0);
glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
FreeAlignedMemory(pointer);
break;
case BUFFERSTORAGE:
DeleteFences();
glUnmapBuffer(m_buffertype);
glBindBuffer(m_buffertype, 0);
glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
break;
}
glDeleteBuffers(1, &m_buffer);
}
/* Shared synchronisation code for ring buffers
*
* The next three functions create/delete/use the OpenGL synchronisation.
* ARB_sync (OpenGL 3.2) is used and required.
*
* To reduce overhead, the complete buffer is split up into SYNC_POINTS chunks.
* For each of these chunks, there is a fence which tells us whether the chunk is still in use.
*
* As our API allows allocating more memory than actually gets used, we have to track how much has already been written.
*
* m_iterator - writing position
* m_free_iterator - last position checked to be free
* m_used_iterator - last position known to be written
*
* So on alloc, we have to wait for all slots between m_free_iterator and m_iterator (and set m_free_iterator to m_iterator afterwards).
*
* We also assume that this buffer is accessed by the gpu between the Unmap and Map functions,
* so we may create the fences at the start of mapping.
* So here, new fences are created for the chunks between m_used_iterator and m_iterator (and m_used_iterator is updated).
*
* As ring buffers behave awkwardly on rollover, have fun reading this code ;)
* (A CPU-only sketch of this bookkeeping follows AllocMemory below.)
*/
#define SLOT(x) ((x)*SYNC_POINTS/m_size)
static const u32 SYNC_POINTS = 16;
void StreamBuffer::CreateFences()
{
fences = new GLsync[SYNC_POINTS];
for(u32 i=0; i<SYNC_POINTS; i++)
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
void StreamBuffer::DeleteFences()
{
for (size_t i = SLOT(m_free_iterator) + 1; i < SYNC_POINTS; i++)
@@ -252,5 +79,291 @@ void StreamBuffer::DeleteFences()
}
delete [] fences;
}
void StreamBuffer::AllocMemory(size_t size)
{
// insert waiting slots for used memory
for (size_t i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++)
{
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
m_used_iterator = m_iterator;
// wait for new slots to end of buffer
for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(m_iterator + size) && i < SYNC_POINTS; i++)
{
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(fences[i]);
}
m_free_iterator = m_iterator + size;
// if buffer is full
if (m_iterator + size >= m_size) {
// insert waiting slots in unused space at the end of the buffer
for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++)
{
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
// move to the start
m_used_iterator = m_iterator = 0; // offset 0 is always aligned
// wait for space at the start
for (u32 i = 0; i <= SLOT(m_iterator + size); i++)
{
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(fences[i]);
}
m_free_iterator = m_iterator + size;
}
}
#undef SLOT
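As an aside, here is a hedged, CPU-only sketch of the slot bookkeeping described in the comment above (not part of the commit: the GL fences are replaced by a plain per-chunk in_flight flag, and the buffer and allocation sizes are assumed values, so only the slot arithmetic of AllocMemory is shown):

// Illustrative, CPU-only model of the chunked ring buffer described above.
// Real GL fences are replaced by a per-chunk "in_flight" flag; the buffer
// size and allocation size are assumptions made for this sketch.
#include <cstddef>
#include <cstdio>

static const size_t BUFFER_SIZE = 1024;  // assumed total ring buffer size
static const size_t NUM_SLOTS = 16;      // SYNC_POINTS in the real code
static bool in_flight[NUM_SLOTS] = {};   // stands in for the GLsync fences

static size_t Slot(size_t offset) { return offset * NUM_SLOTS / BUFFER_SIZE; }

// The three iterators from the comment above AllocMemory.
static size_t iterator = 0;       // writing position
static size_t used_iterator = 0;  // last position known to be written
static size_t free_iterator = 0;  // last position checked to be free

static void AllocMemory(size_t size)
{
	// 1) fence every chunk written since the last allocation
	for (size_t i = Slot(used_iterator); i < Slot(iterator); i++)
		in_flight[i] = true;                    // glFenceSync in the real code
	used_iterator = iterator;

	// 2) wait on the chunks the new block extends into
	for (size_t i = Slot(free_iterator) + 1; i <= Slot(iterator + size) && i < NUM_SLOTS; i++)
		in_flight[i] = false;                   // glClientWaitSync + glDeleteSync
	free_iterator = iterator + size;

	// 3) rollover: fence the unused tail, restart at offset 0, wait on the head
	if (iterator + size >= BUFFER_SIZE)
	{
		for (size_t i = Slot(used_iterator); i < NUM_SLOTS; i++)
			in_flight[i] = true;
		used_iterator = iterator = 0;
		for (size_t i = 0; i <= Slot(iterator + size); i++)
			in_flight[i] = false;
		free_iterator = iterator + size;
	}
}

int main()
{
	for (int n = 0; n < 5; n++)
	{
		AllocMemory(300);                       // what Map() does before returning
		std::printf("write at offset %zu\n", iterator);
		iterator += 300;                        // what Unmap() does afterwards
	}
	return 0;                                   // prints 0, 300, 600, 0, 300
}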
void StreamBuffer::Align(u32 stride)
{
if(m_iterator && stride) {
m_iterator--;
m_iterator = m_iterator - (m_iterator % stride) + stride;
}
}
/* The usual way to stream data to the gpu.
* Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping
* Just do unsynchronized appends until the buffer is full.
* When it's full, orphan it (allocate a new buffer and let the driver free the old one).
*
* As the reallocation adds overhead, this method isn't as fast as it's commonly claimed to be.
*/
class MapAndOrphan : public StreamBuffer
{
public:
MapAndOrphan(u32 type, size_t size) : StreamBuffer(type, size) {
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
}
~MapAndOrphan() {
}
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
Align(stride);
if(m_iterator + size >= m_size) {
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
m_iterator = 0;
}
u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
return std::make_pair(pointer, m_iterator);
}
void Unmap(size_t used_size) {
glFlushMappedBufferRange(m_buffertype, 0, used_size);
glUnmapBuffer(m_buffertype);
m_iterator += used_size;
}
};
/* A modified streaming approach without reallocation.
* This one fixes the reallocation overhead of the MapAndOrphan method,
* so it allocates a ring buffer once on initialization.
* But with this limited resource, we have to care about the cpu-gpu distance,
* else this fifo may overflow.
* So we have traded orphaning for syncing.
*/
class MapAndSync : public StreamBuffer
{
public:
MapAndSync(u32 type, size_t size) : StreamBuffer(type, size) {
CreateFences();
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
}
~MapAndSync() {
DeleteFences();
}
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
Align(stride);
AllocMemory(size);
u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
return std::make_pair(pointer, m_iterator);
}
void Unmap(size_t used_size) {
glFlushMappedBufferRange(m_buffertype, 0, used_size);
glUnmapBuffer(m_buffertype);
m_iterator += used_size;
}
};
/* Streaming fifo without mapping overhead.
* This one usually requires ARB_buffer_storage (OpenGL 4.4)
* and is usually not available on OpenGL 3 gpus.
*
* ARB_buffer_storage allows us to render from a mapped buffer,
* so we map it persistently in the initialization.
*
* Unsynchronized mapping sounds like an easy task, but it isn't for threaded drivers,
* so every mapping on current closed-source drivers _will_ cost
* at least a round trip between two threads.
*
* As a persistently mapped buffer can't use orphaning, we also have to sync.
*/
class BufferStorage : public StreamBuffer
{
public:
BufferStorage(u32 type, size_t size) : StreamBuffer(type, size) {
CreateFences();
glBindBuffer(m_buffertype, m_buffer);
// PERSISTENT_BIT to make sure that the buffer can be used while mapped
// COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
// CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side than on the server side
glBufferStorage(m_buffertype, m_size, NULL,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_CLIENT_STORAGE_BIT);
m_pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size,
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
}
~BufferStorage() {
DeleteFences();
glUnmapBuffer(m_buffertype);
glBindBuffer(m_buffertype, 0);
}
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
Align(stride);
AllocMemory(size);
return std::make_pair(m_pointer + m_iterator, m_iterator);
}
void Unmap(size_t used_size) {
m_iterator += used_size;
}
u8* m_pointer;
};
/* --- AMD only ---
* Another streaming fifo without mapping overhead.
* As we can't orphan without mapping, we have to sync.
*
* This one uses AMD_pinned_memory which is available on all AMD gpus.
* OpenGL 4.4 drivers should use BufferStorage.
*/
class PinnedMemory : public StreamBuffer
{
public:
PinnedMemory(u32 type, size_t size) : StreamBuffer(type, size) {
CreateFences();
m_pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY );
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, ROUND_UP(m_size,ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
glBindBuffer(m_buffertype, m_buffer);
}
~PinnedMemory() {
DeleteFences();
glBindBuffer(m_buffertype, 0);
glFinish(); // the OpenGL pipeline must be flushed, else this buffer may still be in use
FreeAlignedMemory(m_pointer);
}
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
Align(stride);
AllocMemory(size);
return std::make_pair(m_pointer + m_iterator, m_iterator);
}
void Unmap(size_t used_size) {
m_iterator += used_size;
}
u8* m_pointer;
static const u32 ALIGN_PINNED_MEMORY = 4096;
};
/* Fifo based on the glBufferSubData call.
* As everything must be copied before glBufferSubData returns,
* an additional memcpy is done within the driver.
* So this has a lot of overhead; only use it if required.
*/
class BufferSubData : public StreamBuffer
{
public:
BufferSubData(u32 type, size_t size) : StreamBuffer(type, size) {
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, size, 0, GL_STATIC_DRAW);
m_pointer = new u8[m_size];
}
~BufferSubData() {
delete [] m_pointer;
}
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
return std::make_pair(m_pointer, 0);
}
void Unmap(size_t used_size) {
glBufferSubData(m_buffertype, 0, used_size, m_pointer);
}
u8* m_pointer;
};
/* Fifo based on the glBufferData call.
* Some trashy drivers stall in BufferSubData,
* so here we use glBufferData, which reallocates the buffer every time.
* This may avoid the stalls, but it has more overhead than BufferSubData.
*/
class BufferData : public StreamBuffer
{
public:
BufferData(u32 type, size_t size) : StreamBuffer(type, size) {
glBindBuffer(m_buffertype, m_buffer);
m_pointer = new u8[m_size];
}
~BufferData() {
delete [] m_pointer;
}
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
return std::make_pair(m_pointer, 0);
}
void Unmap(size_t used_size) {
glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW);
}
u8* m_pointer;
};
// choose the best streaming method based on the supported extensions and known issues
StreamBuffer* StreamBuffer::Create(u32 type, size_t size)
{
bool nvidia = !strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation");
if (g_ogl_config.bSupportsGLBufferStorage &&
!(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER))
return new BufferStorage(type, size);
else if(!g_ogl_config.bSupportsGLBaseVertex && !DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
return new BufferSubData(type, size);
else if(!g_ogl_config.bSupportsGLBaseVertex)
return new BufferData(type, size);
else if(g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory &&
!(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER))
return new PinnedMemory(type, size);
else if(nvidia)
return new BufferSubData(type, size);
else if(g_ogl_config.bSupportsGLSync)
return new MapAndSync(type, size);
else
return new MapAndOrphan(type, size);
}
}


@@ -5,6 +5,7 @@
#ifndef STREAMBUFFER_H
#define STREAMBUFFER_H
#include <utility>
#include "VideoCommon.h"
#include "FramebufferManager.h"
#include "GLUtil.h"
@@ -17,39 +18,41 @@
namespace OGL
{
enum StreamType {
MAP_AND_ORPHAN = (1 << 1),
MAP_AND_SYNC = (1 << 2),
PINNED_MEMORY = (1 << 3),
BUFFERSUBDATA = (1 << 4),
BUFFERDATA = (1 << 5),
BUFFERSTORAGE = (1 << 6),
};
class StreamBuffer {
public:
static StreamBuffer* Create(u32 type, size_t size);
virtual ~StreamBuffer();
/* This mapping function will return a pair of:
* - the pointer to the mapped buffer
* - the offset into the real gpu buffer (always a multiple of stride)
* On mapping, the maximum size of the allocation has to be passed.
* The size actually pushed into this fifo only has to be known on Unmap.
* Mapping invalidates the current buffer content,
* so the old content must not be accessed any more.
* (See the usage sketch after this class declaration.)
*/
virtual std::pair<u8*, size_t> Map(size_t size, u32 stride = 0) = 0;
virtual void Unmap(size_t used_size) = 0;
const u32 m_buffer;
protected:
StreamBuffer(u32 type, size_t size);
~StreamBuffer();
u8* Map(size_t size, u32 stride = 0);
size_t Unmap(size_t used_size); // returns the offset of the beginning of the uploaded block
inline u32 getBuffer() { return m_buffer; }
private:
void Init();
void Shutdown();
void CreateFences();
void DeleteFences();
void AllocMemory(size_t size);
void Align(u32 stride);
StreamType m_uploadtype;
u32 m_buffer;
u32 m_buffertype;
size_t m_size;
u8 *pointer;
const u32 m_buffertype;
const size_t m_size;
size_t m_iterator;
size_t m_used_iterator;
size_t m_free_iterator;
private:
GLsync *fences;
};
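A hedged usage sketch of the Map/Unmap contract documented above, mirroring what ProgramShaderCache::UploadConstants does in the first hunk of this commit (not part of the commit: the function name, block layout and binding points are illustrative, and it assumes the GL and common-type headers this file already pulls in):

// Illustrative only: shows the Map()/Unmap() contract of the class above.
#include <cstring>  // memcpy; GL calls and u32 come from the headers included above

static void UploadTwoUniformBlocks(OGL::StreamBuffer* buf, const void* block0,
                                   const void* block1, size_t block_size, u32 align)
{
	// The second block starts at the next aligned offset (like ROUND_UP in the diff).
	size_t aligned = (block_size + align - 1) / align * align;

	// Reserve the maximum we may need; the returned offset is already stride-aligned.
	auto mapping = buf->Map(aligned + block_size, align);
	memcpy(mapping.first, block0, block_size);
	memcpy(mapping.first + aligned, block1, block_size);
	buf->Unmap(aligned + block_size);  // only now is the size actually written committed

	// mapping.second is the offset into the real gpu buffer (buf->m_buffer).
	glBindBufferRange(GL_UNIFORM_BUFFER, 1, buf->m_buffer, mapping.second, block_size);
	glBindBufferRange(GL_UNIFORM_BUFFER, 2, buf->m_buffer, mapping.second + aligned, block_size);
}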


@@ -58,11 +58,11 @@ VertexManager::~VertexManager()
void VertexManager::CreateDeviceObjects()
{
s_vertexBuffer = new StreamBuffer(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE);
m_vertex_buffers = s_vertexBuffer->getBuffer();
s_vertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE);
m_vertex_buffers = s_vertexBuffer->m_buffer;
s_indexBuffer = new StreamBuffer(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE);
m_index_buffers = s_indexBuffer->getBuffer();
s_indexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE);
m_index_buffers = s_indexBuffer->m_buffer;
m_CurrentVertexFmt = NULL;
m_last_vao = 0;
@@ -85,14 +85,15 @@ void VertexManager::PrepareDrawBuffers(u32 stride)
u32 vertex_data_size = IndexGenerator::GetNumVerts() * stride;
u32 index_data_size = IndexGenerator::GetIndexLen() * sizeof(u16);
u8* buffer = s_vertexBuffer->Map(vertex_data_size, stride);
memcpy(buffer, GetVertexBuffer(), vertex_data_size);
size_t offset = s_vertexBuffer->Unmap(vertex_data_size);
s_baseVertex = offset / stride;
auto buffer = s_vertexBuffer->Map(vertex_data_size, stride);
memcpy(buffer.first, GetVertexBuffer(), vertex_data_size);
s_vertexBuffer->Unmap(vertex_data_size);
s_baseVertex = buffer.second / stride;
buffer = s_indexBuffer->Map(index_data_size);
memcpy(buffer, GetIndexBuffer(), index_data_size);
s_index_offset = s_indexBuffer->Unmap(index_data_size);
memcpy(buffer.first, GetIndexBuffer(), index_data_size);
s_indexBuffer->Unmap(index_data_size);
s_index_offset = buffer.second;
ADDSTAT(stats.thisFrame.bytesVertexStreamed, vertex_data_size);
ADDSTAT(stats.thisFrame.bytesIndexStreamed, index_data_size);