dolphin/Source/Core/VideoBackends/OGL/OGLBoundingBox.cpp

// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include <algorithm>
#include <array>
#include <cstring>

#include "Common/GL/GLUtil.h"

#include "VideoBackends/OGL/OGLBoundingBox.h"
#include "VideoBackends/OGL/OGLRender.h"

#include "VideoCommon/DriverDetails.h"
#include "VideoCommon/VideoConfig.h"

// Number of s32 values held in the bounding box buffer. It sizes the cached
// arrays below as well as the GPU buffer allocated in BoundingBox::Init().
// NOTE(review): presumably these are the four edges of the bounding box -
// confirm the index-to-edge mapping against the callers of Set()/Get().
enum : u32
{
  NUM_BBOX_VALUES = 4,
};

// GL buffer object name generated in BoundingBox::Init() and deleted in
// BoundingBox::Shutdown(). It is bound to GL_SHADER_STORAGE_BUFFER binding 0.
static GLuint s_bbox_buffer_id;
// CPU-side copy of the buffer contents. Written by Set() and refreshed from
// the GPU buffer by Readback().
static std::array<s32, NUM_BBOX_VALUES> s_bbox_values;
// Per-value flag: true when Set() stored a value that Flush() has not yet
// uploaded to the GPU buffer.
static std::array<bool, NUM_BBOX_VALUES> s_bbox_dirty;
// True while s_bbox_values can be returned by Get() without a Readback().
// Set by Init() and Readback(), cleared by Flush().
static bool s_bbox_valid = false;

namespace OGL
{
void BoundingBox::Init()
{
  if (!g_ActiveConfig.backend_info.bSupportsBBox)
    return;

  const s32 initial_values[NUM_BBOX_VALUES] = {0, 0, 0, 0};
  std::memcpy(s_bbox_values.data(), initial_values, sizeof(s_bbox_values));
  s_bbox_dirty = {};
  s_bbox_valid = true;

  glGenBuffers(1, &s_bbox_buffer_id);
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);
  glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(initial_values), initial_values, GL_DYNAMIC_DRAW);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, s_bbox_buffer_id);
}

// Releases the GL buffer created by Init().
//
// Mirrors Init(): when the active configuration reports no bounding box
// support, no buffer was created and nothing is deleted. The cached values and
// flags are left untouched.
void BoundingBox::Shutdown()
{
  const bool bbox_supported = g_ActiveConfig.backend_info.bSupportsBBox;
  if (bbox_supported)
    glDeleteBuffers(1, &s_bbox_buffer_id);
}

void BoundingBox::Flush()
{
  s_bbox_valid = false;

  if (std::none_of(s_bbox_dirty.begin(), s_bbox_dirty.end(), [](bool dirty) { return dirty; }))
    return;

  glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);

  for (u32 start = 0; start < NUM_BBOX_VALUES;)
  {
    if (!s_bbox_dirty[start])
    {
      start++;
      continue;
    }

    u32 end = start + 1;
    s_bbox_dirty[start] = false;
    for (; end < NUM_BBOX_VALUES; end++)
    {
      if (!s_bbox_dirty[end])
        break;

      s_bbox_dirty[end] = false;
    }

    glBufferSubData(GL_SHADER_STORAGE_BUFFER, start * sizeof(s32), (end - start) * sizeof(s32),
                    &s_bbox_values[start]);
  }

  glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}

void BoundingBox::Readback()
{
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);
  if (!DriverDetails::HasBug(DriverDetails::BUG_SLOW_GETBUFFERSUBDATA) &&
      !static_cast<Renderer*>(g_renderer.get())->IsGLES())
  {
    // Using glMapBufferRange to read back the contents of the SSBO is extremely slow
    // on nVidia drivers. This is more noticeable at higher internal resolutions.
    // Using glGetBufferSubData instead does not seem to exhibit this slowdown.
    std::array<s32, NUM_BBOX_VALUES> gpu_values;
    glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(s32) * NUM_BBOX_VALUES,
                       gpu_values.data());
    for (u32 i = 0; i < NUM_BBOX_VALUES; i++)
    {
      if (!s_bbox_dirty[i])
        s_bbox_values[i] = gpu_values[i];
    }
  }
  else
  {
    // Using glMapBufferRange is faster on AMD cards by a measurable margin.
    void* ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, sizeof(s32) * NUM_BBOX_VALUES,
                                 GL_MAP_READ_BIT);
    if (ptr)
    {
      for (u32 i = 0; i < NUM_BBOX_VALUES; i++)
      {
        if (!s_bbox_dirty[i])
        {
          std::memcpy(&s_bbox_values[i], reinterpret_cast<const u8*>(ptr) + sizeof(s32) * i,
                      sizeof(s32));
        }
      }

      glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
    }
  }
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
  s_bbox_valid = true;
}

// Stores a new value in the cache and flags it for upload by the next Flush().
//
// index: position of the value; must be within [0, NUM_BBOX_VALUES), it is not
//        range-checked here.
// value: the value to store.
//
// The write is skipped only when the cache is valid and already holds the same
// value. With an invalid cache the stored value cannot be trusted, so the value
// is always written and marked dirty.
void BoundingBox::Set(int index, int value)
{
  const bool already_current = s_bbox_valid && s_bbox_values[index] == value;
  if (!already_current)
  {
    s_bbox_values[index] = value;
    s_bbox_dirty[index] = true;
  }
}

// Returns the value at the given position, reading the buffer back from the
// GPU first when the cache is not valid.
//
// index: position of the value; must be within [0, NUM_BBOX_VALUES), it is not
//        range-checked here.
int BoundingBox::Get(int index)
{
  if (s_bbox_valid)
    return s_bbox_values[index];

  // Cache is stale: refresh it, which also marks it valid again.
  Readback();
  return s_bbox_values[index];
}
};  // namespace OGL
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`// Copyright 2014 Dolphin Emulator Project`
			`// Licensed under GPLv2+`
			`// Refer to the license.txt file included.`

VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`#include <algorithm>`
			`#include <array>`
Fix building with PCH disabled. 2016-05-26 19:42:07 +02:00			`#include <cstring>`

Move GL interface code out of the OpenGL video backend. 2015-09-18 18:40:00 +02:00			`#include "Common/GL/GLUtil.h"`

normalize common filenames in VideoBackends/OGL 2020-09-15 14:00:24 +02:00			`#include "VideoBackends/OGL/OGLBoundingBox.h"`
			`#include "VideoBackends/OGL/OGLRender.h"`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00
OGL: Work around slowdown of glMapBufferRange with SSBO on NVIDIA drivers Using glMapBufferRange to read back the contents of the SSBO is extremely slow on NVIDIA drivers. This is more noticeable at higher internal resolutions. Using glGetBufferSubData instead does not seem to exhibit this slowdown. 2016-05-11 14:19:59 +02:00			`#include "VideoCommon/DriverDetails.h"`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`#include "VideoCommon/VideoConfig.h"`

VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`enum : u32`
			`{`
			`NUM_BBOX_VALUES = 4,`
			`};`

OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`static GLuint s_bbox_buffer_id;`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`static std::array<s32, NUM_BBOX_VALUES> s_bbox_values;`
			`static std::array<bool, NUM_BBOX_VALUES> s_bbox_dirty;`
			`static bool s_bbox_valid = false;`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00
			`namespace OGL`
			`{`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`void BoundingBox::Init()`
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower. 2017-03-06 00:34:30 +01:00			`{`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`if (!g_ActiveConfig.backend_info.bSupportsBBox)`
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower. 2017-03-06 00:34:30 +01:00			`return;`

VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`const s32 initial_values[NUM_BBOX_VALUES] = {0, 0, 0, 0};`
			`std::memcpy(s_bbox_values.data(), initial_values, sizeof(s_bbox_values));`
			`s_bbox_dirty = {};`
			`s_bbox_valid = true;`

Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glGenBuffers(1, &s_bbox_buffer_id);`
			`glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(initial_values), initial_values, GL_DYNAMIC_DRAW);`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, s_bbox_buffer_id);`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`}`

			`void BoundingBox::Shutdown()`
			`{`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`if (!g_ActiveConfig.backend_info.bSupportsBBox)`
			`return;`

			`glDeleteBuffers(1, &s_bbox_buffer_id);`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`}`

VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`void BoundingBox::Flush()`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`{`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`s_bbox_valid = false;`

			`if (std::none_of(s_bbox_dirty.begin(), s_bbox_dirty.end(), [](bool dirty) { return dirty; }))`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`return;`
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower. 2017-03-06 00:34:30 +01:00
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00
			`for (u32 start = 0; start < NUM_BBOX_VALUES;)`
			`{`
			`if (!s_bbox_dirty[start])`
			`{`
			`start++;`
			`continue;`
			`}`

			`u32 end = start + 1;`
			`s_bbox_dirty[start] = false;`
			`for (; end < NUM_BBOX_VALUES; end++)`
			`{`
			`if (!s_bbox_dirty[end])`
			`break;`

			`s_bbox_dirty[end] = false;`
			`}`

			`glBufferSubData(GL_SHADER_STORAGE_BUFFER, start * sizeof(s32), (end - start) * sizeof(s32),`
			`&s_bbox_values[start]);`
			`}`

Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`}`

VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`void BoundingBox::Readback()`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`{`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);`
			`if (!DriverDetails::HasBug(DriverDetails::BUG_SLOW_GETBUFFERSUBDATA) &&`
			`!static_cast<Renderer*>(g_renderer.get())->IsGLES())`
OGL: Work around slowdown of glMapBufferRange with SSBO on NVIDIA drivers Using glMapBufferRange to read back the contents of the SSBO is extremely slow on NVIDIA drivers. This is more noticeable at higher internal resolutions. Using glGetBufferSubData instead does not seem to exhibit this slowdown. 2016-05-11 14:19:59 +02:00			`{`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`// Using glMapBufferRange to read back the contents of the SSBO is extremely slow`
			`// on nVidia drivers. This is more noticeable at higher internal resolutions.`
			`// Using glGetBufferSubData instead does not seem to exhibit this slowdown.`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`std::array<s32, NUM_BBOX_VALUES> gpu_values;`
			`glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(s32) * NUM_BBOX_VALUES,`
			`gpu_values.data());`
			`for (u32 i = 0; i < NUM_BBOX_VALUES; i++)`
			`{`
			`if (!s_bbox_dirty[i])`
			`s_bbox_values[i] = gpu_values[i];`
			`}`
OGL: Work around slowdown of glMapBufferRange with SSBO on NVIDIA drivers Using glMapBufferRange to read back the contents of the SSBO is extremely slow on NVIDIA drivers. This is more noticeable at higher internal resolutions. Using glGetBufferSubData instead does not seem to exhibit this slowdown. 2016-05-11 14:19:59 +02:00			`}`
			`else`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`{`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`// Using glMapBufferRange is faster on AMD cards by a measurable margin.`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`void* ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, sizeof(s32) * NUM_BBOX_VALUES,`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`GL_MAP_READ_BIT);`
			`if (ptr)`
OGL: Work around slowdown of glMapBufferRange with SSBO on NVIDIA drivers Using glMapBufferRange to read back the contents of the SSBO is extremely slow on NVIDIA drivers. This is more noticeable at higher internal resolutions. Using glGetBufferSubData instead does not seem to exhibit this slowdown. 2016-05-11 14:19:59 +02:00			`{`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`for (u32 i = 0; i < NUM_BBOX_VALUES; i++)`
			`{`
			`if (!s_bbox_dirty[i])`
			`{`
			`std::memcpy(&s_bbox_values[i], reinterpret_cast<const u8*>(ptr) + sizeof(s32) * i,`
			`sizeof(s32));`
			`}`
			`}`

Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);`
OGL: Work around slowdown of glMapBufferRange with SSBO on NVIDIA drivers Using glMapBufferRange to read back the contents of the SSBO is extremely slow on NVIDIA drivers. This is more noticeable at higher internal resolutions. Using glGetBufferSubData instead does not seem to exhibit this slowdown. 2016-05-11 14:19:59 +02:00			`}`
OGL: implement bounding box support with ssbo This implemention tries to be as accurate as the old SW implemention, but it will remove the dependcy of our vertexloader on videosw. 2014-11-13 23:26:49 +01:00			`}`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);`
VideoBackends/OGL: Cache bounding box values between reads 2021-04-17 19:14:52 +02:00			`s_bbox_valid = true;`
			`}`

			`void BoundingBox::Set(int index, int value)`
			`{`
			`if (s_bbox_valid && s_bbox_values[index] == value)`
			`return;`

			`s_bbox_values[index] = value;`
			`s_bbox_dirty[index] = true;`
			`}`

			`int BoundingBox::Get(int index)`
			`{`
			`if (!s_bbox_valid)`
			`Readback();`

			`return s_bbox_values[index];`
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower. 2017-03-06 00:34:30 +01:00			`}`
Move most backend functionality to VideoCommon 2019-02-15 02:59:50 +01:00			`}; // namespace OGL`