Merge pull request #11703 from degasus/asciiart

Shaders: Add error message for asciiart shader if shader_subgroup is missing.
This commit is contained in:
Mai 2023-03-29 23:34:30 -04:00 committed by GitHub
commit 806ea59d77
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -14,10 +14,10 @@ DefaultValue = false
[/configuration] [/configuration]
*/ */
// Number of font characters that are tested. At most 96, and it has to be a multiple of 32.
const uint MAX_CHARS = 96u;

// Selects CalcCharFallback() in main() when the subgroup path is not taken. Kept off because
// the fallback is terribly slow and can easily softlock the GPU.
const bool HAVE_FULL_FEATURE_FALLBACK = false;

// Characters evaluated per outer loop iteration in CalcCharFallback().
const uint UNROLL_FALLBACK = 4;

// Characters evaluated per lane and outer loop iteration in CalcCharSIMD().
// At most MAX_CHARS / 32.
const uint UNROLL_SIMD = 3;
// #undef SUPPORTS_SUBGROUP_REDUCTION // #undef SUPPORTS_SUBGROUP_REDUCTION
@ -95,34 +95,37 @@ const uint rasters[char_count][(char_pixels + 31) / 32] = {
// Precalculated sum of all pixels per character, indexed by character.
// Eight entries per row; the trailing comment gives the index of the first entry in the row.
const uint raster_active_pixels[char_count] = {
    96, 18, 16, 40, 56, 42, 46, 10,  //  0
    22, 22, 32, 28, 10, 16, 6,  24,  //  8
    52, 29, 36, 44, 35, 42, 50, 28,  // 16
    58, 51, 12, 16, 22, 32, 22, 26,  // 24
    41, 46, 57, 38, 52, 38, 32, 46,  // 32
    48, 30, 31, 43, 28, 56, 64, 52,  // 40
    42, 52, 52, 44, 28, 48, 42, 58,  // 48
    42, 32, 38, 26, 24, 26, 14, 8,   // 56
    10, 34, 40, 26, 40, 32, 30, 33,  // 64
    39, 16, 20, 37, 28, 43, 30, 30,  // 72
    34, 34, 20, 28, 27, 30, 26, 36,  // 80
    26, 24, 26, 30, 24, 30, 14, 0};  // 88
// Read a single font bit.
//   pos.x: pixel index inside the character
//   pos.y: character index
// Returns 1.0 when the bit is set and 0.0 otherwise.
float SampleFont(uint2 pos)
{
  // Each uint of rasters[] packs 32 pixels: pick the word, then the bit inside it.
  uint packed_pixels = rasters[pos.y][pos.x / 32];
  uint bit = (packed_pixels >> (pos.x % 32)) & uint(1);
  return bit;
}
// Fetch one framebuffer sample.
//   char_pos: character position in screen space
//   pixel:    pixel index inside that character, counted row by row
// Returns the xyz components of the sampled color.
float3 SampleTex(uint2 char_pos, uint pixel)
{
  float2 inv_resolution;
  if (OptionEnabled(USE_WINDOW_RES))
    inv_resolution = GetInvWindowResolution();
  else
    inv_resolution = GetInvResolution();

  // Column and row of the pixel inside the character cell.
  float2 offset_in_char = float2(pixel % char_width, pixel / char_width);

  // The extra 0.5 is added to the pixel position before it is scaled by the inverse resolution.
  float2 tex_pos = char_pos * char_dim + offset_in_char + 0.5;
  return SampleLocation(tex_pos * inv_resolution).xyz;
}
// Result of fitting one font character to one character cell.
struct CharResults
{
  // Color used where the font bit is set.
  float3 fg;
  // Color used where the font bit is clear.
  float3 bg;
  // MSE of this configuration; candidates with a smaller value are preferred.
  float err;
  // Index of the character in the font table.
  uint c;
};
// Calculate the font and background color and the MSE for a given character // Calculate the font and background color and the MSE for a given character
CharResults CalcCharRes(uint c, float3 t, float3 ft) { CharResults CalcCharRes(uint c, float3 t, float3 ft)
{
CharResults o; CharResults o;
o.c = c; o.c = c;
@ -140,7 +143,8 @@ CharResults CalcCharRes(uint c, float3 t, float3 ft) {
// The calculation isn't stable if the font is all-one. Return max err // The calculation isn't stable if the font is all-one. Return max err
// instead. // instead.
if (f == char_pixels) { if (f == char_pixels)
{
o.err = char_pixels * char_pixels; o.err = char_pixels * char_pixels;
return o; return o;
} }
@ -184,11 +188,10 @@ CharResults CalcCharRes(uint c, float3 t, float3 ft) {
// solution. // solution.
float3 a = (ft * (f - float(char_pixels)) + t * (f - ff)) / (f * f - ff * float(char_pixels)); float3 a = (ft * (f - float(char_pixels)) + t * (f - ff)) / (f * f - ff * float(char_pixels));
float3 b = (ft * f - t * ff) / (f * f - ff * float(char_pixels)); float3 b = (ft * f - t * ff) / (f * f - ff * float(char_pixels));
float3 e = a * a * ff + 2.0 * a * b * (f - ff) - 2.0 * a * ft + float3 e = a * a * ff + 2.0 * a * b * (f - ff) - 2.0 * a * ft +
b * b * (-2.0 * f + ff + float(char_pixels)) + 2.0 * b * ft - b * b * (-2.0 * f + ff + float(char_pixels)) + 2.0 * b * ft - 2.0 * b * t + tt;
2.0 * b * t + tt;
o.err = dot(e, float3(1.0, 1.0, 1.0)); o.err = dot(e, float3(1.0, 1.0, 1.0));
o.fg = a; o.fg = a;
@ -199,12 +202,13 @@ CharResults CalcCharRes(uint c, float3 t, float3 ft) {
} }
// Get the color of the pixel of this invocation based on the character details.
//
// char_out supplies the character index (c) and the font/background colors (fg, bg);
// its err field is not read here. The result is fg where the font bit under this
// pixel is set and bg where it is clear.
float3 GetFinalPixel(CharResults char_out)
{
  float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();

  // Character cell that contains this pixel, and the pixel's offset inside that cell.
  uint2 char_pos = uint2(floor(GetCoordinates() * resolution / char_dim));
  uint2 pixel_offset = uint2(floor(GetCoordinates() * resolution) - char_pos * char_dim);

  // SampleFont() takes (pixel index, character index) as uint2. Build the argument as uint2,
  // like the call in CalcCharTrivial(), instead of constructing an int2 and relying on an
  // implicit signed -> unsigned conversion at the call site.
  float font = SampleFont(uint2(pixel_offset.x + char_width * pixel_offset.y, char_out.c));
  return char_out.fg * font + char_out.bg * (1.0 - font);
}
/* /*
@ -218,18 +222,22 @@ float3 GetFinalPixel(CharResults char_out) {
Terrible in performance, only for reference. Terrible in performance, only for reference.
*/ */
CharResults CalcCharTrivial(uint2 char_pos) { CharResults CalcCharTrivial(uint2 char_pos)
{
float3 t; float3 t;
CharResults char_out; CharResults char_out;
char_out.err = char_pixels * char_pixels; char_out.err = char_pixels * char_pixels;
for (uint c = 0; c < MAX_CHARS; c += 1) { for (uint c = 0; c < MAX_CHARS; c += 1)
{
float3 ft = float3(0.0, 0.0, 0.0); float3 ft = float3(0.0, 0.0, 0.0);
for (uint pixel = 0; pixel < char_pixels; pixel += 1) { for (uint pixel = 0; pixel < char_pixels; pixel += 1)
{
float3 tex = SampleTex(char_pos, pixel); float3 tex = SampleTex(char_pos, pixel);
float font = SampleFont(uint2(pixel, c)); float font = SampleFont(uint2(pixel, c));
ft += font * tex; ft += font * tex;
} }
if (c == 0) t = ft; if (c == 0)
t = ft;
CharResults res = CalcCharRes(c, t, ft); CharResults res = CalcCharRes(c, t, ft);
if (res.err < char_out.err) if (res.err < char_out.err)
char_out = res; char_out = res;
@ -238,43 +246,52 @@ CharResults CalcCharTrivial(uint2 char_pos) {
} }
/* /*
However for better performance, some characters are tested at once. This saves some expensive texture() calls. However for better performance, some characters are tested at once. This saves some expensive
Also split the loop over the pixels in groups of 32 for only fetching the uint32 of the font once. texture() calls. Also split the loop over the pixels in groups of 32 for only fetching the uint32
of the font once.
*/ */
CharResults CalcCharFallback(uint2 char_pos) { CharResults CalcCharFallback(uint2 char_pos)
{
float3 t; float3 t;
CharResults char_out; CharResults char_out;
char_out.err = char_pixels * char_pixels; char_out.err = char_pixels * char_pixels;
for (uint c = 0; c < MAX_CHARS; c += UNROLL_FALLBACK) { for (uint c = 0; c < MAX_CHARS; c += UNROLL_FALLBACK)
{
// Declare ft // Declare ft
float3 ft[UNROLL_FALLBACK]; float3 ft[UNROLL_FALLBACK];
for (uint i = 0; i < UNROLL_FALLBACK; i++) for (uint i = 0; i < UNROLL_FALLBACK; i++)
ft[i] = float3(0.0, 0.0, 0.0); ft[i] = float3(0.0, 0.0, 0.0);
// Split `for p : pixels` in groups of 32. This makes accessing the texture (bit in uint32) easier. // Split `for p : pixels` in groups of 32. This makes accessing the texture (bit in uint32)
for (uint pixel = 0; pixel < char_pixels; pixel += 32) { // easier.
for (uint pixel = 0; pixel < char_pixels; pixel += 32)
{
uint font_i[UNROLL_FALLBACK]; uint font_i[UNROLL_FALLBACK];
for (uint i = 0; i < UNROLL_FALLBACK; i++) for (uint i = 0; i < UNROLL_FALLBACK; i++)
font_i[i] = rasters[c + i][pixel / 32]; font_i[i] = rasters[c + i][pixel / 32];
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += 1) { for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += 1)
{
float3 tex = SampleTex(char_pos, pixel + pixel_offset); float3 tex = SampleTex(char_pos, pixel + pixel_offset);
// Inner kernel of `ft += font * tex`. Most time is spent in here. // Inner kernel of `ft += font * tex`. Most time is spent in here.
for (uint i = 0; i < UNROLL_FALLBACK; i++) { for (uint i = 0; i < UNROLL_FALLBACK; i++)
{
float font = (font_i[i] >> pixel_offset) & uint(1); float font = (font_i[i] >> pixel_offset) & uint(1);
ft[i] += font * tex; ft[i] += font * tex;
} }
} }
} }
if (c == 0) { if (c == 0)
{
// First char has font := 1, so t = ft. Cache this value for the next iterations. // First char has font := 1, so t = ft. Cache this value for the next iterations.
t = ft[0]; t = ft[0];
} }
// Check if this character fits better than the last one. // Check if this character fits better than the last one.
for (uint i = 0; i < UNROLL_FALLBACK; i++) { for (uint i = 0; i < UNROLL_FALLBACK; i++)
{
CharResults res = CalcCharRes(c + i, t, ft[i]); CharResults res = CalcCharRes(c + i, t, ft[i]);
if (res.err < char_out.err) if (res.err < char_out.err)
char_out = res; char_out = res;
@ -289,58 +306,63 @@ CharResults CalcCharFallback(uint2 char_pos) {
- distribute all characters over the lanes and check for them in parallel - distribute all characters over the lanes and check for them in parallel
- distribute the uniform texture access and broadcast each back to each lane - distribute the uniform texture access and broadcast each back to each lane
*/ */
CharResults CalcCharSIMD(uint2 char_pos, uint simd_width) { CharResults CalcCharSIMD(uint2 char_pos, uint simd_width)
{
// Font color, bg color, character, error -- of character with minimum error // Font color, bg color, character, error -- of character with minimum error
CharResults char_out; CharResults char_out;
char_out.err = char_pixels * char_pixels; char_out.err = char_pixels * char_pixels;
float3 t; float3 t;
#ifdef SUPPORTS_SUBGROUP_REDUCTION #ifdef SUPPORTS_SUBGROUP_REDUCTION
// Hack: Work in hard-coded fixed SIMD mode // Hack: Work in hard-coded fixed SIMD mode
if (gl_SubgroupInvocationID < simd_width) { if (gl_SubgroupInvocationID < simd_width)
{
// Loop over all characters // Loop over all characters
for (uint c = 0; c < MAX_CHARS; c += UNROLL_SIMD * simd_width) { for (uint c = 0; c < MAX_CHARS; c += UNROLL_SIMD * simd_width)
{
// registers for "sum of font * texture" // registers for "sum of font * texture"
float3 ft[UNROLL_SIMD]; float3 ft[UNROLL_SIMD];
for (uint i = 0; i < UNROLL_SIMD; i++) for (uint i = 0; i < UNROLL_SIMD; i++)
ft[i] = float3(0.0, 0.0, 0.0); ft[i] = float3(0.0, 0.0, 0.0);
for (uint pixel = 0; pixel < char_pixels; pixel += 32) { for (uint pixel = 0; pixel < char_pixels; pixel += 32)
{
// Preload the font uint32 for the next 32 pixels // Preload the font uint32 for the next 32 pixels
uint font_i[UNROLL_SIMD]; uint font_i[UNROLL_SIMD];
for (uint i = 0; i < UNROLL_SIMD; i++) for (uint i = 0; i < UNROLL_SIMD; i++)
font_i[i] = rasters[c + UNROLL_SIMD*gl_SubgroupInvocationID + i][pixel / 32]; font_i[i] = rasters[c + UNROLL_SIMD * gl_SubgroupInvocationID + i][pixel / 32];
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += simd_width) { for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += simd_width)
// Copy one full WRAP of textures into registers and shuffle them around {
// for later usage. This avoids one memory transaction per tested pixel // Copy one full WRAP of textures into registers and shuffle them around for later usage.
// & character. // This avoids one memory transaction per tested pixel & character.
float3 tex_simd = SampleTex(char_pos, pixel + pixel_offset + gl_SubgroupInvocationID); float3 tex_simd = SampleTex(char_pos, pixel + pixel_offset + gl_SubgroupInvocationID);
for (uint k = 0; k < simd_width; k += 1) { for (uint k = 0; k < simd_width; k += 1)
{
float3 tex = subgroupBroadcast(tex_simd, k); float3 tex = subgroupBroadcast(tex_simd, k);
// Note: As pixel iterates based on power-of-two gl_SubgroupSize, the // Note: As pixel iterates based on power-of-two gl_SubgroupSize,
// const memory access to rasters is CSE'd and the inner loop // the const memory access to rasters is CSE'd and the inner loop
// after unrolling only contains: testing one bit + shuffle + // after unrolling only contains: testing one bit + shuffle +
// conditional add // conditional add
for (uint i = 0; i < UNROLL_SIMD; i++) { for (uint i = 0; i < UNROLL_SIMD; i++)
{
float font = (font_i[i] >> (k + pixel_offset % 32)) & uint(1); float font = (font_i[i] >> (k + pixel_offset % 32)) & uint(1);
ft[i] += font * tex; ft[i] += font * tex;
} }
} }
} }
} }
if (c == 0) { if (c == 0)
{
// font[0] is a hardcoded 1 font, so t = ft // font[0] is a hardcoded 1 font, so t = ft
t = subgroupBroadcast(ft[0], 0); t = subgroupBroadcast(ft[0], 0);
} }
for (uint i = 0; i < UNROLL_SIMD; i++) { for (uint i = 0; i < UNROLL_SIMD; i++)
CharResults res = CalcCharRes(c + UNROLL_SIMD*gl_SubgroupInvocationID + i, t, ft[i]); {
CharResults res = CalcCharRes(c + UNROLL_SIMD * gl_SubgroupInvocationID + i, t, ft[i]);
if (res.err < char_out.err) if (res.err < char_out.err)
char_out = res; char_out = res;
} }
@ -355,63 +377,107 @@ CharResults CalcCharSIMD(uint2 char_pos, uint simd_width) {
char_out.c = subgroupBroadcast(char_out.c, smallest); char_out.c = subgroupBroadcast(char_out.c, smallest);
char_out.err = err_min; char_out.err = err_min;
#endif #endif
return char_out; return char_out;
} }
// Check whether the lowest simd_width lanes of the subgroup ballot are all set.
// Always returns false when SUPPORTS_SUBGROUP_REDUCTION is not defined.
bool supportsSIMD(uint simd_width)
{
#ifndef SUPPORTS_SUBGROUP_REDUCTION
  return false;
#else
  // Mask with the lowest simd_width bits set. A width of 32 is handled separately
  // so that no shift by the full bit width is needed.
  uint lane_mask = 0xFFFFFFFFu;
  if (simd_width != 32u)
    lane_mask = (1u << simd_width) - 1;

  // Only the first ballot word is inspected.
  uint4 ballot = subgroupBallot(true);
  return (ballot[0] & lane_mask) == lane_mask;
#endif
}
// Warning shown when the subgroup path is not compiled in:
// "Error: The AsciiArt shader requires the missing GPU extension KHR_shader_subgroup."
//
// Every entry is a character index into the font table: the ASCII code of the letter
// minus 32 (e.g. 'E' = 69 -> 37). Spaces are stored as 95, the index ShowWarning() also
// uses for "just background".
const uint missing_subgroup_warning_len = 82;
const uint missing_subgroup_warning[missing_subgroup_warning_len] = {
    37, 82, 82, 79, 82, 26, 95, 52, 72, 69, 95, 33, 83, 67, 73, 73, 33, 82, 84, 95, 83,
    72, 65, 68, 69, 82, 95, 82, 69, 81, 85, 73, 82, 69, 83, 95, 84, 72, 69, 95, 77, 73,
    83, 83, 73, 78, 71, 95, 39, 48, 53, 95, 69, 88, 84, 69, 78, 83, 73, 79, 78, 95, 43,
    40, 50, 63, 83, 72, 65, 68, 69, 82, 63, 83, 85, 66, 71, 82, 79, 85, 80, 14};
// Render the missing-subgroup warning as white-on-black text.
//   char_pos: character position of the pixel being shaded
// Row 0 shows the message, one character per column; every other position handed to this
// function is drawn as plain background.
float3 ShowWarning(uint2 char_pos)
{
  bool inside_message = char_pos.y == 0u && char_pos.x < missing_subgroup_warning_len;

  CharResults glyph;
  glyph.fg = float3(1.0, 1.0, 1.0);
  glyph.bg = float3(0.0, 0.0, 0.0);
  // 95u: just background
  glyph.c = inside_message ? missing_subgroup_warning[char_pos.x] : 95u;

  return GetFinalPixel(glyph);
}
void main()
{
// Calculate the character position of this pixel // Calculate the character position of this pixel
float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution(); float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
uint2 char_pos_self = uint2(floor(GetCoordinates() * resolution / char_dim)); uint2 char_pos_self = uint2(floor(GetCoordinates() * resolution / char_dim));
float3 color_out; float3 color_out;
#ifdef SUPPORTS_SUBGROUP_REDUCTION #ifdef SUPPORTS_SUBGROUP_REDUCTION
if (supportsSIMD(8)) { if (supportsSIMD(8))
{
// Loop over all character positions covered by this wave // Loop over all character positions covered by this wave
bool pixel_active = !gl_HelperInvocation; bool pixel_active = !gl_HelperInvocation;
CharResults char_out; CharResults char_out;
while (true) { while (true)
{
// Fetch the next active character position // Fetch the next active character position
uint4 active_lanes = subgroupBallot(pixel_active); uint4 active_lanes = subgroupBallot(pixel_active);
if (active_lanes == uint4(0, 0, 0, 0)) { if (active_lanes == uint4(0, 0, 0, 0))
{
break; break;
} }
uint2 char_pos = subgroupBroadcast(char_pos_self, subgroupBallotFindLSB(active_lanes)); uint2 char_pos = subgroupBroadcast(char_pos_self, subgroupBallotFindLSB(active_lanes));
// And calculate everything for this character position // And calculate everything for this character position
if (supportsSIMD(32)) { if (supportsSIMD(32))
{
char_out = CalcCharSIMD(char_pos, 32); char_out = CalcCharSIMD(char_pos, 32);
} else if (supportsSIMD(16)) { }
else if (supportsSIMD(16))
{
char_out = CalcCharSIMD(char_pos, 16); char_out = CalcCharSIMD(char_pos, 16);
} else if (supportsSIMD(8)) { }
else if (supportsSIMD(8))
{
char_out = CalcCharSIMD(char_pos, 8); char_out = CalcCharSIMD(char_pos, 8);
} }
// Draw the character on screen // Draw the character on screen
if (char_pos == char_pos_self) { if (char_pos == char_pos_self)
{
color_out = GetFinalPixel(char_out); color_out = GetFinalPixel(char_out);
pixel_active = false; pixel_active = false;
} }
if (OptionEnabled(DEBUG_ONLY_ONE_CHAR)) { if (OptionEnabled(DEBUG_ONLY_ONE_CHAR))
{
break; break;
} }
} }
} else }
#endif else
if (HAVE_FULL_FEATURE_FALLBACK) { #else
if (char_pos_self.y <= 1u)
{
color_out = ShowWarning(char_pos_self);
}
else
#endif
if (HAVE_FULL_FEATURE_FALLBACK)
{
color_out = GetFinalPixel(CalcCharFallback(char_pos_self)); color_out = GetFinalPixel(CalcCharFallback(char_pos_self));
} else { }
else
{
color_out = Sample().xyz; color_out = Sample().xyz;
} }