Added an SSSE3 implementation of one performance-critical texture decoder; please notify me if it breaks anything. Thanks.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1868 8ced0084-cf51-0410-be5f-012b33b47a6e
XTra.KrazzY 2009-01-14 11:28:48 +00:00
parent ed3281290d
commit b252be40cf
2 changed files with 128 additions and 56 deletions
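
The fast path added here is built around pshufb (_mm_shuffle_epi8), which fills each destination byte from an arbitrary source byte selected by a control mask. A small standalone sketch of the dword broadcast that the decoder's masks perform (illustrative only, not part of this commit; compile with -mssse3):

#include <tmmintrin.h> // SSSE3 intrinsics
#include <stdio.h>

int main()
{
	unsigned char src[16], out[16];
	for (int i = 0; i < 16; i++)
		src[i] = (unsigned char)(0x10 + i);

	// Control mask 0,1,2,3 repeated four times: every dword of the result
	// becomes a copy of the first source dword (what the decoder's m[0] does).
	unsigned char ctrl[16] = { 0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3 };

	__m128i s = _mm_loadu_si128((const __m128i *)src);
	__m128i m = _mm_loadu_si128((const __m128i *)ctrl);
	_mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(s, m));

	for (int i = 0; i < 16; i++)
		printf("%02x ", out[i]); // prints 10 11 12 13 four times
	printf("\n");
	return 0;
}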


@@ -30,4 +30,5 @@ files = [
 env_common = env.Clone()
 env_common.Append(CXXFLAGS = [ '-fPIC' ])
+env_common.Append(CXXFLAGS = [ '-mssse3' ]) # For TextureDecoder
 env_common.StaticLibrary("videocommon", files)
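
The -mssse3 flag only lets the compiler emit SSSE3 instructions for this library; whether the new decoder path actually runs is still decided at runtime through cpu_info.bSSSE3 from CPUDetect.h (see the decoder below). For reference, a generic runtime check on GCC/Clang could look like the following sketch (HasSSSE3 is a hypothetical helper, not Dolphin's CPUDetect code):

#include <cpuid.h> // GCC/Clang

// SSSE3 is reported in CPUID leaf 1, ECX bit 9.
static bool HasSSSE3()
{
	unsigned int eax, ebx, ecx, edx;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	return (ecx & (1u << 9)) != 0;
}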


@@ -17,9 +17,11 @@
 #include "Common.h"
+#include "CPUDetect.h"
 #include "TextureDecoder.h"
 #include "LookUpTables.h"
 #include <emmintrin.h>
+#include <tmmintrin.h>
 
 //Uncomment this to enable Texture Format ID overlays
 #define OVERLAY_TEXFMT
@@ -184,6 +186,126 @@ inline void decodebytesI4(u32 *dst, const u8 *src)
 	}
 }
+inline void sseDecodebytesI4(u32* dst, const __m128i* sseSrc, int height,
+							 int width) {
+	__m128i* sseDst;
+
+	// SSSE3 variant
+	if (cpu_info.bSSSE3) {
+		// TODO(XK): Increase Loop Jump?
+		__m128i s, m[8];
+		unsigned char *umask;
+		// Build the pshufb control masks: m[i] repeats byte indices
+		// i*4 .. i*4+3 across the whole register, i.e. it broadcasts one
+		// source dword over all four destination dwords. Note that pshufb
+		// only uses the low 4 bits of each index, so the indices >= 16
+		// produced for m[4]..m[7] wrap back onto source bytes 0..15.
+		for (int i = 0; i < 8; i++) {
+			umask = (unsigned char *)&(m[i]);
+			for (int j = 0; j < 16; j += 4) {
+				umask[j]   = 0x00 + (i * 4);
+				umask[j+1] = 0x01 + (i * 4);
+				umask[j+2] = 0x02 + (i * 4);
+				umask[j+3] = 0x03 + (i * 4);
+			}
+		}
+
+		for (int y = 0; y < height; y += 8) {
+			for (int x = 0; x < width; x += 8) {
+				// One 16-byte load covers four 4-byte rows of the 8x8 tile;
+				// each row is written out as two 16-byte stores (8 texels).
+				for (int iy = 0; iy < 8; iy++, sseSrc++) {
+					s = _mm_load_si128 (sseSrc);
+					// TODO: Supplemental Value Laziness v3
+					sseDst = (__m128i*)(dst + (y+iy)*width + x); // dst is u32*, one u32 per texel
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[1]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[0]));
+					iy++;
+					sseDst = (__m128i*)(dst + (y+iy)*width + x);
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[3]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[2]));
+					iy++;
+					sseDst = (__m128i*)(dst + (y+iy)*width + x);
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[5]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[4]));
+					iy++;
+					sseDst = (__m128i*)(dst + (y+iy)*width + x);
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[7]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[6]));
+					iy++;
+				}
+			}
+		}
+		return;
+	}
+
+	// SSE2 fallback
+	__m128i Lmask = _mm_set1_epi8 (0x0F);
+	__m128i Hmask = _mm_set1_epi8 (0xF0);
+	for (int y = 0; y < height; y += 8) {
+		for (int x = 0; x < width; x += 8) {
+			for (int iy = 0; iy < 8; iy++, sseSrc++) {
+				// TODO (mb2): Don't let the optimizer perform all the clean up by itself
+				// (XK): Huh? What clean up? Where? Who? :)
+				__m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ... (two 4-bit texels per byte)
+				__m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ...
+				__m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ...
+				__m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff hh ...
+				__m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ...
+				__m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e 0g ...
+				__m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ...
+				__m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // aa bb cc dd ... (texel rows 0-1 of this load)
+				__m128i rh = _mm_unpackhi_epi8 (sh_, sl_); // same for the upper 8 source bytes (rows 2-3)
+				// result part a
+				__m128i ral = _mm_unpacklo_epi8 (rl, rl); // aa aa bb bb ...
+				__m128i rah = _mm_unpackhi_epi8 (rl, rl); // ii ii jj jj ...
+				__m128i rall = _mm_unpacklo_epi16 (ral, ral); // aa aa aa aa bb bb bb bb ... -> done
+				__m128i ralh = _mm_unpackhi_epi16 (ral, ral); // ee ee ee ee ff ff ff ff ... -> done
+				__m128i rahl = _mm_unpacklo_epi16 (rah, rah); // ii ii ii ii jj jj jj jj ... -> done
+				__m128i rahh = _mm_unpackhi_epi16 (rah, rah); // mm mm mm mm nn nn nn nn ... -> done
+				// result part b
+				__m128i rbl = _mm_unpacklo_epi8 (rh, rh);
+				__m128i rbh = _mm_unpackhi_epi8 (rh, rh);
+				__m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // -> done
+				__m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // -> done
+				__m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // -> done
+				__m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // -> done
+				// Store: four output rows per 16-byte load
+				// TODO: Value laziness
+				sseDst = (__m128i*)(dst + (y+iy)*width + x); // dst is u32*, one u32 per texel
+				_mm_store_si128 (sseDst++, rall);
+				_mm_store_si128 (sseDst, ralh);
+				iy++;
+				sseDst = (__m128i*)(dst + (y+iy)*width + x);
+				_mm_store_si128 (sseDst++, rahl);
+				_mm_store_si128 (sseDst, rahh);
+				iy++;
+				sseDst = (__m128i*)(dst + (y+iy)*width + x);
+				_mm_store_si128 (sseDst++, rbll);
+				_mm_store_si128 (sseDst, rblh);
+				iy++;
+				sseDst = (__m128i*)(dst + (y+iy)*width + x);
+				_mm_store_si128 (sseDst++, rbhl);
+				_mm_store_si128 (sseDst, rbhh);
+			}
+		}
+	}
+}
 inline void decodebytesI8_8(u32 *dst, const u8 *src)
 {
 	for (int x = 0; x < 8; x++)
@@ -385,67 +507,16 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_I4:
 		{
-			// TODO: SSSE3 variant (pshufb), THP videos use this format.
-			// SSSE3 variant could bring even more speed
-#if 1
-			__m128i Lmask = _mm_set1_epi8 (0x0F);
-			__m128i Hmask = _mm_set1_epi8 (0xF0);
-			const __m128i* sseSrc = (const __m128i *)src;
-			__m128i* sseDst = (__m128i *)dst;
-			for (int y = 0; y < height; y += 8)
-				for (int x = 0; x < width; x += 8)
-					for (int iy = 0; iy < 8; iy++, sseSrc++) {
-						// TODO (mb2): func and don't let the optimizer perform all the clean up by itself
-						__m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ...
-						__m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ...
-						__m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ...
-						__m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff ff ...
-						__m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ...
-						__m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e g0 ...
-						__m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ...
-						__m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // bb aa dd cc ...
-						__m128i rh = _mm_unpackhi_epi8 (sh_, sl_); //
-						__m128i ral = _mm_unpacklo_epi8 (rl, rl); // bb bb aa aa ...
-						__m128i rah = _mm_unpackhi_epi8 (rl, rl); //
-						// result part a
-						__m128i rall = _mm_unpacklo_epi16 (ral, ral); // bb bb bb bb ... -> done
-						__m128i ralh = _mm_unpackhi_epi16 (ral, ral); // -> done
-						__m128i rahl = _mm_unpacklo_epi16 (rah, rah); // -> done
-						__m128i rahh = _mm_unpackhi_epi16 (rah, rah); // -> done
-						__m128i rbl = _mm_unpacklo_epi8 (rh, rh); //
-						__m128i rbh = _mm_unpackhi_epi8 (rh, rh); //
-						// result part b
-						__m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // -> done
-						__m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // -> done
-						__m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // -> done
-						__m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // -> done
-						// store
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); // that sucks... too lazy
-						_mm_store_si128 (sseDst++, rall);
-						_mm_store_si128 (sseDst, ralh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rahl);
-						_mm_store_si128 (sseDst, rahh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rbll);
-						_mm_store_si128 (sseDst, rblh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rbhl);
-						_mm_store_si128 (sseDst, rbhh);
-					}
-#else
+			sseDecodebytesI4((u32 *)dst, (const __m128i *)src, height,
+				 width);
+
+			/* Old non-SSE way
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0; x < width; x += 8)
 					for (int iy = 0; iy < 8; iy++, src += 4)
 						//decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4);
 						decodebytesI4((u32*)dst+(y+iy)*width+x, src);
-#endif
+			*/
 		}
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_C8:
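
For comparison with both vector paths above, the I4 expansion itself (two texels per source byte, high nibble first, the 4-bit value replicated into every byte of a 32-bit output texel) can be written as a plain scalar loop over one 8x8 tile. This is only an illustrative sketch; DecodeI4TileScalar is a hypothetical name, not a function from the commit:

// Decode one 8x8 I4 tile (32 bytes at src) into a width-texel-wide u32 image.
static void DecodeI4TileScalar(u32* dst, const u8* src, int width, int x, int y)
{
	for (int iy = 0; iy < 8; iy++)
	{
		u32* row = dst + (y + iy) * width + x;
		for (int ix = 0; ix < 4; ix++)
		{
			u8 val = src[iy * 4 + ix];
			u8 hi = (u8)((val >> 4) * 0x11);   // replicate nibble: 0xA -> 0xAA
			u8 lo = (u8)((val & 0x0F) * 0x11);
			row[ix * 2 + 0] = hi * 0x01010101u; // intensity in all four channels
			row[ix * 2 + 1] = lo * 0x01010101u;
		}
	}
}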