Added an SSSE3 implementation of one performance-critical texture decoder; please notify me if it breaks anything. Thanks.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1868 8ced0084-cf51-0410-be5f-012b33b47a6e
XTra.KrazzY 2009-01-14 11:28:48 +00:00
parent ed3281290d
commit b252be40cf
2 changed files with 128 additions and 56 deletions
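
The fast path added here is built around pshufb (_mm_shuffle_epi8), which fills each destination byte from an arbitrary source byte selected by a control mask. A small standalone sketch of the dword broadcast that the decoder's masks perform (illustrative only, not part of this commit; compile with -mssse3):

#include <tmmintrin.h> // SSSE3 intrinsics
#include <stdio.h>

int main()
{
	unsigned char src[16], out[16];
	for (int i = 0; i < 16; i++)
		src[i] = (unsigned char)(0x10 + i);

	// Control mask 0,1,2,3 repeated four times: every dword of the result
	// becomes a copy of the first source dword (what the decoder's m[0] does).
	unsigned char ctrl[16] = { 0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3 };

	__m128i s = _mm_loadu_si128((const __m128i *)src);
	__m128i m = _mm_loadu_si128((const __m128i *)ctrl);
	_mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(s, m));

	for (int i = 0; i < 16; i++)
		printf("%02x ", out[i]); // prints 10 11 12 13 four times
	printf("\n");
	return 0;
}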


@@ -30,4 +30,5 @@ files = [
 env_common = env.Clone()
 env_common.Append(CXXFLAGS = [ '-fPIC' ])
+env_common.Append(CXXFLAGS = [ '-mssse3' ]) # For TextureDecoder
 env_common.StaticLibrary("videocommon", files)
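
The -mssse3 flag only lets the compiler emit SSSE3 instructions for this library; whether the new decoder path actually runs is still decided at runtime through cpu_info.bSSSE3 from CPUDetect.h (see the decoder below). For reference, a generic runtime check on GCC/Clang could look like the following sketch (HasSSSE3 is a hypothetical helper, not Dolphin's CPUDetect code):

#include <cpuid.h> // GCC/Clang

// SSSE3 is reported in CPUID leaf 1, ECX bit 9.
static bool HasSSSE3()
{
	unsigned int eax, ebx, ecx, edx;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	return (ecx & (1u << 9)) != 0;
}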


@@ -17,9 +17,11 @@
 #include "Common.h"
+#include "CPUDetect.h"
 #include "TextureDecoder.h"
 #include "LookUpTables.h"
 #include <emmintrin.h>
+#include <tmmintrin.h>
 
 //Uncomment this to enable Texture Format ID overlays
 #define OVERLAY_TEXFMT
@@ -184,6 +186,126 @@ inline void decodebytesI4(u32 *dst, const u8 *src)
 	}
 }
+inline void sseDecodebytesI4(u32* dst, const __m128i* sseSrc, int height,
+							 int width) {
+	__m128i* sseDst;
+
+	// SSSE3 variant
+	if (cpu_info.bSSSE3) {
+		// TODO(XK): Increase Loop Jump?
+		__m128i s, m[8];
+		unsigned char *umask;
+		// Build the pshufb control masks: m[i] repeats byte indices
+		// i*4 .. i*4+3 across the whole register, i.e. it broadcasts one
+		// source dword over all four destination dwords. Note that pshufb
+		// only uses the low 4 bits of each index, so the indices >= 16
+		// produced for m[4]..m[7] wrap back onto source bytes 0..15.
+		for (int i = 0; i < 8; i++) {
+			umask = (unsigned char *)&(m[i]);
+			for (int j = 0; j < 16; j += 4) {
+				umask[j]   = 0x00 + (i * 4);
+				umask[j+1] = 0x01 + (i * 4);
+				umask[j+2] = 0x02 + (i * 4);
+				umask[j+3] = 0x03 + (i * 4);
+			}
+		}
+
+		for (int y = 0; y < height; y += 8) {
+			for (int x = 0; x < width; x += 8) {
+				// One 16-byte load covers four 4-byte rows of the 8x8 tile;
+				// each row is written out as two 16-byte stores (8 texels).
+				for (int iy = 0; iy < 8; iy++, sseSrc++) {
+					s = _mm_load_si128 (sseSrc);
+					// TODO: Supplemental Value Laziness v3
+					sseDst = (__m128i*)(dst + (y+iy)*width + x); // dst is u32*, one u32 per texel
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[1]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[0]));
+					iy++;
+					sseDst = (__m128i*)(dst + (y+iy)*width + x);
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[3]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[2]));
+					iy++;
+					sseDst = (__m128i*)(dst + (y+iy)*width + x);
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[5]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[4]));
+					iy++;
+					sseDst = (__m128i*)(dst + (y+iy)*width + x);
+					_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[7]));
+					_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[6]));
+					iy++;
+				}
+			}
+		}
+		return;
+	}
+
+	// SSE2 fallback
+	__m128i Lmask = _mm_set1_epi8 (0x0F);
+	__m128i Hmask = _mm_set1_epi8 (0xF0);
+	for (int y = 0; y < height; y += 8) {
+		for (int x = 0; x < width; x += 8) {
+			for (int iy = 0; iy < 8; iy++, sseSrc++) {
+				// TODO (mb2): Don't let the optimizer perform all the clean up by itself
+				// (XK): Huh? What clean up? Where? Who? :)
+				__m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ... (two 4-bit texels per byte)
+				__m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ...
+				__m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ...
+				__m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff hh ...
+				__m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ...
+				__m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e 0g ...
+				__m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ...
+				__m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // aa bb cc dd ... (texel rows 0-1 of this load)
+				__m128i rh = _mm_unpackhi_epi8 (sh_, sl_); // same for the upper 8 source bytes (rows 2-3)
+				// result part a
+				__m128i ral = _mm_unpacklo_epi8 (rl, rl); // aa aa bb bb ...
+				__m128i rah = _mm_unpackhi_epi8 (rl, rl); // ii ii jj jj ...
+				__m128i rall = _mm_unpacklo_epi16 (ral, ral); // aa aa aa aa bb bb bb bb ... -> done
+				__m128i ralh = _mm_unpackhi_epi16 (ral, ral); // ee ee ee ee ff ff ff ff ... -> done
+				__m128i rahl = _mm_unpacklo_epi16 (rah, rah); // ii ii ii ii jj jj jj jj ... -> done
+				__m128i rahh = _mm_unpackhi_epi16 (rah, rah); // mm mm mm mm nn nn nn nn ... -> done
+				// result part b
+				__m128i rbl = _mm_unpacklo_epi8 (rh, rh);
+				__m128i rbh = _mm_unpackhi_epi8 (rh, rh);
+				__m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // -> done
+				__m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // -> done
+				__m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // -> done
+				__m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // -> done
+				// Store: four output rows per 16-byte load
+				// TODO: Value laziness
+				sseDst = (__m128i*)(dst + (y+iy)*width + x); // dst is u32*, one u32 per texel
+				_mm_store_si128 (sseDst++, rall);
+				_mm_store_si128 (sseDst, ralh);
+				iy++;
+				sseDst = (__m128i*)(dst + (y+iy)*width + x);
+				_mm_store_si128 (sseDst++, rahl);
+				_mm_store_si128 (sseDst, rahh);
+				iy++;
+				sseDst = (__m128i*)(dst + (y+iy)*width + x);
+				_mm_store_si128 (sseDst++, rbll);
+				_mm_store_si128 (sseDst, rblh);
+				iy++;
+				sseDst = (__m128i*)(dst + (y+iy)*width + x);
+				_mm_store_si128 (sseDst++, rbhl);
+				_mm_store_si128 (sseDst, rbhh);
+			}
+		}
+	}
+}
 inline void decodebytesI8_8(u32 *dst, const u8 *src)
 {
 	for (int x = 0; x < 8; x++)
@@ -385,67 +507,16 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_I4:
 		{
-			// TODO: SSSE3 variant (pshufb), THP videos use this format.
-			// SSSE3 variant could bring even more speed
-#if 1
-			__m128i Lmask = _mm_set1_epi8 (0x0F);
-			__m128i Hmask = _mm_set1_epi8 (0xF0);
-			const __m128i* sseSrc = (const __m128i *)src;
-			__m128i* sseDst = (__m128i *)dst;
-			for (int y = 0; y < height; y += 8)
-				for (int x = 0; x < width; x += 8)
-					for (int iy = 0; iy < 8; iy++, sseSrc++) {
-						// TODO (mb2): func and don't let the optimizer perform all the clean up by itself
-						__m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ...
-						__m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ...
-						__m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ...
-						__m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff ff ...
-						__m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ...
-						__m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e g0 ...
-						__m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ...
-						__m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // bb aa dd cc ...
-						__m128i rh = _mm_unpackhi_epi8 (sh_, sl_); //
-						__m128i ral = _mm_unpacklo_epi8 (rl, rl); // bb bb aa aa ...
-						__m128i rah = _mm_unpackhi_epi8 (rl, rl); //
-						// result part a
-						__m128i rall = _mm_unpacklo_epi16 (ral, ral); // bb bb bb bb ... -> done
-						__m128i ralh = _mm_unpackhi_epi16 (ral, ral); // -> done
-						__m128i rahl = _mm_unpacklo_epi16 (rah, rah); // -> done
-						__m128i rahh = _mm_unpackhi_epi16 (rah, rah); // -> done
-						__m128i rbl = _mm_unpacklo_epi8 (rh, rh); //
-						__m128i rbh = _mm_unpackhi_epi8 (rh, rh); //
-						// result part b
-						__m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // -> done
-						__m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // -> done
-						__m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // -> done
-						__m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // -> done
-						// store
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); // that sucks... too lazy
-						_mm_store_si128 (sseDst++, rall);
-						_mm_store_si128 (sseDst, ralh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rahl);
-						_mm_store_si128 (sseDst, rahh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rbll);
-						_mm_store_si128 (sseDst, rblh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rbhl);
-						_mm_store_si128 (sseDst, rbhh);
-					}
-#else
+			sseDecodebytesI4((u32 *)dst, (const __m128i *)src, height,
+				 width);
+
+			/* Old non-SSE way
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0; x < width; x += 8)
 					for (int iy = 0; iy < 8; iy++, src += 4)
 						//decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4);
 						decodebytesI4((u32*)dst+(y+iy)*width+x, src);
-#endif
+			*/
 		}
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_C8:
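
For comparison with both vector paths above, the I4 expansion itself (two texels per source byte, high nibble first, the 4-bit value replicated into every byte of a 32-bit output texel) can be written as a plain scalar loop over one 8x8 tile. This is only an illustrative sketch; DecodeI4TileScalar is a hypothetical name, not a function from the commit:

// Decode one 8x8 I4 tile (32 bytes at src) into a width-texel-wide u32 image.
static void DecodeI4TileScalar(u32* dst, const u8* src, int width, int x, int y)
{
	for (int iy = 0; iy < 8; iy++)
	{
		u32* row = dst + (y + iy) * width + x;
		for (int ix = 0; ix < 4; ix++)
		{
			u8 val = src[iy * 4 + ix];
			u8 hi = (u8)((val >> 4) * 0x11);   // replicate nibble: 0xA -> 0xAA
			u8 lo = (u8)((val & 0x0F) * 0x11);
			row[ix * 2 + 0] = hi * 0x01010101u; // intensity in all four channels
			row[ix * 2 + 1] = lo * 0x01010101u;
		}
	}
}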