From b252be40cf361c7c9caad0e450b5b6cdbafb0abe Mon Sep 17 00:00:00 2001
From: "XTra.KrazzY" <XTra.KrazzY@gmail.com>
Date: Wed, 14 Jan 2009 11:28:48 +0000
Subject: [PATCH] Added S-SSE3 implementation of one perf. critical texture
 decoder, please notify me if it breaks anything. Thanks.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1868 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 Source/Core/VideoCommon/Src/SConscript        |   1 +
 .../Core/VideoCommon/Src/TextureDecoder.cpp   | 183 ++++++++++++------
 2 files changed, 128 insertions(+), 56 deletions(-)

diff --git a/Source/Core/VideoCommon/Src/SConscript b/Source/Core/VideoCommon/Src/SConscript
index f7274bedd4..0328d6595d 100644
--- a/Source/Core/VideoCommon/Src/SConscript
+++ b/Source/Core/VideoCommon/Src/SConscript
@@ -30,4 +30,5 @@ files = [
 
 env_common = env.Clone()
 env_common.Append(CXXFLAGS = [ '-fPIC' ])
+env_common.Append(CXXFLAGS = [ '-mssse3' ]) # For TextureDecoder
 env_common.StaticLibrary("videocommon", files)
diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
index 9901347cd9..a9e1234935 100644
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@@ -17,9 +17,11 @@
 
 #include "Common.h"
 
+#include "CPUDetect.h"
 #include "TextureDecoder.h"
 #include "LookUpTables.h"
 #include <emmintrin.h>
+#include <tmmintrin.h>
 
 //Uncomment this to enable Texture Format ID overlays
 #define OVERLAY_TEXFMT
@@ -184,6 +186,126 @@ inline void decodebytesI4(u32 *dst, const u8 *src)
     }
 }
 
+inline void sseDecodebytesI4(u32* dst, const __m128i* sseSrc, int height,
+                             int width) {
+    __m128i* sseDst;
+
+    // SSSE3 variant
+    if(cpu_info.bSSSE3) {
+
+        // TODO(XK): Increase Loop Jump?
+        
+        __m128i s, m[8];
+        unsigned char *umask;
+
+        for(int i = 0; i < 8; i++) {
+            umask = (unsigned char *)&(m[i]);
+            for(int j = 0; j < 14; j += 4) {
+                umask[j]   = 0x00 + (i * 4);
+                umask[j+1] = 0x01 + (i * 4);
+                umask[j+2] = 0x02 + (i * 4);
+                umask[j+3] = 0x03 + (i * 4);
+            }
+        }
+
+        for (int y = 0; y < height; y += 8) {
+            for (int x = 0; x < width; x += 8) {
+                for (int iy = 0; iy < 8; iy++, sseSrc++) {
+                    s = _mm_load_si128 (sseSrc);
+
+                    // TODO: Supplemental Value Lazyness v3
+                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[1]));
+                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[0]));
+                    iy++;
+
+                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[3]));
+                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[2]));
+                    iy++;
+
+                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[5]));
+                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[4]));
+                    iy++;
+
+                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[7]));
+                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[6]));
+                    iy++;
+
+                }
+            }
+        }
+
+        return;
+        
+    }
+
+    __m128i Lmask = _mm_set1_epi8 (0x0F);
+    __m128i Hmask = _mm_set1_epi8 (0xF0);
+
+    for (int y = 0; y < height; y += 8) {
+        for (int x = 0; x < width; x += 8) {
+            for (int iy = 0; iy < 8; iy++, sseSrc++) {
+                // TODO (mb2): Don't let the optimizer perform all the clean up by itself
+                // (XK): Huh? What clean up? Where? Who? :)
+                
+                __m128i s = _mm_load_si128 (sseSrc);		// ab cd ef gh ...
+                __m128i sl = _mm_and_si128 (s, Lmask);		// 0b 0d 0f 0h ...
+                __m128i sls = _mm_slli_epi16 (sl, 4);		// b0 d0 f0 h0 ...
+                __m128i sl_ = _mm_or_si128 (sl, sls);		// bb dd ff ff ...
+                        
+                __m128i sh = _mm_and_si128 (s, Hmask);		// a0 c0 e0 g0 ...
+                __m128i shs = _mm_srli_epi16 (sh, 4);		// 0a 0c 0e g0 ...
+                __m128i sh_ = _mm_or_si128 (sh, shs);		// aa cc ee gg ...
+                __m128i rl = _mm_unpacklo_epi8 (sh_, sl_);	// bb aa dd cc ...
+                __m128i rh = _mm_unpackhi_epi8 (sh_, sl_);	// ff ee hh gg ...
+                        
+                // result part a
+                __m128i ral = _mm_unpacklo_epi8 (rl, rl);	// bb bb aa aa ...
+                __m128i rah = _mm_unpackhi_epi8 (rl, rl);	// dd dd cc cc ...
+                
+                __m128i rall = _mm_unpacklo_epi16 (ral, ral);	// bb bb bb bb ...	-> done
+                __m128i ralh = _mm_unpackhi_epi16 (ral, ral);	// aa aa aa aa ...	-> done
+                __m128i rahl = _mm_unpacklo_epi16 (rah, rah);	// dd dd dd dd ...	-> done
+                __m128i rahh = _mm_unpackhi_epi16 (rah, rah);	// cc cc cc cc ...	-> done
+                        
+                // result part b
+                __m128i rbl = _mm_unpacklo_epi8 (rh, rh);	// ff ff ee ee ...
+                __m128i rbh = _mm_unpackhi_epi8 (rh, rh);	// hh hh gg gg ...
+
+                __m128i rbll = _mm_unpacklo_epi16 (rbl, rbl);	// ff ff ff ff ...	-> done
+                __m128i rblh = _mm_unpackhi_epi16 (rbl, rbl);	// ee ee ee ee ...	-> done
+                __m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh);	// hh hh hh hh ...	-> done
+                __m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh);	// gg gg gg gg ...	-> done
+                
+                // Store
+                // TODO: Value lazyness
+                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                _mm_store_si128 (sseDst++, rall); 
+                _mm_store_si128 (sseDst, ralh);
+                iy++;
+                
+                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                _mm_store_si128 (sseDst++, rahl);
+                _mm_store_si128 (sseDst, rahh);
+                iy++;
+                
+                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                _mm_store_si128 (sseDst++, rbll);
+                _mm_store_si128 (sseDst, rblh);
+                iy++;
+                
+                
+                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
+                _mm_store_si128 (sseDst++, rbhl);
+                _mm_store_si128 (sseDst, rbhh);
+            }
+        }
+    }
+}
+
 inline void decodebytesI8_8(u32 *dst, const u8 *src)
 {
 	for (int x = 0; x < 8; x++)
@@ -385,67 +507,16 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
         return PC_TEX_FMT_BGRA32;
     case GX_TF_I4:
         {
-        	// TODO: SSSE3 variant (pshufb), THP videos use this format.
-        	// SSSE3 variant could bring even more speed
-#if 1
-			__m128i Lmask = _mm_set1_epi8 (0x0F);
-			__m128i Hmask = _mm_set1_epi8 (0xF0);
-			const __m128i* sseSrc  = (const __m128i *)src;
-			__m128i* sseDst  = (__m128i *)dst;
-            for (int y = 0; y < height; y += 8)
-	            for (int x = 0; x < width; x += 8)
-					for (int iy = 0; iy < 8; iy++, sseSrc++) {
-						// TODO (mb2): func and don't let the optimizer perform all the clean up by itself
-						__m128i s = _mm_load_si128 (sseSrc);			// ab cd ef gh ...
-						__m128i sl = _mm_and_si128 (s, Lmask);			// 0b 0d 0f 0h ...
-						__m128i sls = _mm_slli_epi16 (sl, 4);			// b0 d0 f0 h0 ...
-						__m128i sl_ = _mm_or_si128 (sl, sls);			// bb dd ff ff ...
-
-						__m128i sh = _mm_and_si128 (s, Hmask);			// a0 c0 e0 g0 ...
-						__m128i shs = _mm_srli_epi16 (sh, 4);			// 0a 0c 0e g0 ...
-						__m128i sh_ = _mm_or_si128 (sh, shs);			// aa cc ee gg ...
-						__m128i rl = _mm_unpacklo_epi8 (sh_, sl_);		// bb aa dd cc ...
-						__m128i rh = _mm_unpackhi_epi8 (sh_, sl_);		// 
-
-						__m128i ral = _mm_unpacklo_epi8 (rl, rl);		// bb bb aa aa ...
-						__m128i rah = _mm_unpackhi_epi8 (rl, rl);		// 
-						// result part a
-						__m128i rall = _mm_unpacklo_epi16 (ral, ral);	// bb bb bb bb ...	-> done
-						__m128i ralh = _mm_unpackhi_epi16 (ral, ral);	//					-> done
-						__m128i rahl = _mm_unpacklo_epi16 (rah, rah);	//					-> done
-						__m128i rahh = _mm_unpackhi_epi16 (rah, rah);	//					-> done
-
-						__m128i rbl = _mm_unpacklo_epi8 (rh, rh);		// 
-						__m128i rbh = _mm_unpackhi_epi8 (rh, rh);		// 
-						// result part b
-						__m128i rbll = _mm_unpacklo_epi16 (rbl, rbl);	//					-> done
-						__m128i rblh = _mm_unpackhi_epi16 (rbl, rbl);	//					-> done
-						__m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh);	//					-> done
-						__m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh);	//					-> done
-						// store
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);	// that sucks... too lazy 
-						_mm_store_si128 (sseDst++, rall); 
-						_mm_store_si128 (sseDst, ralh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rahl);
-						_mm_store_si128 (sseDst, rahh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rbll);
-						_mm_store_si128 (sseDst, rblh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rbhl);
-						_mm_store_si128 (sseDst, rbhh);
-					}
-#else
+            sseDecodebytesI4((u32 *)dst, (const __m128i *)src, height,
+                             width);
+            
+            /* Old non-SSE way
             for (int y = 0; y < height; y += 8)
                 for (int x = 0; x < width; x += 8)
                     for (int iy = 0; iy < 8; iy++, src += 4)
                         //decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4);
                         decodebytesI4((u32*)dst+(y+iy)*width+x, src);
-#endif
+            */
         }
         return PC_TEX_FMT_BGRA32;
     case GX_TF_C8: