[Cvs-annodex] commit (annodex): liboggplay/trunk/src/tests/Makefile.am liboggplay/trunk/src/tests/glut-player.c

shans nobody at lists.annodex.net
Wed Jan 10 04:44:06 UTC 2007


Update of /var/local/lib/svn/annodex (new revision 2596)

Modified files:
   liboggplay/trunk/src/tests/Makefile.am
   liboggplay/trunk/src/tests/glut-player.c

Log Message:
MMX implementation of YUV->RGB conversion



Modified: liboggplay/trunk/src/tests/Makefile.am
===================================================================
--- liboggplay/trunk/src/tests/Makefile.am	2007-01-03 00:07:41 UTC (rev 2595)
+++ liboggplay/trunk/src/tests/Makefile.am	2007-01-10 04:44:06 UTC (rev 2596)
@@ -1,6 +1,6 @@
 ## Process this file with automake to produce Makefile.in
 
-AM_CFLAGS = -Wall -pedantic
+AM_CFLAGS = -Wall -msse2 -march=pentium3 -std=c99
 
 INCLUDES = -I$(top_srcdir)/include
 

Modified: liboggplay/trunk/src/tests/glut-player.c
===================================================================
--- liboggplay/trunk/src/tests/glut-player.c	2007-01-03 00:07:41 UTC (rev 2595)
+++ liboggplay/trunk/src/tests/glut-player.c	2007-01-10 04:44:06 UTC (rev 2596)
@@ -3,6 +3,7 @@
 #include <oggplay/oggplay.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 
 #include <sndfile.h>
 
@@ -24,6 +25,8 @@
 
 #include <semaphore.h>
 
+#include <xmmintrin.h>
+
 static int n_frames = 0;
 
 static GLuint texture;
@@ -52,10 +55,11 @@
                     OggPlayVideoData * video_data, int frame) {
 
   int               i;
-  unsigned char   * ptry;
-  unsigned char   * ptru;
-  unsigned char   * ptrv;
+  unsigned char   * restrict ptry;
+  unsigned char   * restrict ptru;
+  unsigned char   * restrict ptrv;
   unsigned char   * ptro;
+  unsigned char   * ptro2;
   int               y_width;
   int               y_height;
   int               uv_width;
@@ -84,7 +88,7 @@
 
   if (texture_bits == NULL) {
 
-    texture_bits = calloc(1, po2_width * po2_height * 3);
+    texture_bits = calloc(1, po2_width * po2_height * 4);
     texture_width = po2_width;
     texture_height = po2_height;
     
@@ -92,7 +96,7 @@
 
     free(texture_bits);
     
-    texture_bits = calloc(1, po2_width * po2_height * 3);
+    texture_bits = calloc(1, po2_width * po2_height * 4);
     texture_width = po2_width;
     texture_height = po2_height;
   }
@@ -107,37 +111,159 @@
   ptru = video_data->u;
   ptrv = video_data->v;
   ptro = texture_bits;
+#if 1
+
+  register __m64 *y, *o;
+  register __m64 zero, ut, vt, imm, imm2;
+  register __m64 r, g, b;
+  register __m64 tmp, tmp2;
+
+  zero = _mm_setzero_si64();
   
   for (i = 0; i < y_height; i++) {
     int j;
+    o = ptro;
+    ptro += po2_width * 4;
+    for (j = 0; j < y_width; j += 8) {
+
+      y = &ptry[j];
+      
+      ut = _m_from_int(*(int *)(ptru + j/2));
+      vt = _m_from_int(*(int *)(ptrv + j/2));
+
+      //ut = _m_from_int(0);
+      //vt = _m_from_int(0);
+      
+      ut = _m_punpcklbw(ut, zero);
+      vt = _m_punpcklbw(vt, zero);
+ 
+      /* subtract 128 from u and v */ 
+      imm = _mm_set1_pi16(128);
+      ut = _m_psubw(ut, imm);
+      vt = _m_psubw(vt, imm);
+
+      /* transfer and multiply into r, g, b registers */
+      imm = _mm_set1_pi16(-51);
+      g = _m_pmullw(ut, imm);
+      imm = _mm_set1_pi16(130);
+      b = _m_pmullw(ut, imm);
+      imm = _mm_set1_pi16(146);
+      r = _m_pmullw(vt, imm);
+      imm = _mm_set1_pi16(-74);
+      imm = _m_pmullw(vt, imm);
+      g = _m_paddsw(g, imm); 
+
+      /* add 64 to the r and g registers, and 32 to the b register */
+      imm = _mm_set1_pi16(64);
+      r = _m_paddsw(r, imm);
+      g = _m_paddsw(g, imm);
+      imm = _mm_set1_pi16(32);
+      b = _m_paddsw(b, imm);      
+
+      /* shift r, g and b registers to the right */
+      r = _m_psrawi(r, 7);
+      g = _m_psrawi(g, 7);
+      b = _m_psrawi(b, 6);
+  
+      /* subtract 16 from r, g and b registers */
+      imm = _mm_set1_pi16(16);
+      r = _m_psubsw(r, imm);
+      g = _m_psubsw(g, imm);
+      b = _m_psubsw(b, imm);
+
+      y = &ptry[j];
+      
+      /* duplicate u and v channels and add y
+       * each of r,g, b in the form [s1(16), s2(16), s3(16), s4(16)]
+       * first interleave, so tmp is [s1(16), s1(16), s2(16), s2(16)]
+       * then add y, then interleave again
+       * then pack with saturation, to get the desired output of
+       *   [s1(8), s1(8), s2(8), s2(8), s3(8), s3(8), s4(8), s4(8)]
+       */
+      tmp = _m_punpckhwd(r, r); 
+      imm = _m_punpckhbw(*y, zero);
+      //printf("tmp: %llx imm: %llx\n", tmp, imm);
+      tmp = _m_paddsw(tmp, imm);
+      tmp2 = _m_punpcklwd(r, r);
+      imm2 = _m_punpcklbw(*y, zero);
+      tmp2 = _m_paddsw(tmp2, imm2);
+      r = _m_packuswb(tmp2, tmp);
+      
+      tmp = _m_punpckhwd(g, g);
+      tmp2 = _m_punpcklwd(g, g);
+      tmp = _m_paddsw(tmp, imm);
+      tmp2 = _m_paddsw(tmp2, imm2);
+      g = _m_packuswb(tmp2, tmp);
+      
+      tmp = _m_punpckhwd(b, b);
+      tmp2 = _m_punpcklwd(b, b);
+      tmp = _m_paddsw(tmp, imm);
+      tmp2 = _m_paddsw(tmp2, imm2);
+      b = _m_packuswb(tmp2, tmp);
+      //printf("duplicated r g and b: %llx %llx %llx\n", r, g, b);
+      
+      /* now we have 8 8-bit r, g and b samples.  we want these to be packed
+       * into 32-bit values.
+       */
+      //r = _m_from_int(0);
+      //b = _m_from_int(0);
+      imm = _mm_set1_pi32(0xFFFFFFFF);
+      tmp = _m_punpcklbw(r, b);
+      tmp2 = _m_punpcklbw(g, imm);
+      *o++ = _m_punpcklbw(tmp, tmp2);
+      *o++ = _m_punpckhbw(tmp, tmp2);
+      //printf("tmp, tmp2, write1, write2: %llx %llx %llx %llx\n", tmp, tmp2, 
+      //                _m_punpcklbw(tmp, tmp2), _m_punpckhbw(tmp, tmp2));
+      tmp = _m_punpckhbw(r, b);
+      tmp2 = _m_punpckhbw(g, imm);
+      *o++ = _m_punpcklbw(tmp, tmp2);
+      *o++ = _m_punpckhbw(tmp, tmp2);
+
+      //exit(1);
+    }
+    if (i & 0x1) {
+      ptru += uv_width;
+      ptrv += uv_width;
+    }
+    ptry += y_width;
+  }
+  _m_empty();
+  
+#else
+  for (i = 0; i < y_height; i++) {
+    int j;
+    ptro2 = ptro;
     for (j = 0; j < y_width; j += 2) {
 
-      long pr, pg, pb;
+      short pr, pg, pb;
       short r, g, b;
       
-      pr = ((128 + (ptrv[j/2] - 128) * 292) >> 8) - 16; /* 1.14 * 256 */
-      pg = ((128 - (ptru[j/2] - 128) * 101 - (ptrv[j/2] - 128) * 149) >> 8)-16; 
-                                    /* 0.395 & 0.581 */
-      pb = ((128 + (ptru[j/2] - 128) * 520) >> 8) - 16; /* 2.032 */
+    //pr = ((128 + (ptrv[j/2] - 128) * 292) >> 8) - 16; /* 1.14 * 256 */
+      pr = (-41344 + ptrv[j/2] * 292) >> 8;
+    //pg = ((128 - (ptru[j/2] - 128) * 101 - (ptrv[j/2] - 128) * 149) >> 8)-16; 
+    //                                /* 0.395 & 0.581 */
+      pg = (28032 - ptru[j/2] * 101 - ptrv[j/2] * 149) >> 8;
+    //pb = ((128 + (ptru[j/2] - 128) * 520) >> 8) - 16; /* 2.032 */
+      pb = (-70528 + ptru[j/2] * 520) >> 8;
 
       r = ptry[j] + pr;
       g = ptry[j] + pg;
       b = ptry[j] + pb;
+
+      *ptro2++ = CLAMP(r);
+      *ptro2++ = CLAMP(g);
+      *ptro2++ = CLAMP(b);
       
-      ptro[j * 3] = CLAMP(r);
-      ptro[j * 3 + 1] = CLAMP(g);
-      ptro[j * 3 + 2] = CLAMP(b);
-      
       r = ptry[j + 1] + pr;
       g = ptry[j + 1] + pg;
       b = ptry[j + 1] + pb;
       
-      ptro[j * 3 + 3] = CLAMP(r);
-      ptro[j * 3 + 4] = CLAMP(g);
-      ptro[j * 3 + 5] = CLAMP(b);
+      *ptro2++ = CLAMP(r);
+      *ptro2++ = CLAMP(g);
+      *ptro2++ = CLAMP(b);
     }
     ptry += y_width;
-    if (i % 2) {
+    if (i & 1) {
       ptru += uv_width;
       ptrv += uv_width;
     }
@@ -145,7 +271,7 @@
   }
   
   
-
+#endif
  
 }
 
@@ -278,8 +404,8 @@
   if (texture_bits != NULL) 
   {
 
-    glTexImage2D(GL_TEXTURE_2D, 0, 3, texture_width, 
-                    texture_height, 0, GL_RGB, GL_UNSIGNED_BYTE, 
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texture_width, 
+                    texture_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 
                     texture_bits);
 
   }


-- 
shans



More information about the cvs-annodex mailing list