[Cvs-annodex] commit (annodex):
liboggplay/trunk/src/tests/Makefile.am
liboggplay/trunk/src/tests/glut-player.c
shans
nobody at lists.annodex.net
Wed Jan 10 04:44:06 UTC 2007
Update of /var/local/lib/svn/annodex (new revision 2596)
Modified files:
liboggplay/trunk/src/tests/Makefile.am
liboggplay/trunk/src/tests/glut-player.c
Log Message:
MMX implementation of YUV->RGB conversion
Modified: liboggplay/trunk/src/tests/Makefile.am
===================================================================
--- liboggplay/trunk/src/tests/Makefile.am 2007-01-03 00:07:41 UTC (rev 2595)
+++ liboggplay/trunk/src/tests/Makefile.am 2007-01-10 04:44:06 UTC (rev 2596)
@@ -1,6 +1,6 @@
## Process this file with automake to produce Makefile.in
-AM_CFLAGS = -Wall -pedantic
+AM_CFLAGS = -Wall -msse2 -march=pentium3 -std=c99
INCLUDES = -I$(top_srcdir)/include
Modified: liboggplay/trunk/src/tests/glut-player.c
===================================================================
--- liboggplay/trunk/src/tests/glut-player.c 2007-01-03 00:07:41 UTC (rev 2595)
+++ liboggplay/trunk/src/tests/glut-player.c 2007-01-10 04:44:06 UTC (rev 2596)
@@ -3,6 +3,7 @@
#include <oggplay/oggplay.h>
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <sndfile.h>
@@ -24,6 +25,8 @@
#include <semaphore.h>
+#include <xmmintrin.h>
+
static int n_frames = 0;
static GLuint texture;
@@ -52,10 +55,11 @@
OggPlayVideoData * video_data, int frame) {
int i;
- unsigned char * ptry;
- unsigned char * ptru;
- unsigned char * ptrv;
+ unsigned char * restrict ptry;
+ unsigned char * restrict ptru;
+ unsigned char * restrict ptrv;
unsigned char * ptro;
+ unsigned char * ptro2;
int y_width;
int y_height;
int uv_width;
@@ -84,7 +88,7 @@
if (texture_bits == NULL) {
- texture_bits = calloc(1, po2_width * po2_height * 3);
+ texture_bits = calloc(1, po2_width * po2_height * 4);
texture_width = po2_width;
texture_height = po2_height;
@@ -92,7 +96,7 @@
free(texture_bits);
- texture_bits = calloc(1, po2_width * po2_height * 3);
+ texture_bits = calloc(1, po2_width * po2_height * 4);
texture_width = po2_width;
texture_height = po2_height;
}
@@ -107,37 +111,159 @@
ptru = video_data->u;
ptrv = video_data->v;
ptro = texture_bits;
+#if 1
+
+ register __m64 *y, *o;
+ register __m64 zero, ut, vt, imm, imm2;
+ register __m64 r, g, b;
+ register __m64 tmp, tmp2;
+
+ zero = _mm_setzero_si64();
for (i = 0; i < y_height; i++) {
int j;
+ o = ptro;
+ ptro += po2_width * 4;
+ for (j = 0; j < y_width; j += 8) {
+
+ y = &ptry[j];
+
+ ut = _m_from_int(*(int *)(ptru + j/2));
+ vt = _m_from_int(*(int *)(ptrv + j/2));
+
+ //ut = _m_from_int(0);
+ //vt = _m_from_int(0);
+
+ ut = _m_punpcklbw(ut, zero);
+ vt = _m_punpcklbw(vt, zero);
+
+ /* subtract 128 from u and v */
+ imm = _mm_set1_pi16(128);
+ ut = _m_psubw(ut, imm);
+ vt = _m_psubw(vt, imm);
+
+ /* transfer and multiply into r, g, b registers */
+ imm = _mm_set1_pi16(-51);
+ g = _m_pmullw(ut, imm);
+ imm = _mm_set1_pi16(130);
+ b = _m_pmullw(ut, imm);
+ imm = _mm_set1_pi16(146);
+ r = _m_pmullw(vt, imm);
+ imm = _mm_set1_pi16(-74);
+ imm = _m_pmullw(vt, imm);
+ g = _m_paddsw(g, imm);
+
+ /* add 64 to r, g and b registers */
+ imm = _mm_set1_pi16(64);
+ r = _m_paddsw(r, imm);
+ g = _m_paddsw(g, imm);
+ imm = _mm_set1_pi16(32);
+ b = _m_paddsw(b, imm);
+
+ /* shift r, g and b registers to the right */
+ r = _m_psrawi(r, 7);
+ g = _m_psrawi(g, 7);
+ b = _m_psrawi(b, 6);
+
+ /* subtract 16 from r, g and b registers */
+ imm = _mm_set1_pi16(16);
+ r = _m_psubsw(r, imm);
+ g = _m_psubsw(g, imm);
+ b = _m_psubsw(b, imm);
+
+ y = &ptry[j];
+
+ /* duplicate u and v channels and add y
+ * each of r,g, b in the form [s1(16), s2(16), s3(16), s4(16)]
+ * first interleave, so tmp is [s1(16), s1(16), s2(16), s2(16)]
+ * then add y, then interleave again
+ * then pack with saturation, to get the desired output of
+ * [s1(8), s1(8), s2(8), s2(8), s3(8), s3(8), s4(8), s4(8)]
+ */
+ tmp = _m_punpckhwd(r, r);
+ imm = _m_punpckhbw(*y, zero);
+ //printf("tmp: %llx imm: %llx\n", tmp, imm);
+ tmp = _m_paddsw(tmp, imm);
+ tmp2 = _m_punpcklwd(r, r);
+ imm2 = _m_punpcklbw(*y, zero);
+ tmp2 = _m_paddsw(tmp2, imm2);
+ r = _m_packuswb(tmp2, tmp);
+
+ tmp = _m_punpckhwd(g, g);
+ tmp2 = _m_punpcklwd(g, g);
+ tmp = _m_paddsw(tmp, imm);
+ tmp2 = _m_paddsw(tmp2, imm2);
+ g = _m_packuswb(tmp2, tmp);
+
+ tmp = _m_punpckhwd(b, b);
+ tmp2 = _m_punpcklwd(b, b);
+ tmp = _m_paddsw(tmp, imm);
+ tmp2 = _m_paddsw(tmp2, imm2);
+ b = _m_packuswb(tmp2, tmp);
+ //printf("duplicated r g and b: %llx %llx %llx\n", r, g, b);
+
+ /* now we have 8 8-bit r, g and b samples. we want these to be packed
+ * into 32-bit values.
+ */
+ //r = _m_from_int(0);
+ //b = _m_from_int(0);
+ imm = _mm_set1_pi32(0xFFFFFFFF);
+ tmp = _m_punpcklbw(r, b);
+ tmp2 = _m_punpcklbw(g, imm);
+ *o++ = _m_punpcklbw(tmp, tmp2);
+ *o++ = _m_punpckhbw(tmp, tmp2);
+ //printf("tmp, tmp2, write1, write2: %llx %llx %llx %llx\n", tmp, tmp2,
+ // _m_punpcklbw(tmp, tmp2), _m_punpckhbw(tmp, tmp2));
+ tmp = _m_punpckhbw(r, b);
+ tmp2 = _m_punpckhbw(g, imm);
+ *o++ = _m_punpcklbw(tmp, tmp2);
+ *o++ = _m_punpckhbw(tmp, tmp2);
+
+ //exit(1);
+ }
+ if (i & 0x1) {
+ ptru += uv_width;
+ ptrv += uv_width;
+ }
+ ptry += y_width;
+ }
+ _m_empty();
+
+#else
+ for (i = 0; i < y_height; i++) {
+ int j;
+ ptro2 = ptro;
for (j = 0; j < y_width; j += 2) {
- long pr, pg, pb;
+ short pr, pg, pb;
short r, g, b;
- pr = ((128 + (ptrv[j/2] - 128) * 292) >> 8) - 16; /* 1.14 * 256 */
- pg = ((128 - (ptru[j/2] - 128) * 101 - (ptrv[j/2] - 128) * 149) >> 8)-16;
- /* 0.395 & 0.581 */
- pb = ((128 + (ptru[j/2] - 128) * 520) >> 8) - 16; /* 2.032 */
+ //pr = ((128 + (ptrv[j/2] - 128) * 292) >> 8) - 16; /* 1.14 * 256 */
+ pr = (-41344 + ptrv[j/2] * 292) >> 8;
+ //pg = ((128 - (ptru[j/2] - 128) * 101 - (ptrv[j/2] - 128) * 149) >> 8)-16;
+ // /* 0.395 & 0.581 */
+ pg = (28032 - ptru[j/2] * 101 - ptrv[j/2] * 149) >> 8;
+ //pb = ((128 + (ptru[j/2] - 128) * 520) >> 8) - 16; /* 2.032 */
+ pb = (-70528 + ptru[j/2] * 520) >> 8;
r = ptry[j] + pr;
g = ptry[j] + pg;
b = ptry[j] + pb;
+
+ *ptro2++ = CLAMP(r);
+ *ptro2++ = CLAMP(g);
+ *ptro2++ = CLAMP(b);
- ptro[j * 3] = CLAMP(r);
- ptro[j * 3 + 1] = CLAMP(g);
- ptro[j * 3 + 2] = CLAMP(b);
-
r = ptry[j + 1] + pr;
g = ptry[j + 1] + pg;
b = ptry[j + 1] + pb;
- ptro[j * 3 + 3] = CLAMP(r);
- ptro[j * 3 + 4] = CLAMP(g);
- ptro[j * 3 + 5] = CLAMP(b);
+ *ptro2++ = CLAMP(r);
+ *ptro2++ = CLAMP(g);
+ *ptro2++ = CLAMP(b);
}
ptry += y_width;
- if (i % 2) {
+ if (i & 1) {
ptru += uv_width;
ptrv += uv_width;
}
@@ -145,7 +271,7 @@
}
-
+#endif
}
@@ -278,8 +404,8 @@
if (texture_bits != NULL)
{
- glTexImage2D(GL_TEXTURE_2D, 0, 3, texture_width,
- texture_height, 0, GL_RGB, GL_UNSIGNED_BYTE,
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texture_width,
+ texture_height, 0, GL_RGBA, GL_UNSIGNED_BYTE,
texture_bits);
}
--
shans
More information about the cvs-annodex mailing list