[xiph-commits] r15557 - in branches/theora-thusnelda/lib/enc: . x86
xiphmont at svn.xiph.org
Thu Dec 4 21:41:07 PST 2008
Author: xiphmont
Date: 2008-12-04 21:41:06 -0800 (Thu, 04 Dec 2008)
New Revision: 15557
Added:
branches/theora-thusnelda/lib/enc/x86/
branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c
branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
branches/theora-thusnelda/lib/enc/x86/idct_mmx.c
branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
Log:
Commit unified x86 encoder asm
Added: branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,397 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: dct_decode_mmx.c 15078 2008-06-27 22:07:19Z xiphmont $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
+ 0x0003000300030003LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
+ 0x0004000400040004LL;
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ long esi;
+ _pix-=_ystride*2;
+ __asm__ __volatile__(
+ /*mm0=0*/
+ "pxor %%mm0,%%mm0\n\t"
+ /*esi=_ystride*3*/
+ "lea (%[ystride],%[ystride],2),%[s]\n\t"
+ /*mm7=_pix[0...8]*/
+ "movq (%[pix]),%%mm7\n\t"
+ /*mm4=_pix[0...8+_ystride*3]*/
+ "movq (%[pix],%[s]),%%mm4\n\t"
+ /*mm6=_pix[0...8]*/
+ "movq %%mm7,%%mm6\n\t"
+ /*Expand unsigned _pix[0...3] to 16 bits.*/
+ "punpcklbw %%mm0,%%mm6\n\t"
+ "movq %%mm4,%%mm5\n\t"
+ /*Expand unsigned _pix[4...8] to 16 bits.*/
+ "punpckhbw %%mm0,%%mm7\n\t"
+ /*Expand other arrays too.*/
+ "punpcklbw %%mm0,%%mm4\n\t"
+ "punpckhbw %%mm0,%%mm5\n\t"
+ /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/
+ "psubw %%mm4,%%mm6\n\t"
+ "psubw %%mm5,%%mm7\n\t"
+ /*mm5=mm4=_pix[0...8+_ystride]*/
+ "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/
+ "movq (%[pix],%[ystride],2),%%mm2\n\t"
+ "movq %%mm4,%%mm5\n\t"
+ "movq %%mm2,%%mm3\n\t"
+ "movq %%mm2,%%mm1\n\t"
+ /*Expand these arrays.*/
+ "punpckhbw %%mm0,%%mm5\n\t"
+ "punpcklbw %%mm0,%%mm4\n\t"
+ "punpckhbw %%mm0,%%mm3\n\t"
+ "punpcklbw %%mm0,%%mm2\n\t"
+ /*Preload...*/
+ "movq %[OC_V3],%%mm0\n\t"
+ /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+ "psubw %%mm5,%%mm3\n\t"
+ "psubw %%mm4,%%mm2\n\t"
+ /*Scale by 3.*/
+ "pmullw %%mm0,%%mm3\n\t"
+ "pmullw %%mm0,%%mm2\n\t"
+ /*Preload...*/
+ "movq %[OC_V4],%%mm0\n\t"
+ /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+ 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+ "paddw %%mm7,%%mm3\n\t"
+ "paddw %%mm6,%%mm2\n\t"
+ /*Add 4.*/
+ "paddw %%mm0,%%mm3\n\t"
+ "paddw %%mm0,%%mm2\n\t"
+ /*"Divide" by 8.*/
+ "psraw $3,%%mm3\n\t"
+ "psraw $3,%%mm2\n\t"
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+ /*Free up mm5.*/
+ "packuswb %%mm5,%%mm4\n\t"
+ /*mm0=L L L L*/
+ "movq (%[ll]),%%mm0\n\t"
+ /*if(R_i<-2L||R_i>2L)R_i=0:*/
+ "movq %%mm2,%%mm5\n\t"
+ "pxor %%mm6,%%mm6\n\t"
+ "movq %%mm0,%%mm7\n\t"
+ "psubw %%mm0,%%mm6\n\t"
+ "psllw $1,%%mm7\n\t"
+ "psllw $1,%%mm6\n\t"
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ "pcmpgtw %%mm2,%%mm7\n\t"
+ "pcmpgtw %%mm6,%%mm5\n\t"
+ "pand %%mm7,%%mm2\n\t"
+ "movq %%mm0,%%mm7\n\t"
+ "pand %%mm5,%%mm2\n\t"
+ "psllw $1,%%mm7\n\t"
+ "movq %%mm3,%%mm5\n\t"
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ "pcmpgtw %%mm3,%%mm7\n\t"
+ "pcmpgtw %%mm6,%%mm5\n\t"
+ "pand %%mm7,%%mm3\n\t"
+ "movq %%mm0,%%mm7\n\t"
+ "pand %%mm5,%%mm3\n\t"
+ /*if(R_i<-L)R_i'=R_i+2L;
+ if(R_i>L)R_i'=R_i-2L;
+ if(R_i<-L||R_i>L)R_i=-R_i':*/
+ "psraw $1,%%mm6\n\t"
+ "movq %%mm2,%%mm5\n\t"
+ "psllw $1,%%mm7\n\t"
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm5=R_i>L?FF:00*/
+ "pcmpgtw %%mm0,%%mm5\n\t"
+ /*mm6=-L>R_i?FF:00*/
+ "pcmpgtw %%mm2,%%mm6\n\t"
+ /*mm7=R_i>L?2L:0*/
+ "pand %%mm5,%%mm7\n\t"
+ /*mm2=R_i>L?R_i-2L:R_i*/
+ "psubw %%mm7,%%mm2\n\t"
+ "movq %%mm0,%%mm7\n\t"
+ /*mm5=-L>R_i||R_i>L*/
+ "por %%mm6,%%mm5\n\t"
+ "psllw $1,%%mm7\n\t"
+ /*mm7=-L>R_i?2L:0*/
+ "pand %%mm6,%%mm7\n\t"
+ "pxor %%mm6,%%mm6\n\t"
+ /*mm2=-L>R_i?R_i+2L:R_i*/
+ "paddw %%mm7,%%mm2\n\t"
+ "psubw %%mm0,%%mm6\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+ "pand %%mm2,%%mm5\n\t"
+ "movq %%mm0,%%mm7\n\t"
+ /*mm2=-L>R_i||R_i>L?0:R_i*/
+ "psubw %%mm5,%%mm2\n\t"
+ "psllw $1,%%mm7\n\t"
+ /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+ "psubw %%mm5,%%mm2\n\t"
+ "movq %%mm3,%%mm5\n\t"
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm6=-L>R_i?FF:00*/
+ "pcmpgtw %%mm3,%%mm6\n\t"
+ /*mm5=R_i>L?FF:00*/
+ "pcmpgtw %%mm0,%%mm5\n\t"
+ /*mm7=R_i>L?2L:0*/
+ "pand %%mm5,%%mm7\n\t"
+    /*mm3=R_i>L?R_i-2L:R_i*/
+ "psubw %%mm7,%%mm3\n\t"
+ "psllw $1,%%mm0\n\t"
+ /*mm5=-L>R_i||R_i>L*/
+ "por %%mm6,%%mm5\n\t"
+ /*mm0=-L>R_i?2L:0*/
+ "pand %%mm6,%%mm0\n\t"
+ /*mm3=-L>R_i?R_i+2L:R_i*/
+ "paddw %%mm0,%%mm3\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+ "pand %%mm3,%%mm5\n\t"
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+ "psubw %%mm5,%%mm3\n\t"
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+ "psubw %%mm5,%%mm3\n\t"
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+      saturation opcode, so we have to promote things back to 16 bits.*/
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm4,%%mm5\n\t"
+ "punpcklbw %%mm0,%%mm4\n\t"
+ "punpckhbw %%mm0,%%mm5\n\t"
+ "movq %%mm1,%%mm6\n\t"
+ "punpcklbw %%mm0,%%mm1\n\t"
+ "punpckhbw %%mm0,%%mm6\n\t"
+ /*_pix[0...8+_ystride]+=R_i*/
+ "paddw %%mm2,%%mm4\n\t"
+ "paddw %%mm3,%%mm5\n\t"
+ /*_pix[0...8+_ystride*2]-=R_i*/
+ "psubw %%mm2,%%mm1\n\t"
+ "psubw %%mm3,%%mm6\n\t"
+ "packuswb %%mm5,%%mm4\n\t"
+ "packuswb %%mm6,%%mm1\n\t"
+ /*Write it back out.*/
+ "movq %%mm4,(%[pix],%[ystride])\n\t"
+ "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+ :[s]"=&S"(esi)
+ :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
+ [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+ :"memory"
+ );
+}
+
+/*This code implements the bulk of loop_filter_h().
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+  four p0's into one register we must transpose the values across four mmx
+  regs.  Once the first half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
+ long esi;
+ long edi;
+ __asm__ __volatile__(
+ /*x x x x 3 2 1 0*/
+ "movd (%[pix]),%%mm0\n\t"
+ /*esi=_ystride*3*/
+ "lea (%[ystride],%[ystride],2),%[s]\n\t"
+ /*x x x x 7 6 5 4*/
+ "movd (%[pix],%[ystride]),%%mm1\n\t"
+ /*x x x x B A 9 8*/
+ "movd (%[pix],%[ystride],2),%%mm2\n\t"
+ /*x x x x F E D C*/
+ "movd (%[pix],%[s]),%%mm3\n\t"
+ /*mm0=7 3 6 2 5 1 4 0*/
+ "punpcklbw %%mm1,%%mm0\n\t"
+ /*mm2=F B E A D 9 C 8*/
+ "punpcklbw %%mm3,%%mm2\n\t"
+ /*mm1=7 3 6 2 5 1 4 0*/
+ "movq %%mm0,%%mm1\n\t"
+ /*mm0=F B 7 3 E A 6 2*/
+ "punpckhwd %%mm2,%%mm0\n\t"
+ /*mm1=D 9 5 1 C 8 4 0*/
+ "punpcklwd %%mm2,%%mm1\n\t"
+ "pxor %%mm7,%%mm7\n\t"
+ /*mm5=D 9 5 1 C 8 4 0*/
+ "movq %%mm1,%%mm5\n\t"
+ /*mm1=x C x 8 x 4 x 0==pix[0]*/
+ "punpcklbw %%mm7,%%mm1\n\t"
+ /*mm5=x D x 9 x 5 x 1==pix[1]*/
+ "punpckhbw %%mm7,%%mm5\n\t"
+ /*mm3=F B 7 3 E A 6 2*/
+ "movq %%mm0,%%mm3\n\t"
+ /*mm0=x E x A x 6 x 2==pix[2]*/
+ "punpcklbw %%mm7,%%mm0\n\t"
+ /*mm3=x F x B x 7 x 3==pix[3]*/
+ "punpckhbw %%mm7,%%mm3\n\t"
+ /*mm1=mm1-mm3==pix[0]-pix[3]*/
+ "psubw %%mm3,%%mm1\n\t"
+ /*Save a copy of pix[2] for later.*/
+ "movq %%mm0,%%mm4\n\t"
+ /*mm0=mm0-mm5==pix[2]-pix[1]*/
+ "psubw %%mm5,%%mm0\n\t"
+ /*Scale by 3.*/
+ "pmullw %[OC_V3],%%mm0\n\t"
+ /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+ "paddw %%mm1,%%mm0\n\t"
+ /*Add 4.*/
+ "paddw %[OC_V4],%%mm0\n\t"
+ /*"Divide" by 8, producing the residuals R_i.*/
+ "psraw $3,%%mm0\n\t"
+    /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/
+ /*mm6=L L L L*/
+ "movq (%[ll]),%%mm6\n\t"
+ /*if(R_i<-2L||R_i>2L)R_i=0:*/
+ "movq %%mm0,%%mm1\n\t"
+ "pxor %%mm2,%%mm2\n\t"
+ "movq %%mm6,%%mm3\n\t"
+ "psubw %%mm6,%%mm2\n\t"
+ "psllw $1,%%mm3\n\t"
+ "psllw $1,%%mm2\n\t"
+ /*mm0==R_3 R_2 R_1 R_0*/
+ /*mm1==R_3 R_2 R_1 R_0*/
+ /*mm2==-2L -2L -2L -2L*/
+ /*mm3==2L 2L 2L 2L*/
+ "pcmpgtw %%mm0,%%mm3\n\t"
+ "pcmpgtw %%mm2,%%mm1\n\t"
+ "pand %%mm3,%%mm0\n\t"
+ "pand %%mm1,%%mm0\n\t"
+ /*if(R_i<-L)R_i'=R_i+2L;
+ if(R_i>L)R_i'=R_i-2L;
+ if(R_i<-L||R_i>L)R_i=-R_i':*/
+ "psraw $1,%%mm2\n\t"
+ "movq %%mm0,%%mm1\n\t"
+ "movq %%mm6,%%mm3\n\t"
+ /*mm0==R_3 R_2 R_1 R_0*/
+ /*mm1==R_3 R_2 R_1 R_0*/
+ /*mm2==-L -L -L -L*/
+ /*mm6==L L L L*/
+ /*mm2=-L>R_i?FF:00*/
+ "pcmpgtw %%mm0,%%mm2\n\t"
+ /*mm1=R_i>L?FF:00*/
+ "pcmpgtw %%mm6,%%mm1\n\t"
+ /*mm3=2L 2L 2L 2L*/
+ "psllw $1,%%mm3\n\t"
+ /*mm6=2L 2L 2L 2L*/
+ "psllw $1,%%mm6\n\t"
+ /*mm3=R_i>L?2L:0*/
+ "pand %%mm1,%%mm3\n\t"
+ /*mm6=-L>R_i?2L:0*/
+ "pand %%mm2,%%mm6\n\t"
+ /*mm0=R_i>L?R_i-2L:R_i*/
+ "psubw %%mm3,%%mm0\n\t"
+ /*mm1=-L>R_i||R_i>L*/
+ "por %%mm2,%%mm1\n\t"
+ /*mm0=-L>R_i?R_i+2L:R_i*/
+ "paddw %%mm6,%%mm0\n\t"
+ /*mm1=-L>R_i||R_i>L?R_i':0*/
+ "pand %%mm0,%%mm1\n\t"
+ /*mm0=-L>R_i||R_i>L?0:R_i*/
+ "psubw %%mm1,%%mm0\n\t"
+ /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
+ "psubw %%mm1,%%mm0\n\t"
+ /*_pix[1]+=R_i;*/
+ "paddw %%mm0,%%mm5\n\t"
+ /*_pix[2]-=R_i;*/
+ "psubw %%mm0,%%mm4\n\t"
+ /*mm5=x x x x D 9 5 1*/
+ "packuswb %%mm7,%%mm5\n\t"
+ /*mm4=x x x x E A 6 2*/
+ "packuswb %%mm7,%%mm4\n\t"
+ /*mm5=E D A 9 6 5 2 1*/
+ "punpcklbw %%mm4,%%mm5\n\t"
+ /*edi=6 5 2 1*/
+ "movd %%mm5,%%edi\n\t"
+ "movw %%di,1(%[pix])\n\t"
+ /*Why is there such a big stall here?*/
+ "psrlq $32,%%mm5\n\t"
+ "shrl $16,%%edi\n\t"
+ "movw %%di,1(%[pix],%[ystride])\n\t"
+ /*edi=E D A 9*/
+ "movd %%mm5,%%edi\n\t"
+ "movw %%di,1(%[pix],%[ystride],2)\n\t"
+ "shrl $16,%%edi\n\t"
+ "movw %%di,1(%[pix],%[s])\n\t"
+ :[s]"=&S"(esi),[d]"=&D"(edi),
+ [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
+ :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+ :"memory"
+ );
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _pix-=2;
+ loop_filter_h4(_pix,_ystride,_ll);
+ loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+static void loop_filter_mmx(CP_INSTANCE *cpi, int FLimit){
+ int j;
+ ogg_int16_t __attribute__((aligned(8))) ll[4];
+ unsigned char *cp = cpi->frag_coded;
+ ogg_uint32_t *bp = cpi->frag_buffer_index;
+
+ if ( FLimit == 0 ) return;
+ ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
+
+ for ( j = 0; j < 3 ; j++){
+ ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_end = bp + cpi->frag_n[j];
+ int stride = cpi->stride[j];
+ int h = cpi->frag_h[j];
+
+ while(bp<bp_end){
+ ogg_uint32_t *bp_left = bp;
+ ogg_uint32_t *bp_right = bp + h;
+ while(bp<bp_right){
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&cpi->lastrecon[bp[0]],stride,ll);
+ if(bp_left>bp_begin)
+ loop_filter_v(&cpi->lastrecon[bp[0]],stride,ll);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,ll);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&cpi->lastrecon[bp[h]],stride,ll);
+ }
+ bp++;
+ cp++;
+ }
+ }
+ }
+
+  /*This needs to be removed when decode-specific functions are implemented:*/
+ __asm__ __volatile__("emms\n\t");
+}
+
+/* install our implementation in the function table */
+void dsp_mmx_dct_decode_init(DspFunctions *funcs)
+{
+ funcs->LoopFilter = loop_filter_mmx;
+}
+
+#endif /* USE_ASM */
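For reference, a scalar sketch of what the two filters above compute,
following the asm comments and Section 7.10 of the spec.  lflim() and
CLAMP255() are illustrative names, not part of this commit:

  static int lflim(int R,int L){
    if(R<=-2*L||R>=2*L)return 0;  /* |R| >= 2L: no filtering */
    if(R<-L)return -R-2*L;        /* R in (-2L,-L): ramp back toward 0 */
    if(R>L)return 2*L-R;          /* R in (L,2L): likewise */
    return R;                     /* |R| <= L: pass through */
  }

  /* one edge position; pix[0..3] straddle the edge (loop_filter_h is
     called with _pix-=2; for loop_filter_v the four values are _ystride
     apart and the caller does _pix-=_ystride*2) */
  int f=(pix[0]-pix[3]+3*(pix[2]-pix[1])+4)>>3;
  f=lflim(f,ll[0]);
  pix[1]=CLAMP255(pix[1]+f);  /* the packuswb in the MMX code saturates */
  pix[2]=CLAMP255(pix[2]-f);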
Added: branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,117 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if defined(USE_ASM)
+
+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
+
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+
+static void sub8x8__mmx (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
+ " movq %%mm0, (%2) \n\t" /* write answer out */
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %2 \n\t"
+ " add %3, %0 \n\t"
+ " add %3, %1 \n\t"
+ ".endr \n\t"
+
+ : "+r" (FiltPtr),
+ "+r" (ReconPtr),
+ "+r" (DctInputPtr)
+
+ : "r" ((unsigned long)PixelsPerLine)
+ : "memory"
+ );
+}
+
+static void sub8x8_128__mmx (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine)
+{
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+ " movq %[V128], %%mm1 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
+ " movq %%mm0, (%1) \n\t" /* write answer out */
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %1 \n\t"
+ " add %2, %0 \n\t"
+ ".endr \n\t"
+
+ : "+r" (FiltPtr),
+ "+r" (DctInputPtr)
+ : "r" ((unsigned long)PixelsPerLine),
+ [V128] "m" (V128)
+ : "memory"
+ );
+}
+
+static void restore_fpu (void)
+{
+ __asm__ __volatile__ (
+ " emms \n\t"
+ );
+}
+
+void dsp_mmx_init(DspFunctions *funcs)
+{
+ funcs->restore_fpu = restore_fpu;
+ funcs->sub8x8 = sub8x8__mmx;
+ funcs->sub8x8_128 = sub8x8_128__mmx;
+}
+
+#endif /* USE_ASM */
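A scalar model of the two routines above (semantics reconstructed from the
comments; sub8x8_c is an illustrative name, not part of this commit):

  static void sub8x8_c(const unsigned char *FiltPtr,const unsigned char *ReconPtr,
                       ogg_int16_t *DctInputPtr,ogg_uint32_t PixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++)DctInputPtr[j]=(ogg_int16_t)(FiltPtr[j]-ReconPtr[j]);
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr+=PixelsPerLine;
    }
  }

sub8x8_128 is the same with ReconPtr[j] replaced by the constant 128 (V128
above holds four copies of 0x0080), presumably for intra blocks.  The MMX
versions unpack each byte row against a zeroed mm7 to get 16-bit lanes
before subtracting, and ".rept 8" unrolls the row loop at assembly time.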
Added: branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,130 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: dsp_mmxext.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if defined(USE_ASM)
+
+static ogg_uint32_t sad8x8__mmxext (const unsigned char *ptr1, const unsigned char *ptr2,
+ ogg_uint32_t stride)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+ ".rept 7 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %2 \n\t" /* Inc pointer into ref data */
+ ".endr \n\t"
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " movd %%mm7, %0 \n\t"
+
+ : "=r" (DiffVal),
+ "+r" (ptr1),
+ "+r" (ptr2)
+ : "r" ((unsigned long)stride)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_thres__mmxext (const unsigned char *ptr1, const unsigned char *ptr2,
+ ogg_uint32_t stride, ogg_uint32_t thres)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %2 \n\t" /* Inc pointer into ref data */
+ ".endr \n\t"
+
+ " movd %%mm7, %0 \n\t"
+
+ : "=r" (DiffVal),
+ "+r" (ptr1),
+ "+r" (ptr2)
+ : "r" ((unsigned long)stride)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_xy2_thres__mmxext (const unsigned char *SrcData, const unsigned char *RefDataPtr1,
+ const unsigned char *RefDataPtr2, ogg_uint32_t Stride,
+ ogg_uint32_t thres)
+{
+ ogg_uint32_t DiffVal;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " movq (%3), %%mm2 \n\t"
+ " pavgb %%mm2, %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %4, %3 \n\t" /* Inc pointer into ref data */
+ ".endr \n\t"
+
+ " movd %%mm7, %0 \n\t"
+ : "=m" (DiffVal),
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
+ : "r" ((unsigned long)Stride)
+ : "memory"
+ );
+
+ return DiffVal;
+}
+
+void dsp_mmxext_init(DspFunctions *funcs)
+{
+ funcs->sad8x8 = sad8x8__mmxext;
+ funcs->sad8x8_thres = sad8x8_thres__mmxext;
+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
+}
+
+#endif /* USE_ASM */
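All three routines are 8x8 sums of absolute differences built on psadbw,
which is why they need MMXEXT.  A scalar model (illustrative, not part of
this commit):

  static ogg_uint32_t sad8x8_c(const unsigned char *ptr1,const unsigned char *ptr2,
                               ogg_uint32_t stride){
    ogg_uint32_t sad=0;
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++)sad+=abs(ptr1[j]-ptr2[j]);
      ptr1+=stride;
      ptr2+=stride;
    }
    return sad;
  }

sad8x8_xy2_thres first averages the two reference rows with pavgb (which
rounds up) before the SAD, i.e. it scores a half-pel predictor.  Note that
neither _thres variant actually uses its thres argument here; a C fallback
might early-out once the running total exceeds it, but these psadbw loops
simply run to completion.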
Added: branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,95 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if defined(USE_ASM)
+
+static int find_nonzero__sse2(ogg_int16_t *q, int in){
+ int ret,tmp,tmp2;
+
+ __asm__ (
+ ".balign 16 \n"
+ "movd %[in],%%xmm0\n"
+ "punpcklwd %%xmm0,%%xmm0\n"
+ "punpcklwd %%xmm0,%%xmm0\n"
+ "punpcklwd %%xmm0,%%xmm0\n"
+
+ "movdqu 64(%[quant]),%%xmm1\n"
+ "pcmpgtw %%xmm0,%%xmm1\n"
+ "movdqu 80(%[quant]),%%xmm2\n"
+ "pcmpgtw %%xmm0,%%xmm2\n"
+ "packsswb %%xmm2,%%xmm1\n"
+
+ "movdqu 96(%[quant]),%%xmm2\n"
+ "pcmpgtw %%xmm0,%%xmm2\n"
+ "movdqu 112(%[quant]),%%xmm3\n"
+ "pcmpgtw %%xmm0,%%xmm3\n"
+ "packsswb %%xmm3,%%xmm2\n"
+
+ "pmovmskb %%xmm1,%[ret]\n"
+ "pmovmskb %%xmm2,%[tmp]\n"
+ "shl $16,%[tmp]\n"
+ "or %[tmp],%[ret]\n"
+ "bsr %[ret],%[ret]\n"
+ "jz %=1f\n"
+ "add $33,%[ret]\n"
+ "jmp %=3f\n"
+
+ "%=1:\n"
+ "movdqu (%[quant]),%%xmm1\n"
+ "pcmpgtw %%xmm0,%%xmm1\n"
+ "movdqu 16(%[quant]),%%xmm2\n"
+ "pcmpgtw %%xmm0,%%xmm2\n"
+ "packsswb %%xmm2,%%xmm1\n"
+
+ "movdqu 32(%[quant]),%%xmm2\n"
+ "pcmpgtw %%xmm0,%%xmm2\n"
+ "movdqu 48(%[quant]),%%xmm3\n"
+ "pcmpgtw %%xmm0,%%xmm3\n"
+ "packsswb %%xmm3,%%xmm2\n"
+
+ "pmovmskb %%xmm1,%[ret]\n"
+ "pmovmskb %%xmm2,%[tmp]\n"
+ "shl $16,%[tmp]\n"
+ "or %[tmp],%[ret]\n"
+ "bsr %[ret],%[ret]\n"
+ "jz %=2f\n"
+ "inc %[ret]\n"
+ "jmp %=3f\n"
+
+ "%=2:\n"
+ "xor %[ret],%[ret]\n"
+
+ "%=3:\n"
+
+ :[ret]"=&r"(ret),[tmp]"=&r"(tmp),[tmp2]"=&r"(tmp2)
+ :[quant]"r"(q),[in]"r"(in)
+ );
+
+ return ret;
+}
+
+void dsp_sse2_init(DspFunctions *funcs)
+{
+ funcs->find_nonzero = find_nonzero__sse2;
+}
+
+#endif /* USE_ASM */
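Reading the compares above: xmm0 holds "in" broadcast to eight words, and
each pcmpgtw/packsswb/pmovmskb round collapses 16 coefficients into a
16-bit mask of the positions where q[i] > in; bsr then selects the highest
such position, with coefficients 32..63 tried first (hence the add of
33 = 32+1).  The "%=" in the jump targets expands to a number unique to
each asm instance, so the local labels stay distinct if the compiler
duplicates the code.  A scalar model of the apparent semantics
(illustrative, not part of this commit):

  static int find_nonzero_c(const ogg_int16_t *q,int in){
    int i;
    for(i=63;i>=0;i--)
      if(q[i]>in)return i+1;
    return 0;
  }

i.e. one plus the index of the last coefficient strictly greater than in,
or zero if there is none.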
Added: branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,335 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************/
+
+/* mmx fdct implementation for x86_64 */
+/* $Id: fdct_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ */
+
+#include "theora/theora.h"
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if defined(USE_ASM)
+
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
+
+/* execute stage 1 of forward DCT */
+#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
+ " movq " #ip0 ", %%mm0 \n\t" \
+ " movq " #ip1 ", %%mm1 \n\t" \
+ " movq " #ip3 ", %%mm2 \n\t" \
+ " movq " #ip5 ", %%mm3 \n\t" \
+ " movq %%mm0, %%mm4 \n\t" \
+ " movq %%mm1, %%mm5 \n\t" \
+ " movq %%mm2, %%mm6 \n\t" \
+ " movq %%mm3, %%mm7 \n\t" \
+ \
+ " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
+ " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
+ " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
+ " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
+ " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
+ " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
+ \
+ " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
+ \
+ " paddsw %%mm2, %%mm2 \n\t" \
+ \
+ " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
+ \
+ " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
+ " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
+ " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
+ " paddsw %%mm3, %%mm3 \n\t" \
+  " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + is56 = is1256 */ \
+ \
+ " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
+ /* ------------------------------------------------------------------- */ \
+ " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
+ " paddsw %%mm7, %%mm7 \n\t" \
+ " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
+ /* ------------------------------------------------------------------- */ \
+ " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
+ " paddsw %%mm3, %%mm3 \n\t" \
+ \
+ " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
+ " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
+ \
+ " pmulhw %[xC4S4], %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
+ " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
+ \
+ " movq %%mm3, %%mm2 \n\t" \
+ " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
+ \
+ " movq %%mm3, %%mm0 \n\t" \
+ " pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
+ \
+ " psrlw $15, %%mm2 \n\t" \
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
+ \
+ " movq %%mm3," #ip0 " \n\t" \
+ /* ------------------------------------------------------------------- */ \
+ " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
+ " pmulhw %[xC2S6], %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
+ \
+ " movq " #temp ", %%mm2 \n\t" \
+ " movq %%mm2, %%mm0 \n\t" \
+ \
+  " psrlw $15, %%mm2 \n\t" \
+  " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
+ " movq %%mm5, %%mm0 \n\t" \
+ \
+ " movq %%mm5, %%mm2 \n\t" \
+ " pmulhw %[xC6S2], %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
+ \
+ " psrlw $15, %%mm2 \n\t" \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
+ \
+ " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
+ " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
+ \
+ " movq %%mm5, %%mm0 \n\t" \
+ " movq %%mm5, %%mm2 \n\t" \
+ \
+ " pmulhw %[xC2S6], %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " movq " #temp ", %%mm3 \n\t" \
+ " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
+ \
+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
+ " movq %%mm3, %%mm2 \n\t" \
+ \
+ " pmulhw %[xC6S2], %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
+ " psubsw %%mm5, %%mm3 \n\t" \
+ \
+ " movq %%mm3," #ip6 " \n\t" \
+ /* ------------------------------------------------------------------- */ \
+ " movq %[xC4S4], %%mm0 \n\t" \
+ " movq %%mm1, %%mm2 \n\t" \
+ " movq %%mm1, %%mm3 \n\t" \
+ \
+  " pmulhw %%mm0, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+  " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
+ " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
+ \
+ " movq %%mm7, %%mm2 \n\t" \
+ " movq %%mm7, %%mm3 \n\t" \
+ \
+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
+ " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
+ /* ------------------------------------------------------------------- */ \
+ " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
+ " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
+ \
+  " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + icommon_product2 ) */ \
+ " paddsw %%mm6, %%mm6 \n\t" \
+ " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
+ \
+ " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
+ " paddsw %%mm1, %%mm1 \n\t" \
+ " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
+ /* ------------------------------------------------------------------- */ \
+ " movq %[xC1S7], %%mm7 \n\t" \
+ " movq %%mm1, %%mm2 \n\t" \
+ \
+ " movq %%mm1, %%mm3 \n\t" \
+ " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
+ \
+ " movq %[xC7S1], %%mm7 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
+  " paddw %%mm2, %%mm1 \n\t" /* Truncated */ \
+ \
+ " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
+ \
+ " movq %%mm0, %%mm5 \n\t" \
+ " movq %%mm0, %%mm2 \n\t" \
+ \
+ " movq %[xC1S7], %%mm7 \n\t" \
+ " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
+ \
+ " movq %[xC7S1], %%mm7 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
+ \
+ " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
+ \
+ " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
+  " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */ \
+ \
+ " movq %%mm1," #ip1 " \n\t" \
+ " movq %%mm3," #ip7 " \n\t" \
+ /* ------------------------------------------------------------------- */ \
+ " movq %[xC3S5], %%mm0 \n\t" \
+ " movq %[xC5S3], %%mm1 \n\t" \
+ \
+ " movq %%mm6, %%mm5 \n\t" \
+ " movq %%mm6, %%mm7 \n\t" \
+ \
+ " movq %%mm4, %%mm2 \n\t" \
+ " movq %%mm4, %%mm3 \n\t" \
+ \
+ " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
+ " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
+ \
+ " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm5 \n\t" \
+ \
+ " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
+ " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
+ " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
+ \
+ " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
+ " movq %%mm4," #ip3 " \n\t" \
+ \
+ " movq %%mm3, %%mm4 \n\t" \
+ " movq %%mm7, %%mm6 \n\t" \
+ \
+ " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm4 \n\t" \
+ " paddw %%mm5, %%mm6 \n\t" \
+ \
+ " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
+ \
+ " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
+ " movq %%mm3," #ip5 " \n\t"
+
+#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
+ " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
+ " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
+ " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
+ " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \
+ " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \
+ " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \
+ " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \
+ " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \
+ " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \
+ /* Transpose 2x8 block */ \
+ " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
+ " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
+ " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
+ " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
+ " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
+ " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
+ " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
+ " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
+ " movq %%mm4," #op4 " \n\t" \
+ " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
+ " movq %%mm5," #op5 " \n\t" \
+ " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
+ " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
+ " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
+ " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
+ " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
+ " movq %%mm6," #op7 " \n\t" \
+ " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
+ " movq %%mm1," #op6 " \n\t" \
+ " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
+ " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
+ " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
+ " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
+ " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
+ " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
+ " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
+ " movq %%mm0," #op0 " \n\t" \
+ " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
+ " movq %%mm1," #op1 " \n\t" \
+ " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
+ " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
+ " movq %%mm4," #op3 " \n\t" \
+ " movq %%mm2," #op2 " \n\t"
+
+
+/* This performs a 2D Forward DCT on an 8x8 block with short
+ coefficients. We try to do the truncation to match the C
+ version. */
+static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
+{
+ ogg_int16_t __attribute__((aligned(8))) temp[8*8];
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ /*
+   * Input data is an 8x8 block. To make processing of the data more efficient
+ * we will transpose the block of data to two 4x8 blocks???
+ */
+ Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
+
+ Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+ Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
+
+ Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+ " emms \n\t"
+
+ : "+r" (InputData),
+ "+r" (OutputData)
+ : "r" (temp),
+ [xC1S7] "m" (xC1S7), /* gcc 3.1+ allows named asm parameters */
+ [xC2S6] "m" (xC2S6),
+ [xC3S5] "m" (xC3S5),
+ [xC4S4] "m" (xC4S4),
+ [xC5S3] "m" (xC5S3),
+ [xC6S2] "m" (xC6S2),
+ [xC7S1] "m" (xC7S1)
+ : "memory"
+ );
+}
+
+/* install our implementation in the function table */
+void dsp_mmx_fdct_init(DspFunctions *funcs)
+{
+ funcs->fdct_short = fdct_short__mmx;
+}
+
+#endif /* USE_ASM */
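A note on the pmulhw idiom used throughout Fdct_mmx: each constant is
round(cos(k*pi/16)*2^16) broadcast to four 16-bit words (e.g. 0xB505 =
46341 ~ 2^16/sqrt(2)).  For k = 1..5 the value is >= 0x8000, so as a
signed word pmulhw actually multiplies by C - 2^16, yielding c*x - x; the
following paddw of the original operand restores c*x, and adding the
operand's sign bit (the psrlw $15 / paddw pairs) converts pmulhw's floor
into truncation toward zero -- that is what the "Truncated" comments mean.
xC6S2 and xC7S1 are below 0x8000, so they skip the add-back and need only
the sign-bit step.  A scalar model (illustrative, not from this commit;
assumes arithmetic right shift of negative values):

  static ogg_int16_t mulhi_trunc(ogg_int16_t x,ogg_uint16_t C){
    ogg_int32_t p=((ogg_int32_t)x*(ogg_int16_t)C)>>16; /* what pmulhw returns */
    if((ogg_int16_t)C<0)p+=x;       /* add back x: (c-1)*x + x = c*x */
    p+=(ogg_uint16_t)x>>15;         /* +1 when x<0: floor -> truncate */
    return (ogg_int16_t)p;
  }

fdct_short then performs transpose + 1-D DCT twice over each half of the
block, so rows and columns each get one pass, as the four
Transpose_mmx/Fdct_mmx pairs lay out.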
Added: branches/theora-thusnelda/lib/enc/x86/idct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/idct_mmx.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/idct_mmx.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,1128 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: idct_mmx.c 14783 2008-04-22 16:23:11Z xiphmont $
+
+ ********************************************************************/
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+#define MaskOffset 0 // 4 masks come in order low word to high
+#define CosineOffset 32 // 7 cosines come in order pi/16 * (1 ... 7)
+#define EightOffset 88
+#define IdctAdjustBeforeShift 8
+
+ogg_uint16_t idctconstants[(4+7+1) * 4] = {
+ 65535, 0, 0, 0, 0, 65535, 0, 0,
+ 0, 0, 65535, 0, 0, 0, 0, 65535,
+ 64277, 64277, 64277, 64277, 60547, 60547, 60547, 60547,
+ 54491, 54491, 54491, 54491, 46341, 46341, 46341, 46341,
+ 36410, 36410, 36410, 36410, 25080, 25080, 25080, 25080,
+ 12785, 12785, 12785, 12785, 8, 8, 8, 8,
+};
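(Layout, matching the offsets #defined above: bytes 0..31 are the four
one-hot word masks used by the de-zigzag shuffles; bytes 32..87 are
round(cos(k*pi/16)*2^16) for k = 1..7, each broadcast to four words --
note 46341 = round(2^16/sqrt(2)); bytes 88..95 hold the constant 8 added
before the final psraw $4, so each output is (x+8)>>4.)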
+
+/**************************************************************************************
+ *
+ * Routine: BeginIDCT
+ *
+ * Description: The Macro does IDct on 4 1-D Dcts
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+
+#define MtoSTR(s) #s
+
+#define BeginIDCT "#BeginIDCT\n" \
+ \
+ " movq "I(3)",%%mm2\n" \
+ \
+ " movq "C(3)",%%mm6\n" \
+ " movq %%mm2,%%mm4\n" \
+ " movq "J(5)",%%mm7\n" \
+ " pmulhw %%mm6,%%mm4\n" \
+ " movq "C(5)",%%mm1\n" \
+ " pmulhw %%mm7,%%mm6\n" \
+ " movq %%mm1,%%mm5\n" \
+ " pmulhw %%mm2,%%mm1\n" \
+ " movq "I(1)",%%mm3\n" \
+ " pmulhw %%mm7,%%mm5\n" \
+ " movq "C(1)",%%mm0\n" \
+ " paddw %%mm2,%%mm4\n" \
+ " paddw %%mm7,%%mm6\n" \
+ " paddw %%mm1,%%mm2\n" \
+ " movq "J(7)",%%mm1\n" \
+ " paddw %%mm5,%%mm7\n" \
+ " movq %%mm0,%%mm5\n" \
+ " pmulhw %%mm3,%%mm0\n" \
+ " paddsw %%mm7,%%mm4\n" \
+ " pmulhw %%mm1,%%mm5\n" \
+ " movq "C(7)",%%mm7\n" \
+ " psubsw %%mm2,%%mm6\n" \
+ " paddw %%mm3,%%mm0\n" \
+ " pmulhw %%mm7,%%mm3\n" \
+ " movq "I(2)",%%mm2\n" \
+ " pmulhw %%mm1,%%mm7\n" \
+ " paddw %%mm1,%%mm5\n" \
+ " movq %%mm2,%%mm1\n" \
+ " pmulhw "C(2)",%%mm2\n" \
+ " psubsw %%mm5,%%mm3\n" \
+ " movq "J(6)",%%mm5\n" \
+ " paddsw %%mm7,%%mm0\n" \
+ " movq %%mm5,%%mm7\n" \
+ " psubsw %%mm4,%%mm0\n" \
+ " pmulhw "C(2)",%%mm5\n" \
+ " paddw %%mm1,%%mm2\n" \
+ " pmulhw "C(6)",%%mm1\n" \
+ " paddsw %%mm4,%%mm4\n" \
+ " paddsw %%mm0,%%mm4\n" \
+ " psubsw %%mm6,%%mm3\n" \
+ " paddw %%mm7,%%mm5\n" \
+ " paddsw %%mm6,%%mm6\n" \
+ " pmulhw "C(6)",%%mm7\n" \
+ " paddsw %%mm3,%%mm6\n" \
+ " movq %%mm4,"I(1)"\n" \
+ " psubsw %%mm5,%%mm1\n" \
+ " movq "C(4)",%%mm4\n" \
+ " movq %%mm3,%%mm5\n" \
+ " pmulhw %%mm4,%%mm3\n" \
+ " paddsw %%mm2,%%mm7\n" \
+ " movq %%mm6,"I(2)"\n" \
+ " movq %%mm0,%%mm2\n" \
+ " movq "I(0)",%%mm6\n" \
+ " pmulhw %%mm4,%%mm0\n" \
+ " paddw %%mm3,%%mm5\n" \
+ "\n" \
+ " movq "J(4)",%%mm3\n" \
+ " psubsw %%mm1,%%mm5\n" \
+ " paddw %%mm0,%%mm2\n" \
+ " psubsw %%mm3,%%mm6\n" \
+ " movq %%mm6,%%mm0\n" \
+ " pmulhw %%mm4,%%mm6\n" \
+ " paddsw %%mm3,%%mm3\n" \
+ " paddsw %%mm1,%%mm1\n" \
+ " paddsw %%mm0,%%mm3\n" \
+ " paddsw %%mm5,%%mm1\n" \
+ " pmulhw %%mm3,%%mm4\n" \
+ " paddsw %%mm0,%%mm6\n" \
+ " psubsw %%mm2,%%mm6\n" \
+ " paddsw %%mm2,%%mm2\n" \
+ " movq "I(1)",%%mm0\n" \
+ " paddsw %%mm6,%%mm2\n" \
+ " paddw %%mm3,%%mm4\n" \
+ " psubsw %%mm1,%%mm2\n" \
+ "#end BeginIDCT\n"
+// end BeginIDCT macro (38 cycles).
+
+
+// Two versions of the end of the idct depending on whether we're feeding
+// into a transpose or dividing the final results by 16 and storing them.
+
+/**************************************************************************************
+ *
+ * Routine: RowIDCT
+ *
+ * Description: The Macro does 1-D IDct on 4 Rows
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+
+// RowIDCT gets ready to transpose.
+
+#define RowIDCT "#RowIDCT\n" \
+ BeginIDCT \
+ "\n" \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw %%mm3,%%mm3\n" \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " paddsw %%mm5,%%mm5\n" \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw %%mm0,%%mm0\n" \
+ " movq %%mm1,"I(1)"\n" /* save R1 */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ "#end RowIDCT"
+
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+
+/**************************************************************************************
+ *
+ * Routine: ColumnIDCT
+ *
+ * Description: The Macro does 1-D IDct on 4 columns
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+// Column IDCT normalizes and stores final results.
+
+#define ColumnIDCT "#ColumnIDCT\n" \
+ BeginIDCT \
+ "\n" \
+ " paddsw "Eight",%%mm2\n" \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+  " psraw $4,%%mm2\n" /* r2 = NR2 */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+  " psraw $4,%%mm1\n" /* r1 = NR1 */ \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " movq %%mm2,"I(2)"\n" /* store NR2 at I2 */ \
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " movq %%mm1,"I(1)"\n" /* store NR1 at I1 */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "Eight",%%mm4\n" \
+ " paddsw %%mm3,%%mm3\n" /* r3 = D. + D. */ \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+  " psraw $4,%%mm4\n" /* r4 = NR4 */ \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+  " psraw $4,%%mm3\n" /* r3 = NR3 */ \
+ " paddsw "Eight",%%mm6\n" \
+ " paddsw %%mm5,%%mm5\n" /* r5 = B.. + B.. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+  " psraw $4,%%mm6\n" /* r6 = NR6 */ \
+ " movq %%mm4,"J(4)"\n" /* store NR4 at J4 */ \
+  " psraw $4,%%mm5\n" /* r5 = NR5 */ \
+ " movq %%mm3,"I(3)"\n" /* store NR3 at I3 */ \
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "Eight",%%mm7\n" \
+ " paddsw %%mm0,%%mm0\n" /* r0 = C. + C. */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+  " psraw $4,%%mm7\n" /* r7 = NR7 */ \
+ " movq %%mm6,"J(6)"\n" /* store NR6 at J6 */ \
+  " psraw $4,%%mm0\n" /* r0 = NR0 */ \
+ " movq %%mm5,"J(5)"\n" /* store NR5 at J5 */ \
+ " movq %%mm7,"J(7)"\n" /* store NR7 at J7 */ \
+ " movq %%mm0,"I(0)"\n" /* store NR0 at I0 */ \
+ "#end ColumnIDCT\n"
+
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+
+/**************************************************************************************
+ *
+ * Routine: Transpose
+ *
+ * Description: The Macro does two 4x4 transposes in place.
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+
+/* Following macro does two 4x4 transposes in place.
+
+ At entry (we assume):
+
+ r0 = a3 a2 a1 a0
+ I(1) = b3 b2 b1 b0
+ r2 = c3 c2 c1 c0
+ r3 = d3 d2 d1 d0
+
+ r4 = e3 e2 e1 e0
+ r5 = f3 f2 f1 f0
+ r6 = g3 g2 g1 g0
+ r7 = h3 h2 h1 h0
+
+ At exit, we have:
+
+ I(0) = d0 c0 b0 a0
+ I(1) = d1 c1 b1 a1
+ I(2) = d2 c2 b2 a2
+ I(3) = d3 c3 b3 a3
+
+ J(4) = h0 g0 f0 e0
+ J(5) = h1 g1 f1 e1
+ J(6) = h2 g2 f2 e2
+ J(7) = h3 g3 f3 e3
+
+ I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+ J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
+
+ Since r1 is free at entry, we calculate the Js first. */
+
+
+#define Transpose "#Transpose\n" \
+ " movq %%mm4,%%mm1\n" \
+ " punpcklwd %%mm5,%%mm4\n" \
+ " movq %%mm0,"I(0)"\n" \
+ " punpckhwd %%mm5,%%mm1\n" \
+ " movq %%mm6,%%mm0\n" \
+ " punpcklwd %%mm7,%%mm6\n" \
+ " movq %%mm4,%%mm5\n" \
+ " punpckldq %%mm6,%%mm4\n" \
+ " punpckhdq %%mm6,%%mm5\n" \
+ " movq %%mm1,%%mm6\n" \
+ " movq %%mm4,"J(4)"\n" \
+ " punpckhwd %%mm7,%%mm0\n" \
+ " movq %%mm5,"J(5)"\n" \
+ " punpckhdq %%mm0,%%mm6\n" \
+ " movq "I(0)",%%mm4\n" \
+ " punpckldq %%mm0,%%mm1\n" \
+ " movq "I(1)",%%mm5\n" \
+ " movq %%mm4,%%mm0\n" \
+ " movq %%mm6,"J(7)"\n" \
+ " punpcklwd %%mm5,%%mm0\n" \
+ " movq %%mm1,"J(6)"\n" \
+ " punpckhwd %%mm5,%%mm4\n" \
+ " movq %%mm2,%%mm5\n" \
+ " punpcklwd %%mm3,%%mm2\n" \
+ " movq %%mm0,%%mm1\n" \
+ " punpckldq %%mm2,%%mm0\n" \
+ " punpckhdq %%mm2,%%mm1\n" \
+ " movq %%mm4,%%mm2\n" \
+ " movq %%mm0,"I(0)"\n" \
+ " punpckhwd %%mm3,%%mm5\n" \
+ " movq %%mm1,"I(1)"\n" \
+ " punpckhdq %%mm5,%%mm4\n" \
+ " punpckldq %%mm5,%%mm2\n" \
+ \
+ " movq %%mm4,"I(3)"\n" \
+ \
+ " movq %%mm2,"I(2)"\n" \
+ "#end Transpose\n"
+// end Transpose macro (19 cycles).
+
+/**************************************************************************************
+ *
+ * Routine: MMX_idct
+ *
+ * Description: Perform IDCT on an 8x8 block
+ *
+ * Input: Pointer to input and output buffer
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: The input coefficients are in ZigZag order
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+void IDctSlow__mmx(const ogg_int16_t *in,
+ const ogg_int16_t *q,
+ ogg_int16_t *out ) {
+
+# define MID(M,I) MtoSTR(M+(I)*8)"(%[c])"
+# define M(I) MID( MaskOffset , I )
+# define C(I) MID( CosineOffset , I-1 )
+# define Eight MID(EightOffset,0)
+
+  /* %[i] = quantized input */
+  /* %[q] = quantization table */
+  /* %[o] = destination (= idct buffer) */
+  /* %[c] = idctconstants */
+
+
+ __asm__ __volatile__ (
+ "# dequantize, de-zigzag\n"
+ "movq (%[i]), %%mm0\n"
+ "pmullw (%[q]), %%mm0\n" /* r0 = 03 02 01 00 */
+ "movq 16(%[i]), %%mm1\n"
+ "pmullw 16(%[q]), %%mm1\n" /* r1 = 13 12 11 10 */
+ "movq "M(0)", %%mm2\n" /* r2 = __ __ __ FF */
+ "movq %%mm0, %%mm3\n" /* r3 = 03 02 01 00 */
+ "movq 8(%[i]), %%mm4\n"
+ "psrlq $16, %%mm0\n" /* r0 = __ 03 02 01 */
+ "pmullw 8(%[q]), %%mm4\n" /* r4 = 07 06 05 04 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 00 */
+ "movq %%mm0, %%mm5\n" /* r5 = __ 03 02 01 */
+ "movq %%mm1, %%mm6\n" /* r6 = 13 12 11 10 */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 01 */
+ "psllq $32, %%mm6\n" /* r6 = 11 10 __ __ */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pxor %%mm5, %%mm0\n" /* r0 = __ 03 02 __ */
+ "pand %%mm6, %%mm7\n" /* r7 = 11 __ __ __ */
+ "por %%mm3, %%mm0\n" /* r0 = __ 03 02 00 */
+ "pxor %%mm7, %%mm6\n" /* r6 = __ 10 __ __ */
+ "por %%mm7, %%mm0\n" /* r0 = 11 03 02 00 = R0 */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "movq %%mm4, %%mm3\n" /* r3 = 07 06 05 04 */
+ "movq %%mm0, (%[o])\n" /* write R0 = r0 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 04 */
+ "movq 32(%[i]), %%mm0\n"
+ "psllq $16, %%mm3\n" /* r3 = __ __ 04 __ */
+ "pmullw 32(%[q]), %%mm0\n" /* r0 = 23 22 21 20 */
+ "pand %%mm1, %%mm7\n" /* r7 = 13 __ __ __ */
+ "por %%mm3, %%mm5\n" /* r5 = __ __ 04 01 */
+ "por %%mm6, %%mm7\n" /* r7 = 13 10 __ __ */
+ "movq 24(%[i]), %%mm3\n"
+ "por %%mm5, %%mm7\n" /* r7 = 13 10 04 01 = R1 */
+ "pmullw 24(%[q]), %%mm3\n" /* r3 = 17 16 15 14 */
+ "psrlq $16, %%mm4\n" /* r4 = __ 07 06 05 */
+ "movq %%mm7, 16(%[o])\n" /* write R1 = r7 */
+ "movq %%mm4, %%mm5\n" /* r5 = __ 07 06 05 */
+ "movq %%mm0, %%mm7\n" /* r7 = 23 22 21 20 */
+ "psrlq $16, %%mm4\n" /* r4 = __ __ 07 06 */
+ "psrlq $48, %%mm7\n" /* r7 = __ __ __ 23 */
+ "movq %%mm2, %%mm6\n" /* r6 = __ __ __ FF */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 05 */
+ "pand %%mm4, %%mm6\n" /* r6 = __ __ __ 06 */
+ "movq %%mm7, 80(%[o])\n" /* partial R9 = __ __ __ 23 */
+ "pxor %%mm6, %%mm4\n" /* r4 = __ __ 07 __ */
+ "psrlq $32, %%mm1\n" /* r1 = __ __ 13 12 */
+ "por %%mm5, %%mm4\n" /* r4 = __ __ 07 05 */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pand %%mm2, %%mm1\n" /* r1 = __ __ __ 12 */
+ "movq 48(%[i]), %%mm5\n"
+ "psllq $16, %%mm0\n" /* r0 = 22 21 20 __ */
+ "pmullw 48(%[q]), %%mm5\n" /* r5 = 33 32 31 30 */
+ "pand %%mm0, %%mm7\n" /* r7 = 22 __ __ __ */
+ "movq %%mm1, 64(%[o])\n" /* partial R8 = __ __ __ 12 */
+ "por %%mm4, %%mm7\n" /* r7 = 22 __ 07 05 */
+ "movq %%mm3, %%mm4\n" /* r4 = 17 16 15 14 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 14 */
+ "movq "M(2)", %%mm1\n" /* r1 = __ FF __ __ */
+ "psllq $32, %%mm3\n" /* r3 = __ 14 __ __ */
+ "por %%mm3, %%mm7\n" /* r7 = 22 14 07 05 = R2 */
+ "movq %%mm5, %%mm3\n" /* r3 = 33 32 31 30 */
+ "psllq $48, %%mm3\n" /* r3 = 30 __ __ __ */
+ "pand %%mm0, %%mm1\n" /* r1 = __ 21 __ __ */
+ "movq %%mm7, 32(%[o])\n" /* write R2 = r7 */
+ "por %%mm3, %%mm6\n" /* r6 = 30 __ __ 06 */
+ "movq "M(1)", %%mm7\n" /* r7 = __ __ FF __ */
+ "por %%mm1, %%mm6\n" /* r6 = 30 21 __ 06 */
+ "movq 56(%[i]), %%mm1\n"
+ "pand %%mm4, %%mm7\n" /* r7 = __ __ 15 __ */
+ "pmullw 56(%[q]), %%mm1\n" /* r1 = 37 36 35 34 */
+ "por %%mm6, %%mm7\n" /* r7 = 30 21 15 06 = R3 */
+ "pand "M(1)", %%mm0\n" /* r0 = __ __ 20 __ */
+ "psrlq $32, %%mm4\n" /* r4 = __ __ 17 16 */
+ "movq %%mm7, 48(%[o])\n" /* write R3 = r7 */
+ "movq %%mm4, %%mm6\n" /* r6 = __ __ 17 16 */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pand %%mm2, %%mm4\n" /* r4 = __ __ __ 16 */
+ "movq "M(1)", %%mm3\n" /* r3 = __ __ FF __ */
+ "pand %%mm1, %%mm7\n" /* r7 = 37 __ __ __ */
+ "pand %%mm5, %%mm3\n" /* r3 = __ __ 31 __ */
+ "por %%mm4, %%mm0\n" /* r0 = __ __ 20 16 */
+ "psllq $16, %%mm3\n" /* r3 = __ 31 __ __ */
+ "por %%mm0, %%mm7\n" /* r7 = 37 __ 20 16 */
+ "movq "M(2)", %%mm4\n" /* r4 = __ FF __ __ */
+ "por %%mm3, %%mm7\n" /* r7 = 37 31 20 16 = R4 */
+ "movq 80(%[i]), %%mm0\n"
+ "movq %%mm4, %%mm3\n" /* r3 = __ __ FF __ */
+ "pmullw 80(%[q]), %%mm0\n" /* r0 = 53 52 51 50 */
+ "pand %%mm5, %%mm4\n" /* r4 = __ 32 __ __ */
+ "movq %%mm7, 8(%[o])\n" /* write R4 = r7 */
+ "por %%mm4, %%mm6\n" /* r6 = __ 32 17 16 */
+ "movq %%mm3, %%mm4\n" /* r4 = __ FF __ __ */
+ "psrlq $16, %%mm6\n" /* r6 = __ __ 32 17 */
+ "movq %%mm0, %%mm7\n" /* r7 = 53 52 51 50 */
+ "pand %%mm1, %%mm4\n" /* r4 = __ 36 __ __ */
+ "psllq $48, %%mm7\n" /* r7 = 50 __ __ __ */
+ "por %%mm4, %%mm6\n" /* r6 = __ 36 32 17 */
+ "movq 88(%[i]), %%mm4\n"
+ "por %%mm6, %%mm7\n" /* r7 = 50 36 32 17 = R5 */
+ "pmullw 88(%[q]), %%mm4\n" /* r4 = 57 56 55 54 */
+ "psrlq $16, %%mm3\n" /* r3 = __ __ FF __ */
+ "movq %%mm7, 24(%[o])\n" /* write R5 = r7 */
+ "pand %%mm1, %%mm3\n" /* r3 = __ __ 35 __ */
+ "psrlq $48, %%mm5\n" /* r5 = __ __ __ 33 */
+ "pand %%mm2, %%mm1\n" /* r1 = __ __ __ 34 */
+ "movq 104(%[i]), %%mm6\n"
+ "por %%mm3, %%mm5\n" /* r5 = __ __ 35 33 */
+ "pmullw 104(%[q]), %%mm6\n" /* r6 = 67 66 65 64 */
+ "psrlq $16, %%mm0\n" /* r0 = __ 53 52 51 */
+ "movq %%mm4, %%mm7\n" /* r7 = 57 56 55 54 */
+ "movq %%mm2, %%mm3\n" /* r3 = __ __ __ FF */
+ "psllq $48, %%mm7\n" /* r7 = 54 __ __ __ */
+ "pand %%mm0, %%mm3\n" /* r3 = __ __ __ 51 */
+ "pxor %%mm3, %%mm0\n" /* r0 = __ 53 52 __ */
+ "psllq $32, %%mm3\n" /* r3 = __ 51 __ __ */
+ "por %%mm5, %%mm7\n" /* r7 = 54 __ 35 33 */
+ "movq %%mm6, %%mm5\n" /* r5 = 67 66 65 64 */
+ "pand "M(1)", %%mm6\n" /* r6 = __ __ 65 __ */
+ "por %%mm3, %%mm7\n" /* r7 = 54 51 35 33 = R6 */
+ "psllq $32, %%mm6\n" /* r6 = 65 __ __ __ */
+ "por %%mm1, %%mm0\n" /* r0 = __ 53 52 34 */
+ "movq %%mm7, 40(%[o])\n" /* write R6 = r7 */
+ "por %%mm6, %%mm0\n" /* r0 = 65 53 52 34 = R7 */
+ "movq 120(%[i]), %%mm7\n"
+ "movq %%mm5, %%mm6\n" /* r6 = 67 66 65 64 */
+ "pmullw 120(%[q]), %%mm7\n" /* r7 = 77 76 75 74 */
+ "psrlq $32, %%mm5\n" /* r5 = __ __ 67 66 */
+ "pand %%mm2, %%mm6\n" /* r6 = __ __ __ 64 */
+ "movq %%mm5, %%mm1\n" /* r1 = __ __ 67 66 */
+ "movq %%mm0, 56(%[o])\n" /* write R7 = r0 */
+ "pand %%mm2, %%mm1\n" /* r1 = __ __ __ 66 */
+ "movq 112(%[i]), %%mm0\n"
+ "movq %%mm7, %%mm3\n" /* r3 = 77 76 75 74 */
+ "pmullw 112(%[q]), %%mm0\n" /* r0 = 73 72 71 70 */
+ "psllq $16, %%mm3\n" /* r3 = 76 75 74 __ */
+ "pand "M(3)", %%mm7\n" /* r7 = 77 __ __ __ */
+ "pxor %%mm1, %%mm5\n" /* r5 = __ __ 67 __ */
+ "por %%mm5, %%mm6\n" /* r6 = __ __ 67 64 */
+ "movq %%mm3, %%mm5\n" /* r5 = 76 75 74 __ */
+ "pand "M(3)", %%mm5\n" /* r5 = 76 __ __ __ */
+ "por %%mm1, %%mm7\n" /* r7 = 77 __ __ 66 */
+ "movq 96(%[i]), %%mm1\n"
+ "pxor %%mm5, %%mm3\n" /* r3 = __ 75 74 __ */
+ "pmullw 96(%[q]), %%mm1\n" /* r1 = 63 62 61 60 */
+ "por %%mm3, %%mm7\n" /* r7 = 77 75 74 66 = R15 */
+ "por %%mm5, %%mm6\n" /* r6 = 76 __ 67 64 */
+ "movq %%mm0, %%mm5\n" /* r5 = 73 72 71 70 */
+ "movq %%mm7, 120(%[o])\n" /* store R15 = r7 */
+ "psrlq $16, %%mm5\n" /* r5 = __ 73 72 71 */
+ "pand "M(2)", %%mm5\n" /* r5 = __ 73 __ __ */
+ "movq %%mm0, %%mm7\n" /* r7 = 73 72 71 70 */
+ "por %%mm5, %%mm6\n" /* r6 = 76 73 67 64 = R14 */
+ "pand %%mm2, %%mm0\n" /* r0 = __ __ __ 70 */
+ "pxor %%mm0, %%mm7\n" /* r7 = 73 72 71 __ */
+ "psllq $32, %%mm0\n" /* r0 = __ 70 __ __ */
+ "movq %%mm6, 104(%[o])\n" /* write R14 = r6 */
+ "psrlq $16, %%mm4\n" /* r4 = __ 57 56 55 */
+ "movq 72(%[i]), %%mm5\n"
+ "psllq $16, %%mm7\n" /* r7 = 72 71 __ __ */
+ "pmullw 72(%[q]), %%mm5\n" /* r5 = 47 46 45 44 */
+ "movq %%mm7, %%mm6\n" /* r6 = 72 71 __ __ */
+ "movq "M(2)", %%mm3\n" /* r3 = __ FF __ __ */
+ "psllq $16, %%mm6\n" /* r6 = 71 __ __ __ */
+ "pand "M(3)", %%mm7\n" /* r7 = 72 __ __ __ */
+ "pand %%mm1, %%mm3\n" /* r3 = __ 62 __ __ */
+ "por %%mm0, %%mm7\n" /* r7 = 72 70 __ __ */
+ "movq %%mm1, %%mm0\n" /* r0 = 63 62 61 60 */
+ "pand "M(3)", %%mm1\n" /* r1 = 63 __ __ __ */
+ "por %%mm3, %%mm6\n" /* r6 = 71 62 __ __ */
+ "movq %%mm4, %%mm3\n" /* r3 = __ 57 56 55 */
+ "psrlq $32, %%mm1\n" /* r1 = __ __ 63 __ */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 55 */
+ "por %%mm1, %%mm7\n" /* r7 = 72 70 63 __ */
+ "por %%mm3, %%mm7\n" /* r7 = 72 70 63 55 = R13 */
+ "movq %%mm4, %%mm3\n" /* r3 = __ 57 56 55 */
+ "pand "M(1)", %%mm3\n" /* r3 = __ __ 56 __ */
+ "movq %%mm5, %%mm1\n" /* r1 = 47 46 45 44 */
+ "movq %%mm7, 88(%[o])\n" /* write R13 = r7 */
+ "psrlq $48, %%mm5\n" /* r5 = __ __ __ 47 */
+ "movq 64(%[i]), %%mm7\n"
+ "por %%mm3, %%mm6\n" /* r6 = 71 62 56 __ */
+ "pmullw 64(%[q]), %%mm7\n" /* r7 = 43 42 41 40 */
+ "por %%mm5, %%mm6\n" /* r6 = 71 62 56 47 = R12 */
+ "pand "M(2)", %%mm4\n" /* r4 = __ 57 __ __ */
+ "psllq $32, %%mm0\n" /* r0 = 61 60 __ __ */
+ "movq %%mm6, 72(%[o])\n" /* write R12 = r6 */
+ "movq %%mm0, %%mm6\n" /* r6 = 61 60 __ __ */
+ "pand "M(3)", %%mm0\n" /* r0 = 61 __ __ __ */
+ "psllq $16, %%mm6\n" /* r6 = 60 __ __ __ */
+ "movq 40(%[i]), %%mm5\n"
+ "movq %%mm1, %%mm3\n" /* r3 = 47 46 45 44 */
+ "pmullw 40(%[q]), %%mm5\n" /* r5 = 27 26 25 24 */
+ "psrlq $16, %%mm1\n" /* r1 = __ 47 46 45 */
+ "pand "M(1)", %%mm1\n" /* r1 = __ __ 46 __ */
+ "por %%mm4, %%mm0\n" /* r0 = 61 57 __ __ */
+ "pand %%mm7, %%mm2\n" /* r2 = __ __ __ 40 */
+ "por %%mm1, %%mm0\n" /* r0 = 61 57 46 __ */
+ "por %%mm2, %%mm0\n" /* r0 = 61 57 46 40 = R11 */
+ "psllq $16, %%mm3\n" /* r3 = 46 45 44 __ */
+ "movq %%mm3, %%mm4\n" /* r4 = 46 45 44 __ */
+ "movq %%mm5, %%mm2\n" /* r2 = 27 26 25 24 */
+ "movq %%mm0, 112(%[o])\n" /* write R11 = r0 */
+ "psrlq $48, %%mm2\n" /* r2 = __ __ __ 27 */
+ "pand "M(2)", %%mm4\n" /* r4 = __ 45 __ __ */
+ "por %%mm2, %%mm6\n" /* r6 = 60 __ __ 27 */
+ "movq "M(1)", %%mm2\n" /* r2 = __ __ FF __ */
+ "por %%mm4, %%mm6\n" /* r6 = 60 45 __ 27 */
+ "pand %%mm7, %%mm2\n" /* r2 = __ __ 41 __ */
+ "psllq $32, %%mm3\n" /* r3 = 44 __ __ __ */
+ "por 80(%[o]), %%mm3\n" /* r3 = 44 __ __ 23 */
+ "por %%mm2, %%mm6\n" /* r6 = 60 45 41 27 = R10 */
+ "movq "M(3)", %%mm2\n" /* r2 = FF __ __ __ */
+ "psllq $16, %%mm5\n" /* r5 = 26 25 24 __ */
+ "movq %%mm6, 96(%[o])\n" /* store R10 = r6 */
+ "pand %%mm5, %%mm2\n" /* r2 = 26 __ __ __ */
+ "movq "M(2)", %%mm6\n" /* r6 = __ FF __ __ */
+ "pxor %%mm2, %%mm5\n" /* r5 = __ 25 24 __ */
+ "pand %%mm7, %%mm6\n" /* r6 = __ 42 __ __ */
+ "psrlq $32, %%mm2\n" /* r2 = __ __ 26 __ */
+ "pand "M(3)", %%mm7\n" /* r7 = 43 __ __ __ */
+ "por %%mm2, %%mm3\n" /* r3 = 44 __ 26 23 */
+ "por 64(%[o]), %%mm7\n" /* r7 = 43 __ __ 12 */
+ "por %%mm3, %%mm6\n" /* r6 = 44 42 26 23 = R9 */
+ "por %%mm5, %%mm7\n" /* r7 = 43 25 24 12 = R8 */
+ "movq %%mm6, 80(%[o])\n" /* store R9 = r6 */
+ "movq %%mm7, 64(%[o])\n" /* store R8 = r7 */
+
+ /* 123c ( / 64 coeffs < 2c / coeff) */
+
+/* Done w/dequant + descramble + partial transpose; now do the idct itself. */
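+/* Editor's sketch (assumption, not part of the commit): the scattered
+   stores labeled R0..R15 above amount to the scalar loop
+
+     for(k = 0; k < 64; k++)
+       out[descramble[k]] = in[k]*q[k];
+
+   for a hypothetical descramble[] table mapping each coefficient to its
+   partially transposed position; the MMX code fuses the dequant multiply
+   (pmullw) with that shuffle. */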
+
+# define I( K) MtoSTR((K*16))"(%[o])"
+# define J( K) MtoSTR(((K - 4)*16)+8)"(%[o])"
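+/* Editor's note (assumption): with these definitions I(K) addresses the
+   first four 16-bit coefficients of row K, and J(K), for K = 4..7, the
+   last four coefficients of row K-4, so one RowIDCT/Transpose pass
+   covers a full 8x4 half of the block. */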
+
+ RowIDCT /* 46 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K*16)+64)"(%[o])"
+# define J( K) MtoSTR(((K-4)*16)+72)"(%[o])"
+
+ RowIDCT /* 46 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K * 16))"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT /* 57 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K*16)+8)"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT /* 57 c */
+
+# undef I
+# undef J
+ /* 368 cycles ( / 64 coeff < 6 c / coeff) */
+
+ "emms\n"
+ :
+ :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
+ );
+}
+
+/**************************************************************************************
+ *
+ * Routine: IDct10__mmx
+ *
+ * Description: Perform IDCT on an 8x8 block with at most 10 nonzero coefficients
+ *
+ * Input: Pointers to the input coefficient, quantizer, and output buffers
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: The input coefficients are in transposed ZigZag order
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
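+/* Editor's note (assumption, not stated in the commit): "transposed
+   ZigZag order" is read here as the usual 8x8 zigzag scan applied to the
+   transposed block, so the row IDCT passes below effectively operate on
+   what would normally be the columns. */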
+/* --------------------------------------------------------------- */
+// This macro does four one-dimensional 8-point IDCTs in parallel; inputs
+// 4 through 7 are assumed to be zero.
+#define BeginIDCT_10 "#BeginIDCT_10\n" \
+ " movq "I(3)",%%mm2\n" \
+ \
+ " movq "C(3)",%%mm6\n" \
+ " movq %%mm2,%%mm4\n" \
+ \
+ " movq "C(5)",%%mm1\n" \
+ " pmulhw %%mm6,%%mm4\n" \
+ \
+ " movq "I(1)",%%mm3\n" \
+ " pmulhw %%mm2,%%mm1\n" \
+ \
+ " movq "C(1)",%%mm0\n" \
+ " paddw %%mm2,%%mm4\n" \
+ \
+ " pxor %%mm6,%%mm6\n" \
+ " paddw %%mm1,%%mm2\n" \
+ \
+ " movq "I(2)",%%mm5\n" \
+ " pmulhw %%mm3,%%mm0\n" \
+ \
+ " movq %%mm5,%%mm1\n" \
+ " paddw %%mm3,%%mm0\n" \
+ \
+ " pmulhw "C(7)",%%mm3\n" \
+ " psubsw %%mm2,%%mm6\n" \
+ \
+ " pmulhw "C(2)",%%mm5\n" \
+ " psubsw %%mm4,%%mm0\n" \
+ \
+ " movq "I(2)",%%mm7\n" \
+ " paddsw %%mm4,%%mm4\n" \
+ \
+ " paddw %%mm5,%%mm7\n" \
+ " paddsw %%mm0,%%mm4\n" \
+ \
+ " pmulhw "C(6)",%%mm1\n" \
+ " psubsw %%mm6,%%mm3\n" \
+ \
+ " movq %%mm4,"I(1)"\n" \
+ " paddsw %%mm6,%%mm6\n" \
+ \
+ " movq "C(4)",%%mm4\n" \
+ " paddsw %%mm3,%%mm6\n" \
+ \
+ " movq %%mm3,%%mm5\n" \
+ " pmulhw %%mm4,%%mm3\n" \
+ \
+ " movq %%mm6,"I(2)"\n" \
+ " movq %%mm0,%%mm2\n" \
+ \
+ " movq "I(0)",%%mm6\n" \
+ " pmulhw %%mm4,%%mm0\n" \
+ \
+ " paddw %%mm3,%%mm5\n" \
+ " paddw %%mm0,%%mm2\n" \
+ \
+ " psubsw %%mm1,%%mm5\n" \
+ " pmulhw %%mm4,%%mm6\n" \
+ \
+ " paddw "I(0)",%%mm6\n" \
+ " paddsw %%mm1,%%mm1\n" \
+ \
+ " movq %%mm6,%%mm4\n" \
+ " paddsw %%mm5,%%mm1\n" \
+ \
+ " psubsw %%mm2,%%mm6\n" \
+ " paddsw %%mm2,%%mm2\n" \
+ \
+ " movq "I(1)",%%mm0\n" \
+ " paddsw %%mm6,%%mm2\n" \
+ \
+ " psubsw %%mm1,%%mm2\n" \
+ "#end BeginIDCT_10\n"
+// end BeginIDCT_10 macro (25 cycles).
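+/* Editor's sketch (assumption, not part of the commit): the recurring
+   "pmulhw C(x) ... paddw" pairs above are the usual VP3-style fixed-point
+   multiply.  A cosine constant c is stored as round(c*65536); when that
+   exceeds 32767 it wraps to a negative signed word, so pmulhw alone
+   yields (c*x >> 16) - x and the paddw restores the missing x (compare
+   the "r0 = C1 * I1 - I1" comments further below).  A plain-C model:
+
+     static ogg_int16_t fixmul(ogg_int16_t x, ogg_uint16_t c16){
+       ogg_int16_t hi = (ogg_int16_t)(((ogg_int32_t)(ogg_int16_t)c16*x)>>16);
+       return c16 > 32767 ? hi + x : hi;   // pmulhw, then optional paddw
+     }
+*/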
+
+#define RowIDCT_10 "#RowIDCT_10\n" \
+ BeginIDCT_10 \
+ "\n" \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw %%mm3,%%mm3\n" \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " paddsw %%mm5,%%mm5\n" \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw %%mm0,%%mm0\n" \
+ " movq %%mm1,"I(1)"\n" /* save R1 */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ "#end RowIDCT_10\n"
+// end RowIDCT_10 macro (8 + 25 = 33 cycles)
+
+// Column IDCT normalizes and stores final results.
+
+#define ColumnIDCT_10 "#ColumnIDCT_10\n" \
+ BeginIDCT_10 \
+ "\n" \
+ " paddsw "Eight",%%mm2\n" \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " psraw ""$4"",%%mm2\n" /* r2 = NR2 */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " psraw ""$4"",%%mm1\n" /* r1 = NR1 */ \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " movq %%mm2,"I(2)"\n" /* store NR2 at I2 */ \
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " movq %%mm1,"I(1)"\n" /* store NR1 at I1 */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "Eight",%%mm4\n" \
+ " paddsw %%mm3,%%mm3\n" /* r3 = D. + D. */ \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " psraw ""$4"",%%mm4\n" /* r4 = NR4 */ \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " psraw ""$4"",%%mm3\n" /* r3 = NR3 */ \
+ " paddsw "Eight",%%mm6\n" \
+ " paddsw %%mm5,%%mm5\n" /* r5 = B.. + B.. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psraw ""$4"",%%mm6\n" /* r6 = NR6 */ \
+ " movq %%mm4,"J(4)"\n" /* store NR4 at J4 */ \
+ " psraw ""$4"",%%mm5\n" /* r5 = NR5 */ \
+ " movq %%mm3,"I(3)"\n" /* store NR3 at I3 */ \
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "Eight",%%mm7\n" \
+ " paddsw %%mm0,%%mm0\n" /* r0 = C. + C. */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ " psraw ""$4"",%%mm7\n" /* r7 = NR7 */ \
+ " movq %%mm6,"J(6)"\n" /* store NR6 at J6 */ \
+ " psraw ""$4"",%%mm0\n" /* r0 = NR0 */ \
+ " movq %%mm5,"J(5)"\n" /* store NR5 at J5 */ \
+ \
+ " movq %%mm7,"J(7)"\n" /* store NR7 at J7 */ \
+ \
+ " movq %%mm0,"I(0)"\n" /* store NR0 at I0 */ \
+ "#end ColumnIDCT_10\n"
+
+// end ColumnIDCT_10 macro (19 + 25 = 44 cycles)
+/* --------------------------------------------------------------- */
+
+
+/* --------------------------------------------------------------- */
+/* IDCT 10 */
+void IDct10__mmx( const ogg_int16_t *in,
+ const ogg_int16_t *q,
+ ogg_int16_t *out ) {
+
+ __asm__ __volatile__ (
+
+ "movq (%[i]), %%mm0\n"
+ "pmullw (%[q]), %%mm0\n" /* r0 = 03 02 01 00 */
+ "movq 16(%[i]), %%mm1\n"
+ "pmullw 16(%[q]), %%mm1\n" /* r1 = 13 12 11 10 */
+ "movq "M(0)", %%mm2\n" /* r2 = __ __ __ FF */
+ "movq %%mm0, %%mm3\n" /* r3 = 03 02 01 00 */
+ "movq 8(%[i]), %%mm4\n"
+ "psrlq $16, %%mm0\n" /* r0 = __ 03 02 01 */
+ "pmullw 8(%[q]), %%mm4\n" /* r4 = 07 06 05 04 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 00 */
+ "movq %%mm0, %%mm5\n" /* r5 = __ 03 02 01 */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 01 */
+ "psllq $32, %%mm1\n" /* r1 = 11 10 __ __ */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pxor %%mm5, %%mm0\n" /* r0 = __ 03 02 __ */
+ "pand %%mm1, %%mm7\n" /* r7 = 11 __ __ __ */
+ "por %%mm3, %%mm0\n" /* r0 = __ 03 02 00 */
+ "pxor %%mm7, %%mm1\n" /* r1 = __ 10 __ __ */
+ "por %%mm7, %%mm0\n" /* r0 = 11 03 02 00 = R0 */
+ "movq %%mm4, %%mm3\n" /* r3 = 07 06 05 04 */
+ "movq %%mm0, (%[o])\n" /* write R0 = r0 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 04 */
+ "psllq $16, %%mm3\n" /* r3 = __ __ 04 __ */
+ "por %%mm3, %%mm5\n" /* r5 = __ __ 04 01 */
+ "por %%mm5, %%mm1\n" /* r1 = __ 10 04 01 = R1 */
+ "psrlq $16, %%mm4\n" /* r4 = __ 07 06 05 */
+ "movq %%mm1, 16(%[o])\n" /* write R1 = r1 */
+ "movq %%mm4, %%mm5\n" /* r5 = __ 07 06 05 */
+ "psrlq $16, %%mm4\n" /* r4 = __ __ 07 06 */
+ "movq %%mm2, %%mm6\n" /* r6 = __ __ __ FF */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 05 */
+ "pand %%mm4, %%mm6\n" /* r6 = __ __ __ 06 */
+ "pxor %%mm6, %%mm4\n" /* r4 = __ __ 07 __ */
+ "por %%mm5, %%mm4\n" /* r4 = __ __ 07 05 */
+ "movq %%mm4, 32(%[o])\n" /* write R2 = r4 */
+ "movq %%mm6, 48(%[o])\n" /* write R3 = r6 */
+
+# define I( K) MtoSTR((K*16))"(%[o])"
+# define J( K) MtoSTR(((K - 4) * 16)+8)"(%[o])"
+
+ RowIDCT_10 /* 33 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+
+# define I( K) MtoSTR((K * 16))"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_10 /* 44 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K * 16)+8)"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_10 /* 44 c */
+
+# undef I
+# undef J
+
+ "emms\n"
+ :
+ :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
+ );
+}
+
+/**************************************************************************************
+ *
+ * Routine: IDct3__mmx
+ *
+ * Description: Perform IDCT on an 8x8 block with at most 3 nonzero coefficients
+ *
+ * Input: Pointers to the input coefficient, quantizer, and output buffers
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: Only works when at most the first three coefficients are nonzero.
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+/***************************************************************************************
+ In IDCT 3 we deal with only three nonzero coefficients in the 8x8 block.
+ Working in the fashion RowIDCT -> ColumnIDCT, we only have to do 1-D row
+ IDCTs on the first two rows; the remaining six rows stay zero anyway.
+ After the row IDCTs, every column could hold nonzero coefficients, so we
+ need to do eight 1-D column IDCTs. However, each column has at most two
+ nonzero coefficients, coefficient 0 and coefficient 1, and the same holds
+ for the two 1-D row IDCTs. For this reason, the process of a 1-D IDCT is
+ simplified
+
+ from a full version:
+
+ A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
+ C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
+ A. = C4 * (A - C) B. = C4 * (B - D)
+ C. = A + C D. = B + D
+
+ E = C4 * (I0 + I4) F = C4 * (I0 - I4)
+ G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
+ E. = E - G
+ G. = E + G
+
+ A.. = F + A. B.. = B. - H
+ F. = F - A. H. = B. + H
+
+ R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
+ R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
+
+ To:
+
+
+ A = (C1 * I1) B = (C7 * I1)
+ C = 0 D = 0
+ A. = C4 * A B. = C4 * B
+ C. = A D. = B
+
+ E = C4 * I0 F = E
+ G = 0 H = 0
+ E. = E
+ G. = E
+
+ A.. = E + A. B.. = B.
+ F. = E - A. H. = B.
+
+ R0 = E + A R1 = E + A. + B. R3 = E + B R5 = E - A. + B.
+ R7 = E - A R2 = E + A. - B. R4 = E - B R6 = E - A. - B.
+
+******************************************************************************************/
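+/* Editor's sketch (assumption; illustrative only): the simplified
+   transform above in plain C, with floating-point cosines to keep the
+   structure visible (the asm uses 16-bit fixed point):
+
+     #include <math.h>
+     static void idct3_1d(const double I[2], double R[8]){
+       const double C1 = cos(1*M_PI/16), C4 = cos(4*M_PI/16),
+                    C7 = cos(7*M_PI/16);
+       double A  = C1*I[1], B  = C7*I[1];   // C. = A, D. = B
+       double Ad = C4*A,    Bd = C4*B;      // A. and B.
+       double E  = C4*I[0];                 // E = F = E. = G.
+       R[0] = E + A;       R[7] = E - A;
+       R[3] = E + B;       R[4] = E - B;
+       R[1] = E + Ad + Bd; R[2] = E + Ad - Bd;
+       R[5] = E - Ad + Bd; R[6] = E - Ad - Bd;
+     }
+*/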
+
+#define RowIDCT_3 "#RowIDCT_3\n"\
+ " movq "I(1)",%%mm7\n" /* r7 = I1 */ \
+ " movq "C(1)",%%mm0\n" /* r0 = C1 */ \
+ " movq "C(7)",%%mm3\n" /* r3 = C7 */ \
+ " pmulhw %%mm7,%%mm0\n" /* r0 = C1 * I1 - I1 */ \
+ " pmulhw %%mm7,%%mm3\n" /* r3 = C7 * I1 = B, D. */ \
+ " movq "I(0)",%%mm6\n" /* r6 = I0 */ \
+ " movq "C(4)",%%mm4\n" /* r4 = C4 */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = C1 * I1 = A, C. */ \
+ " movq %%mm6,%%mm1\n" /* make a copy of I0 */ \
+ " pmulhw %%mm4,%%mm6\n" /* r2 = C4 * I0 - I0 */ \
+ " movq %%mm0,%%mm2\n" /* make a copy of A */ \
+ " movq %%mm3,%%mm5\n" /* make a copy of B */ \
+ " pmulhw %%mm4,%%mm2\n" /* r2 = C4 * A - A */ \
+ " pmulhw %%mm4,%%mm5\n" /* r5 = C4 * B - B */ \
+ " paddw %%mm1,%%mm6\n" /* r2 = C4 * I0 = E, F */ \
+ " movq %%mm6,%%mm4\n" /* r4 = E */ \
+ " paddw %%mm0,%%mm2\n" /* r2 = A. */ \
+ " paddw %%mm3,%%mm5\n" /* r5 = B. */ \
+ " movq %%mm6,%%mm7\n" /* r7 = E */ \
+ " movq %%mm5,%%mm1\n" /* r1 = B. */ \
+ /* r0 = A */ \
+ /* r3 = B */ \
+ /* r2 = A. */ \
+ /* r5 = B. */ \
+ /* r6 = E */ \
+ /* r4 = E */ \
+ /* r7 = E */ \
+ /* r1 = B. */ \
+ " psubw %%mm2,%%mm6\n" /* r6 = E - A. */ \
+ " psubw %%mm3,%%mm4\n" /* r4 = E - B ----R4 */ \
+ " psubw %%mm0,%%mm7\n" /* r7 = E - A ----R7 */ \
+ " paddw %%mm2,%%mm2\n" /* r2 = A. + A. */ \
+ " paddw %%mm3,%%mm3\n" /* r3 = B + B */ \
+ " paddw %%mm0,%%mm0\n" /* r0 = A + A */ \
+ " paddw %%mm6,%%mm2\n" /* r2 = E + A. */ \
+ " paddw %%mm4,%%mm3\n" /* r3 = E + B ----R3 */ \
+ " psubw %%mm1,%%mm2\n" /* r2 = E + A. - B. ----R2 */ \
+ " psubw %%mm5,%%mm6\n" /* r6 = E - A. - B. ----R6 */ \
+ " paddw %%mm1,%%mm1\n" /* r1 = B. + B. */ \
+ " paddw %%mm5,%%mm5\n" /* r5 = B. + B. */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = E + A ----R0 */ \
+ " paddw %%mm2,%%mm1\n" /* r1 = E + A. + B. -----R1 */ \
+ " movq %%mm1,"I(1)"\n" /* save r1 */ \
+ " paddw %%mm6,%%mm5\n" /* r5 = E - A. + B. -----R5 */ \
+ "#end RowIDCT_3\n"
+//End of RowIDCT_3
+
+#define ColumnIDCT_3 "#ColumnIDCT_3\n"\
+ " movq "I(1)",%%mm7\n" /* r7 = I1 */ \
+ " movq "C(1)",%%mm0\n" /* r0 = C1 */ \
+ " movq "C(7)",%%mm3\n" /* r3 = C7 */ \
+ " pmulhw %%mm7,%%mm0\n" /* r0 = C1 * I1 - I1 */ \
+ " pmulhw %%mm7,%%mm3\n" /* r3 = C7 * I1 = B, D. */ \
+ " movq "I(0)",%%mm6\n" /* r6 = I0 */ \
+ " movq "C(4)",%%mm4\n" /* r4 = C4 */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = C1 * I1 = A, C. */ \
+ " movq %%mm6,%%mm1\n" /* make a copy of I0 */ \
+ " pmulhw %%mm4,%%mm6\n" /* r2 = C4 * I0 - I0 */ \
+ " movq %%mm0,%%mm2\n" /* make a copy of A */ \
+ " movq %%mm3,%%mm5\n" /* make a copy of B */ \
+ " pmulhw %%mm4,%%mm2\n" /* r2 = C4 * A - A */ \
+ " pmulhw %%mm4,%%mm5\n" /* r5 = C4 * B - B */ \
+ " paddw %%mm1,%%mm6\n" /* r2 = C4 * I0 = E, F */ \
+ " movq %%mm6,%%mm4\n" /* r4 = E */ \
+ " paddw "Eight",%%mm6\n" /* +8 for shift */ \
+ " paddw "Eight",%%mm4\n" /* +8 for shift */ \
+ " paddw %%mm0,%%mm2\n" /* r2 = A. */ \
+ " paddw %%mm3,%%mm5\n" /* r5 = B. */ \
+ " movq %%mm6,%%mm7\n" /* r7 = E */ \
+ " movq %%mm5,%%mm1\n" /* r1 = B. */ \
+/* r0 = A */ \
+/* r3 = B */ \
+/* r2 = A. */ \
+/* r5 = B. */ \
+/* r6 = E */ \
+/* r4 = E */ \
+/* r7 = E */ \
+/* r1 = B. */ \
+ " psubw %%mm2,%%mm6\n" /* r6 = E - A. */ \
+ " psubw %%mm3,%%mm4\n" /* r4 = E - B ----R4 */ \
+ " psubw %%mm0,%%mm7\n" /* r7 = E - A ----R7 */ \
+ " paddw %%mm2,%%mm2\n" /* r2 = A. + A. */ \
+ " paddw %%mm3,%%mm3\n" /* r3 = B + B */ \
+ " paddw %%mm0,%%mm0\n" /* r0 = A + A */ \
+ " paddw %%mm6,%%mm2\n" /* r2 = E + A. */ \
+ " paddw %%mm4,%%mm3\n" /* r3 = E + B ----R3 */ \
+ " psraw $4,%%mm4\n" /* shift */ \
+ " movq %%mm4,"J(4)"\n" /* store R4 at J4 */ \
+ " psraw $4,%%mm3\n" /* shift */ \
+ " movq %%mm3,"I(3)"\n" /* store R3 at I3 */ \
+ " psubw %%mm1,%%mm2\n" /* r2 = E + A. - B. ----R2 */ \
+ " psubw %%mm5,%%mm6\n" /* r6 = E - A. - B. ----R6 */ \
+ " paddw %%mm1,%%mm1\n" /* r1 = B. + B. */ \
+ " paddw %%mm5,%%mm5\n" /* r5 = B. + B. */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = E + A ----R0 */ \
+ " paddw %%mm2,%%mm1\n" /* r1 = E + A. + B. -----R1 */ \
+ " psraw $4,%%mm7\n" /* shift */ \
+ " psraw $4,%%mm2\n" /* shift */ \
+ " psraw $4,%%mm0\n" /* shift */ \
+ " psraw $4,%%mm1\n" /* shift */ \
+ " movq %%mm7,"J(7)"\n" /* store R7 to J7 */ \
+ " movq %%mm0,"I(0)"\n" /* store R0 to I0 */ \
+ " movq %%mm1,"I(1)"\n" /* store R1 to I1 */ \
+ " movq %%mm2,"I(2)"\n" /* store R2 to I2 */ \
+ " movq %%mm1,"I(1)"\n" /* save r1 */ \
+ " paddw %%mm6,%%mm5\n" /* r5 = E - A. + B. -----R5 */ \
+ " psraw $4,%%mm5\n" /* shift */ \
+ " movq %%mm5,"J(5)"\n" /* store R5 at J5 */ \
+ " psraw $4,%%mm6\n" /* shift */ \
+ " movq %%mm6,"J(6)"\n" /* store R6 at J6 */ \
+ "#end ColumnIDCT_3\n"
+//End of ColumnIDCT_3
+
+void IDct3__mmx( const ogg_int16_t *in,
+ const ogg_int16_t *q,
+ ogg_int16_t *out ) {
+
+ __asm__ __volatile__ (
+
+ "movq (%[i]), %%mm0\n"
+ "pmullw (%[q]), %%mm0\n" /* r0 = 03 02 01 00 */
+ "movq "M(0)", %%mm2\n" /* r2 = __ __ __ FF */
+ "movq %%mm0, %%mm3\n" /* r3 = 03 02 01 00 */
+ "psrlq $16, %%mm0\n" /* r0 = __ 03 02 01 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 00 */
+ "movq %%mm0, %%mm5\n" /* r5 = __ 03 02 01 */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 01 */
+ "pxor %%mm5, %%mm0\n" /* r0 = __ 03 02 __ */
+ "por %%mm3, %%mm0\n" /* r0 = __ 03 02 00 */
+ "movq %%mm0, (%[o])\n" /* write R0 = r0 */
+ "movq %%mm5, 16(%[o])\n" /* write R1 = r5 */
+
+/* Done partial transpose; now do the idct itself. */
+
+# define I( K) MtoSTR((K*16))"(%[o])"
+# define J( K) MtoSTR(((K - 4)*16)+8)"(%[o])"
+
+ RowIDCT_3 /* 33 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+
+# define I( K) MtoSTR((K * 16))"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_3 /* 44 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K*16)+8)"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_3 /* 44 c */
+
+# undef I
+# undef J
+
+ "emms\n"
+ :
+ :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
+ );
+
+}
+
+/* install our implementation in the function table */
+void dsp_mmx_idct_init(DspFunctions *funcs)
+{
+ funcs->IDctSlow = IDctSlow__mmx;
+ funcs->IDct10 = IDct10__mmx;
+ funcs->IDct3 = IDct3__mmx;
+}
+
+#endif /* USE_ASM */
+
+
Added: branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/recon_mmx.c (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/recon_mmx.c 2008-12-05 05:41:06 UTC (rev 15557)
@@ -0,0 +1,105 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: recon_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
+
+static void copy8x8__mmx (const unsigned char *src,
+ unsigned char *dest,
+ ogg_uint32_t stride)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " lea (%2, %2, 2), %%rdi \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1, %2), %%mm1 \n\t"
+ " movq (%1, %2, 2), %%mm2 \n\t"
+ " movq (%1, %%rdi), %%mm3 \n\t"
+
+ " lea (%1, %2, 4), %1 \n\t"
+
+ " movq %%mm0, (%0) \n\t"
+ " movq %%mm1, (%0, %2) \n\t"
+ " movq %%mm2, (%0, %2, 2) \n\t"
+ " movq %%mm3, (%0, %%rdi) \n\t"
+
+ " lea (%0, %2, 4), %0 \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1, %2), %%mm1 \n\t"
+ " movq (%1, %2, 2), %%mm2 \n\t"
+ " movq (%1, %%rdi), %%mm3 \n\t"
+
+ " movq %%mm0, (%0) \n\t"
+ " movq %%mm1, (%0, %2) \n\t"
+ " movq %%mm2, (%0, %2, 2) \n\t"
+ " movq %%mm3, (%0, %%rdi) \n\t"
+ : "+a" (dest)
+ : "c" (src),
+ "d" ((unsigned long)stride)
+ : "memory", "rdi"
+ );
+}
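+/* Editor's sketch (assumption, not part of the commit): a plain-C
+   behavioral reference for the routine above; note the asm applies the
+   same stride to both source and destination.
+
+     #include <string.h>
+     static void copy8x8_c(const unsigned char *src, unsigned char *dest,
+                           ogg_uint32_t stride){
+       int i;
+       for(i = 0; i < 8; i++){
+         memcpy(dest + i*stride, src + i*stride, 8);  // one movq per row
+       }
+     }
+*/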
+
+static void recon8x8__mmx (unsigned char *ReconPtr,
+ const ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm0, %%mm0 \n\t"
+ " lea 128(%1), %%rdi \n\t"
+
+ "1: \n\t"
+ " movq (%0), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
+
+ " movq (%1), %%mm4 \n\t" /* first 4 changes */
+ " movq %%mm2, %%mm3 \n\t"
+ " movq 8(%1), %%mm5 \n\t" /* last 4 changes */
+ " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */
+ " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */
+ " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */
+
+ " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */
+ " lea 16(%1), %1 \n\t" /* next row of changes */
+ " cmp %%rdi, %1 \n\t" /* are we done? */
+
+ " movq %%mm2, (%0) \n\t" /* store result */
+
+ " lea (%0, %2), %0 \n\t" /* next row of output */
+ " jc 1b \n\t"
+ : "+r" (ReconPtr)
+ : "r" (ChangePtr),
+ "r" ((unsigned long)LineStep)
+ : "memory", "rdi"
+ );
+}
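+/* Editor's sketch (assumption, not part of the commit): a plain-C
+   behavioral reference for the routine above; paddsw/packuswb amount to
+   adding the 16-bit changes to the reference bytes and clamping to 0..255.
+
+     static void recon8x8_c(unsigned char *ReconPtr,
+                            const ogg_int16_t *ChangePtr,
+                            ogg_uint32_t LineStep){
+       int i, j;
+       for(i = 0; i < 8; i++){
+         for(j = 0; j < 8; j++){
+           int v = ReconPtr[j] + ChangePtr[j];
+           ReconPtr[j] = v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
+         }
+         ChangePtr += 8;           // 16 bytes: one row of changes
+         ReconPtr  += LineStep;    // next row of output
+       }
+     }
+*/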
+
+void dsp_mmx_recon_init(DspFunctions *funcs)
+{
+ funcs->copy8x8 = copy8x8__mmx;
+ funcs->recon8x8 = recon8x8__mmx;
+}
+
+#endif /* USE_ASM */