[xiph-commits] r12881 - trunk/theora-exp/lib/x86

tterribe at svn.xiph.org
Mon Apr 16 22:48:45 PDT 2007


Author: tterribe
Date: 2007-04-16 22:48:45 -0700 (Mon, 16 Apr 2007)
New Revision: 12881

Modified:
   trunk/theora-exp/lib/x86/mmxstate.c
Log:
Complete rewrite of the MMX loop filter.
Unlike the previous one, this does not use the bounding values array.
Instead, it computes the required values on the fly.
This takes many more opcodes, but avoids the serial memory references, which
 on modern processors (P4/Core/Core2) have a 6-clock latency for MMX/SSE/fp
 instructions, even on an L1 cache hit.
The performance is indistinguishable from the original in my tests.
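For reference, the values now computed on the fly are the lflim() residuals
 of Section 7.10 of the spec; a minimal scalar C sketch of the limiter
 (illustrative only, not code from this commit):

  /*Clamp a filter residual R against the loop filter limit L.*/
  static int lflim(int _r,int _l){
    if(_r<-2*_l||_r>2*_l)return 0;
    if(_r<-_l)return -(_r+2*_l);
    if(_r>_l)return -(_r-2*_l);
    return _r;
  }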


Modified: trunk/theora-exp/lib/x86/mmxstate.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxstate.c	2007-04-17 04:26:52 UTC (rev 12880)
+++ trunk/theora-exp/lib/x86/mmxstate.c	2007-04-17 05:48:45 UTC (rev 12881)
@@ -18,9 +18,9 @@
 #if defined(OC_X86ASM)
 
 static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
- 0x0003000300030003LL; 
+ 0x0003000300030003LL;
 static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
- 0x0004000400040004LL; 
+ 0x0004000400040004LL;
 
 static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
    0, 8, 1, 2, 9,16,24,17,
@@ -254,17 +254,17 @@
   __asm__ __volatile__("emms\n\t");
 }
 
-static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
   long esi;
-  long edi;
   _pix-=_ystride*2;
   __asm__ __volatile__(
     /*mm0=0*/
     "pxor %%mm0,%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
     /*mm7=_pix[0...8]*/
     "movq (%[pix]),%%mm7\n\t"
-    /*esi=_ystride*3*/
-    "lea (%[ystride],%[ystride],2),%[s]\n\t"
     /*mm4=_pix[0...8+_ystride*3]*/
     "movq (%[pix],%[s]),%%mm4\n\t"
     /*mm6=_pix[0...8]*/
@@ -289,14 +289,14 @@
     "movq %%mm2,%%mm1\n\t"
     /*Expand these arrays.*/
     "punpckhbw %%mm0,%%mm5\n\t"
-    "punpcklbw %%mm0,%%mm4\n\t" 
+    "punpcklbw %%mm0,%%mm4\n\t"
     "punpckhbw %%mm0,%%mm3\n\t"
     "punpcklbw %%mm0,%%mm2\n\t"
     /*Preload...*/
     "movq %[OC_V3],%%mm0\n\t"
     /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
-    "psubw %%mm5,%%mm3\n\t" 
-    "psubw %%mm4,%%mm2\n\t" 
+    "psubw %%mm5,%%mm3\n\t"
+    "psubw %%mm4,%%mm2\n\t"
     /*Scale by 3.*/
     "pmullw %%mm0,%%mm3\n\t"
     "pmullw %%mm0,%%mm2\n\t"
@@ -312,93 +312,122 @@
     /*"Divide" by 8.*/
     "psraw $3,%%mm3\n\t"
     "psraw $3,%%mm2\n\t"
-    /*Now perform mm7:m6=_bv[(f+4>>3)]*/
-    /*First the low part:*/
-    /*pextrw requires MMX+/SSE.
-    "pextrw $0,%%mm2,%%esi\n\t"
-    "pextrw $1,%%mm2,%%edi\n\t"*/
-    /*We duplicate the value and pull out of two registers in parallel;
-       perhaps we should not bother with just MMX, since any processor with
-       multiply MMX units will also have SSE, and should be using that
-       instead.*/
-    "movq %%mm2,%%mm0\n\t"
-    "psrlq $16,%%mm2\n\t"
-    "movd %%mm0,%%esi\n\t"
-    "movd %%mm2,%%edi\n\t"
-    "psrlq $32,%%mm0\n\t"
-    "movsx %%si,%[s]\n\t"
-    "psrlq $32,%%mm2\n\t"
-    "movsx %%di,%[d]\n\t"
-    /*pinsrw requires MMX+/SSE.
-    "pinsrw $0,(%[bv],%[s],4),%%mm6\n\t"
-    "pinsrw $1,(%[bv],%[d],4),%%mm6\n\t"
-    "pextrw $2,%%mm2,%%esi\n\t"
-    "pextrw $3,%%mm2,%%edi\n\t"*/
-    "movd (%[bv],%[s],4),%%mm6\n\t"
-    "movd %%mm0,%%esi\n\t"
-    "movd (%[bv],%[d],4),%%mm0\n\t"
-    "movd %%mm2,%%edi\n\t"
-    "movsx %%si,%[s]\n\t"
-    "movsx %%di,%[d]\n\t"
-    /*"pinsrw $2,(%[bv],%%esi,4),%%mm6\n\t"
-    "pinsrw $3,(%[bv],%%edi,4),%%mm6\n\t"*/
-    "movd (%[bv],%[s],4),%%mm2\n\t"
-    "pslld $16,%%mm2\n\t"
-    "por %%mm2,%%mm6\n\t"
-    "movd (%[bv],%[d],4),%%mm2\n\t"
-    "pslld $16,%%mm2\n\t"
-    "por %%mm2,%%mm0\n\t"
-    "punpcklwd %%mm0,%%mm6\n\t"
-    /*Do it again for the high part:*/
-    /*"pextrw $0,%%mm3,%%esi\n\t" 
-    "pextrw $1,%%mm3,%%edi\n\t"*/
-    "movq %%mm3,%%mm0\n\t"
-    "psrlq $16,%%mm3\n\t"
-    "movd %%mm0,%%esi\n\t"
-    "movd %%mm3,%%edi\n\t"
-    "psrlq $32,%%mm0\n\t"
-    "movsx %%si,%[s]\n\t"
-    "psrlq $32,%%mm3\n\t"
-    "movsx %%di,%[d]\n\t"
-    /*"pinsrw $0,(%[bv],%%esi,4),%%mm7\n\t"
-    "pinsrw $1,(%[bv],%%edi,4),%%mm7\n\t"
-    "pextrw $2,%%mm3,%%esi\n\t"
-    "pextrw $3,%%mm3,%%edi\n\t"*/
-    "movd (%[bv],%[s],4),%%mm7\n\t"
-    "movd %%mm0,%%esi\n\t"
-    "movd (%[bv],%[d],4),%%mm0\n\t"
-    "movd %%mm3,%%edi\n\t"
-    "movsx %%si,%[s]\n\t"
-    "movsx %%di,%[d]\n\t"
-    /*"pinsrw $2,(%[bv],%%esi,4),%%mm7\n\t"
-    "pinsrw $3, (%[bv],%%edi,4),%%mm7\n\t"*/
-    "movd (%[bv],%[s],4),%%mm2\n\t"
-    "movd (%[bv],%[d],4),%%mm3\n\t"
-    "pslld $16,%%mm2\n\t"
-    "pslld $16,%%mm3\n\t"
-    "por %%mm2,%%mm7\n\t"
-    "por %%mm3,%%mm0\n\t"
-    "punpcklwd %%mm0,%%mm7\n\t"
-    /*mm7:mm6 now contain the final values of f.*/
-    /*_pix[0...8+_ystride]+=f*/
-    "paddw %%mm6,%%mm4\n\t"
-    "paddw %%mm7,%%mm5\n\t"
-    /*Re-expand _pix[0...8+_ystride*2], since we didn't have enough registers
-       to keep the whole thing around.*/
+    /*Now compute lflim of mm3:mm2, cf. Section 7.10 of the spec.*/
+    /*Free up mm5.*/
+    "packuswb %%mm5,%%mm4\n\t"
+    /*mm0=L L L L*/
+    "movq (%[ll]),%%mm0\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm2,%%mm5\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psllw $1,%%mm7\n\t"
+    "psllw $1,%%mm6\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm2,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm3,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm3\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm6\n\t"
+    "movq %%mm2,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm2,%%mm6\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm7=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm7\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm7,%%mm2\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm2,%%mm5\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm3,%%mm6\n\t"
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm3\n\t"
+    "psllw $1,%%mm0\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    /*mm0=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm0\n\t"
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm0,%%mm3\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm3,%%mm5\n\t"
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*Unfortunately, there's no opcode to add a signed byte to an unsigned
+       byte with unsigned saturation, so we promote things back to 16 bits.*/
     "pxor %%mm0,%%mm0\n\t"
-    "movq %%mm1,%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "movq %%mm1,%%mm6\n\t"
     "punpcklbw %%mm0,%%mm1\n\t"
-    "punpckhbw %%mm0,%%mm2\n\t"
-    /*_pix[0...8+_ystride*2]-=f*/
-    "psubw %%mm6,%%mm1\n\t"
-    "psubw %%mm7,%%mm2\n\t"
-    /*Pack it back into 8 bits and write it back out.*/
-    "packuswb %%mm2,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm6\n\t"
+    /*_pix[0...8+_ystride]+=R_i*/
+    "paddw %%mm2,%%mm4\n\t"
+    "paddw %%mm3,%%mm5\n\t"
+    /*_pix[0...8+_ystride*2]-=R_i*/
+    "psubw %%mm2,%%mm1\n\t"
+    "psubw %%mm3,%%mm6\n\t"
     "packuswb %%mm5,%%mm4\n\t"
+    "packuswb %%mm6,%%mm1\n\t"
+    /*Write it back out.*/
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
     "movq %%mm1,(%[pix],%[ystride],2)\n\t"
-    "movq %%mm4,(%[pix],%[ystride])\n\t"
-    :[s]"=&S"(esi),[d]"=&D"(edi)
-    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[bv]"r"(_bv),
+    :[s]"=&S"(esi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
      [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
     :"memory"
   );
@@ -407,16 +436,16 @@
 /*This code implements the bulk of loop_filter_h().
   Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
    four p0's to one register we must transpose the values in four mmx regs.
-  When half is done we repeat this for the rest.
-  TODO: some instruction stalls can be avoided.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride,const int *_bv){
+  When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
   long esi;
   long edi;
   __asm__ __volatile__(
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
     /*esi=_ystride*3*/
     "lea (%[ystride],%[ystride],2),%[s]\n\t"
-    /*x x x x 3 2 1 0*/
-    "movd (%[pix]),%%mm0\n\t"
     /*x x x x 7 6 5 4*/
     "movd (%[pix],%[ystride]),%%mm1\n\t"
     /*x x x x B A 9 8*/
@@ -429,23 +458,23 @@
     "punpcklbw %%mm3,%%mm2\n\t"
     /*mm1=7 3 6 2 5 1 4 0*/
     "movq %%mm0,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
     /*mm1=D 9 5 1 C 8 4 0*/
     "punpcklwd %%mm2,%%mm1\n\t"
-    /*mm0=F B 7 3 E A 6 2*/
-    "punpckhwd %%mm2,%%mm0\n\t"
     "pxor %%mm7,%%mm7\n\t"
     /*mm5=D 9 5 1 C 8 4 0*/
     "movq %%mm1,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
     /*mm5=x D x 9 x 5 x 1==pix[1]*/
     "punpckhbw %%mm7,%%mm5\n\t"
-    /*mm1=x C x 8 x 4 x 0==pix[0]*/
-    "punpcklbw %%mm7,%%mm1\n\t"
     /*mm3=F B 7 3 E A 6 2*/
     "movq %%mm0,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
     /*mm3=x F x B x 7 x 3==pix[3]*/
     "punpckhbw %%mm7,%%mm3\n\t"
-    /*mm0=x E x A x 6 x 2==pix[2]*/
-    "punpcklbw %%mm7,%%mm0\n\t"
     /*mm1=mm1-mm3==pix[0]-pix[3]*/
     "psubw %%mm3,%%mm1\n\t"
     /*Save a copy of pix[2] for later.*/
@@ -455,45 +484,66 @@
     /*Scale by 3.*/
     "pmullw %[OC_V3],%%mm0\n\t"
     /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
-    "paddw %%mm0,%%mm1\n\t"
+    "paddw %%mm1,%%mm0\n\t"
     /*Add 4.*/
-    "paddw %[OC_V4],%%mm1\n\t"
-    /*"Divide" by 8.*/
-    "psraw $3,%%mm1\n\t"
-    /*Now perform mm0=_bv[(f+4>>3)]*/
-    /*pextrw requires MMX+/SSE.
-    "pextrw $0,%%mm1,%%esi\n\t"
-    "pextrw $1,%%mm1,%%edi\n\t"*/
-    "movd %%mm1,%%esi\n\t"
-    "psrlq $16,%%mm1\n\t"
-    "movd %%mm1,%%edi\n\t"
-    "movsx %%si,%[s]\n\t"
-    "psrlq $16,%%mm1\n\t"
-    "movsx %%di,%[d]\n\t"
-    /*pinsrw requires MMX+/SSE.
-    "pinsrw $0,(%[bv],%%esi,4),%%mm0\n\t"
-    "pextrw $2,%%mm1,%%esi\n\t"
-    "pinsrw $1,(%[bv],%%edi,4),%%mm0\n\t"
-    "pextrw $3,%%mm1,%%edi\n\t"*/
-    "movd (%[bv],%[s],4),%%mm0\n\t"
-    "movd %%mm1,%%esi\n\t"
-    "movd (%[bv],%[d],4),%%mm2\n\t"
-    "psrlq $16,%%mm1\n\t"
-    "movsx %%si,%[s]\n\t"
-    "movd %%mm1,%%edi\n\t"
-    /*"pinsrw $2,(%[bv],%%esi,4),%%mm0\n\t"
-    "pinsrw $3,(%[bv],%%edi,4),%%mm0\n\t"*/
-    "movd (%[bv],%[s],4),%%mm3\n\t"
-    "movsx %%di,%[d]\n\t"
-    "pslld $16,%%mm3\n\t"
-    "movd (%[bv],%[d],4),%%mm6\n\t"
-    "por %%mm3,%%mm0\n\t"
-    "pslld $16,%%mm6\n\t"
-    "por %%mm6,%%mm2\n\t"
-    "punpcklwd %%mm2,%%mm0\n\t"
-    /*_pix[1]+=f;*/
+    "paddw %[OC_V4],%%mm0\n\t"
+    /*"Divide" by 8, producing the residuals R_i.*/
+    "psraw $3,%%mm0\n\t"
+    /*Now compute lflim of mm0, cf. Section 7.10 of the spec.*/
+    /*mm6=L L L L*/
+    "movq (%[ll]),%%mm6\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm0,%%mm1\n\t"
+    "pxor %%mm2,%%mm2\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "psllw $1,%%mm3\n\t"
+    "psllw $1,%%mm2\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-2L -2L -2L -2L*/
+    /*mm3==2L 2L 2L 2L*/
+    "pcmpgtw %%mm0,%%mm3\n\t"
+    "pcmpgtw %%mm2,%%mm1\n\t"
+    "pand %%mm3,%%mm0\n\t"
+    "pand %%mm1,%%mm0\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm2\n\t"
+    "movq %%mm0,%%mm1\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-L -L -L -L*/
+    /*mm6==L L L L*/
+    /*mm2=-L>R_i?FF:00*/
+    "pcmpgtw %%mm0,%%mm2\n\t"
+    /*mm1=R_i>L?FF:00*/
+    "pcmpgtw %%mm6,%%mm1\n\t"
+    /*mm3=2L 2L 2L 2L*/
+    "psllw $1,%%mm3\n\t"
+    /*mm6=2L 2L 2L 2L*/
+    "psllw $1,%%mm6\n\t"
+    /*mm3=R_i>L?2L:0*/
+    "pand %%mm1,%%mm3\n\t"
+    /*mm6=-L>R_i?2L:0*/
+    "pand %%mm2,%%mm6\n\t"
+    /*mm0=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm3,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L*/
+    "por %%mm2,%%mm1\n\t"
+    /*mm0=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm6,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm0,%%mm1\n\t"
+    /*mm0=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*_pix[1]+=R_i;*/
     "paddw %%mm0,%%mm5\n\t"
-    /*_pix[2]-=f;*/
+    /*_pix[2]-=R_i;*/
     "psubw %%mm0,%%mm4\n\t"
     /*mm5=x x x x D 9 5 1*/
     "packuswb %%mm7,%%mm5\n\t"
@@ -501,39 +551,36 @@
     "packuswb %%mm7,%%mm4\n\t"
     /*mm5=E D A 9 6 5 2 1*/
     "punpcklbw %%mm4,%%mm5\n\t"
-    /*esi=6 5 2 1*/
-    "movd %%mm5,%%esi\n\t"
-    "movw %%si,1(%[pix])\n\t"
+    /*edi=6 5 2 1*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix])\n\t"
     /*Why is there such a big stall here?*/
     "psrlq $32,%%mm5\n\t"
-    "shrl $16,%%esi\n\t"
-    "movw %%si,1(%[pix],%[ystride])\n\t"
-    /*esi=E D A 9*/
-    "movd %%mm5,%%esi\n\t"
-    "lea (%[ystride],%[ystride],2),%[d]\n\t"
-    "movw %%si,(%[pix],%[ystride])\n\t"
-    "shrl $16,%%esi\n\t"
-    "movw %%si,1(%[pix],%[d])\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride])\n\t"
+    /*edi=E D A 9*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride],2)\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[s])\n\t"
     :[s]"=&S"(esi),[d]"=&D"(edi),
-     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[bv]"+r"(_bv)
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
     :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
     :"memory"
   );
 }
 
-static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
   _pix-=2;
-  loop_filter_h4(_pix,_ystride,_bv);
-  _pix+=_ystride*4;
-  loop_filter_h4(_pix,_ystride,_bv);
+  loop_filter_h4(_pix,_ystride,_ll);
+  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
 }
 
 /*We copy the whole function because the MMX routines will be inlined 4 times,
-   and we do just a single emms call at the end.
-  Originally _bv pointer would also not be offset by 256 to get rid of a sign
-   extension instruction, but it turns out this is still needed on x86-64 to
-   avoid a partial register stall, and is needed even on x86-32 once we
-   eliminate the MMX+/SSE-specific pextrw/pinsrw instructions.*/
+   and we can do just a single emms call at the end this way.
+  We also no longer use the _bv lookup table, instead computing the values
+   that would lie in it on the fly.*/
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
   The filter may be run on the bottom edge, affecting pixels in the next row of
@@ -545,15 +592,17 @@
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
-  th_img_plane  *iplane;
-  oc_fragment_plane *fplane;
-  oc_fragment       *frag_top;
-  oc_fragment       *frag0;
-  oc_fragment       *frag;
-  oc_fragment       *frag_end;
-  oc_fragment       *frag0_end;
-  oc_fragment       *frag_bot;
-  _bv+=256;
+  ogg_int16_t __attribute__((aligned(8)))  ll[4];
+  th_img_plane                            *iplane;
+  oc_fragment_plane                       *fplane;
+  oc_fragment                             *frag_top;
+  oc_fragment                             *frag0;
+  oc_fragment                             *frag;
+  oc_fragment                             *frag_end;
+  oc_fragment                             *frag0_end;
+  oc_fragment                             *frag_bot;
+  ll[0]=ll[1]=ll[2]=ll[3]=
+   (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   /*The following loops are constructed somewhat non-intuitively on purpose.
@@ -571,17 +620,17 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
         }
         if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
           loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
-           iplane->ystride,_bv);
+           iplane->ystride,ll);
         }
       }
       frag++;
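
For anyone following the asm, here is a rough scalar transcription of the
 branchless mask-and-select sequence (pcmpgtw/pand/psubw/paddw) it uses in
 place of the old table lookups; -(cond) stands in for the all-ones pcmpgtw
 masks, and the names are illustrative, not from the tree:

  static int lflim_branchless(int _r,int _l){
    int m;
    int rp;
    /*if(R<-2L||R>2L)R=0: keep R only where both compares pass.*/
    _r&=-(_r>-2*_l&&2*_l>_r);
    /*R'=R-2L where R>L; R'=R+2L where -L>R; R'=R elsewhere.*/
    rp=_r-((-(_r>_l))&(2*_l))+((-(-_l>_r))&(2*_l));
    /*Where -L>R||R>L, subtracting mask&R' twice turns R' into -R',
       mirroring the two psubw's above.*/
    m=(-(-_l>_r||_r>_l))&rp;
    rp-=m;
    rp-=m;
    return rp;
  }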


