[xiph-commits] r17728 - in trunk/theora/lib: . arm c64x x86 x86_vc
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Tue Dec 7 02:28:07 PST 2010
Author: tterribe
Date: 2010-12-07 02:28:07 -0800 (Tue, 07 Dec 2010)
New Revision: 17728
Added:
trunk/theora/lib/x86/x86zigzag.h
trunk/theora/lib/x86_vc/x86zigzag.h
Modified:
trunk/theora/lib/Makefile.am
trunk/theora/lib/analyze.c
trunk/theora/lib/arm/armidct.s
trunk/theora/lib/c64x/c64xidct.c
trunk/theora/lib/encint.h
trunk/theora/lib/enquant.c
trunk/theora/lib/fdct.c
trunk/theora/lib/idct.c
trunk/theora/lib/tokenize.c
trunk/theora/lib/x86/mmxfdct.c
trunk/theora/lib/x86/mmxidct.c
trunk/theora/lib/x86/sse2fdct.c
trunk/theora/lib/x86/sse2idct.c
trunk/theora/lib/x86/x86enc.c
trunk/theora/lib/x86/x86enc.h
trunk/theora/lib/x86/x86enquant.c
trunk/theora/lib/x86_vc/mmxfdct.c
trunk/theora/lib/x86_vc/mmxidct.c
Log:
Move zig-zagging from quantization into the fDCT.
This removes one of the transposes from the fDCT, and avoids several zig-zag
lookups during tokenization.
This change also makes the encoder iDCT clear its input buffer, as the decoder
iDCT already does, so that the buffer can be re-used for the next block without
the need for a memcpy or memset in the tokenizer.
This gives a 1.3% speed-up at the default speed-level (1), and a 3.1% speed-up
at speed-level 2 (for 480p, on x86-64).
Modified: trunk/theora/lib/Makefile.am
===================================================================
--- trunk/theora/lib/Makefile.am 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/Makefile.am 2010-12-07 10:28:07 UTC (rev 17728)
@@ -13,7 +13,14 @@
arm/arm2gnu.pl \
c64x/c64xint.h \
c64x/c64xdec.h \
+ x86/mmxfrag.c \
+ x86/mmxidct.c \
+ x86/mmxloop.h \
+ x86/mmxstate.c \
+ x86/sse2idct.c \
x86/x86cpu.c \
+ x86/x86int.h \
+ x86/x86state.c \
x86/mmxencfrag.c \
x86/mmxfdct.c \
x86/sse2encfrag.c \
@@ -22,13 +29,7 @@
x86/x86enc.c \
x86/x86enc.h \
x86/x86enquant.c \
- x86/mmxfrag.c \
- x86/mmxidct.c \
- x86/mmxloop.h \
- x86/mmxstate.c \
- x86/sse2idct.c \
- x86/x86int.h \
- x86/x86state.c \
+ x86/x86zigzag.h \
x86_vc
lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
@@ -215,11 +216,12 @@
arm/armcpu.h \
c64x/c64xdec.h \
c64x/c64xint.h \
- x86/x86cpu.h \
x86/mmxloop.h \
x86/sse2trans.h \
+ x86/x86cpu.h \
x86/x86enc.h \
- x86/x86int.h
+ x86/x86int.h \
+ x86/x86zigzag.h
libtheoradec_la_SOURCES = \
$(decoder_sources) \
Modified: trunk/theora/lib/analyze.c
===================================================================
--- trunk/theora/lib/analyze.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/analyze.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -667,8 +667,9 @@
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
oc_fr_state *_fr,oc_token_checkpoint **_stack){
+ ogg_int16_t *data;
ogg_int16_t *dct;
- ogg_int16_t *data;
+ ogg_int16_t *idct;
oc_qii_state qs;
const ogg_uint16_t *dequant;
ogg_uint16_t dequant_dc;
@@ -701,6 +702,7 @@
qii=frags[_fragi].qii;
data=_enc->pipe.dct_data;
dct=data+64;
+ idct=data+128;
if(qii&~3){
#if !defined(OC_COLLECT_METRICS)
if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
@@ -771,12 +773,12 @@
/*Tokenize.*/
checkpoint=*_stack;
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
- ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
- _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+ ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
+ nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
}
else{
- ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
- _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+ ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
+ nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
}
/*Reconstruct.
TODO: nonzero may need to be adjusted after tokenization.*/
@@ -798,8 +800,9 @@
else if(qi01>=0)qii=0;
}
else{
- data[0]=dc*dequant_dc;
- oc_idct8x8(&_enc->state,data,data,nonzero+1);
+ idct[0]=dc*dequant_dc;
+ /*Note: This clears idct[] back to zero for the next block.*/
+ oc_idct8x8(&_enc->state,data,idct,nonzero+1);
}
frags[_fragi].qii=qii;
if(nqis>1){
Modified: trunk/theora/lib/arm/armidct.s
===================================================================
--- trunk/theora/lib/arm/armidct.s 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/arm/armidct.s 2010-12-07 10:28:07 UTC (rev 17728)
@@ -64,11 +64,8 @@
BL idct8core_arm
BL idct8core_arm
LDR r0, [r13], #4 ; Write to the final destination.
- ; Clear input data for next block (decoder only).
SUB r2, r1, #8*16
- CMP r0, r2
- MOV r1, r13 ; And read from temp storage.
- BEQ oc_idct8x8_slow_arm_cols
+ ; Clear input data for next block.
MOV r4, #0
MOV r5, #0
MOV r6, #0
@@ -81,7 +78,7 @@
STMIA r2!,{r4,r5,r6,r7}
STMIA r2!,{r4,r5,r6,r7}
STMIA r2!,{r4,r5,r6,r7}
-oc_idct8x8_slow_arm_cols
+ MOV r1, r13 ; And read from temp storage.
; Column transforms
BL idct8core_down_arm
BL idct8core_down_arm
@@ -105,18 +102,15 @@
BL idct3core_arm
BL idct2core_arm
BL idct1core_arm
- ; Clear input data for next block (decoder only).
- SUB r0, r1, #4*16
- CMP r0, r2
- MOV r1, r13 ; Read from temp storage.
- BEQ oc_idct8x8_10_arm_cols
+ ; Clear input data for next block.
MOV r4, #0
- STR r4, [r0]
- STR r4, [r0,#4]
- STR r4, [r0,#16]
- STR r4, [r0,#20]
- STR r4, [r0,#32]
- STR r4, [r0,#48]
+ STR r4, [r1,#-4*16]!
+ STR r4, [r1,#4]
+ STR r4, [r1,#16]
+ STR r4, [r1,#20]
+ STR r4, [r1,#32]
+ STR r4, [r1,#48]
+ MOV r1, r13 ; Read from temp storage.
MOV r0, r2 ; Write to the final destination
oc_idct8x8_10_arm_cols
; Column transforms
@@ -141,18 +135,14 @@
BL idct3core_arm
BL idct2core_arm
BL idct1core_arm
- ; Clear input data for next block (decoder only).
- SUB r0, r1, #3*16
- CMP r0, r2
- MOV r1, r13 ; Read from temp storage.
- BEQ oc_idct8x8_6_arm_cols
+ ; Clear input data for next block.
MOV r4, #0
- STR r4, [r0]
- STR r4, [r0,#4]
- STR r4, [r0,#16]
- STR r4, [r0,#32]
+ STR r4, [r1,#-3*16]!
+ STR r4, [r1,#4]
+ STR r4, [r1,#16]
+ STR r4, [r1,#32]
+ MOV r1, r13 ; Read from temp storage.
MOV r0, r2 ; Write to the final destination
-oc_idct8x8_6_arm_cols
; Column transforms
BL idct3core_down_arm
BL idct3core_down_arm
@@ -174,14 +164,12 @@
MOV r0, r13 ; Write to temp storage.
BL idct2core_arm
BL idct1core_arm
- ; Clear input data for next block (decoder only).
- SUB r0, r1, #2*16
- CMP r0, r2
+ ; Clear input data for next block.
+ MOV r4, #0
+ STR r4, [r1,#-2*16]!
+ STR r4, [r1,#16]
MOV r1, r13 ; Read from temp storage.
- MOVNE r4, #0
- STRNE r4, [r0]
- STRNE r4, [r0,#16]
- MOVNE r0, r2 ; Write to the final destination
+ MOV r0, r2 ; Write to the final destination
; Column transforms
BL idct2core_down_arm
BL idct2core_down_arm
@@ -799,30 +787,26 @@
BL idct8_8core_v6
BL idct8_8core_v6
LDR r0, [r13], #4 ; Write to the final destination.
- ; Clear input data for next block (decoder only).
- SUB r2, r1, #8*16
- CMP r0, r2
- MOV r1, r13 ; And read from temp storage.
- BEQ oc_idct8x8_slow_v6_cols
+ ; Clear input data for next block.
MOV r4, #0
MOV r5, #0
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
- STRD r4, [r2], #8
-oc_idct8x8_slow_v6_cols
+ STRD r4, [r1,#-8*16]!
+ STRD r4, [r1,#8]
+ STRD r4, [r1,#16]
+ STRD r4, [r1,#24]
+ STRD r4, [r1,#32]
+ STRD r4, [r1,#40]
+ STRD r4, [r1,#48]
+ STRD r4, [r1,#56]
+ STRD r4, [r1,#64]
+ STRD r4, [r1,#72]
+ STRD r4, [r1,#80]
+ STRD r4, [r1,#88]
+ STRD r4, [r1,#96]
+ STRD r4, [r1,#104]
+ STRD r4, [r1,#112]
+ STRD r4, [r1,#120]
+ MOV r1, r13 ; And read from temp storage.
; Column transforms
BL idct8_8core_down_v6
BL idct8_8core_down_v6
@@ -843,20 +827,16 @@
BL idct4_3core_v6
BL idct2_1core_v6
LDR r0, [r13], #4 ; Write to the final destination.
- ; Clear input data for next block (decoder only).
- SUB r2, r1, #4*16
- CMP r0, r2
- AND r1, r13,#4 ; Align the stack.
- BEQ oc_idct8x8_10_v6_cols
+ ; Clear input data for next block.
MOV r4, #0
MOV r5, #0
- STRD r4, [r2]
- STRD r4, [r2,#16]
- STR r4, [r2,#32]
- STR r4, [r2,#48]
-oc_idct8x8_10_v6_cols
-; Column transforms
+ STRD r4, [r1,#-4*16]!
+ STRD r4, [r1,#16]
+ STR r4, [r1,#32]
+ STR r4, [r1,#48]
+ AND r1, r13,#4 ; Align the stack.
ADD r1, r1, r13 ; And read from temp storage.
+; Column transforms
BL idct4_4core_down_v6
BL idct4_4core_down_v6
BL idct4_4core_down_v6
@@ -872,14 +852,12 @@
MOV r8, r0
MOV r0, r13 ; Write to temp storage.
BL idct2_1core_v6
- ; Clear input data for next block (decoder only).
- SUB r0, r1, #2*16
- CMP r0, r8
+ ; Clear input data for next block.
+ MOV r4, #0
+ STR r4, [r1,#-2*16]!
+ STR r4, [r1,#16]
MOV r1, r13 ; Read from temp storage.
- MOVNE r4, #0
- STRNE r4, [r0]
- STRNE r4, [r0,#16]
- MOVNE r0, r8 ; Write to the final destination.
+ MOV r0, r8 ; Write to the final destination.
; Column transforms
BL idct2_2core_down_v6
BL idct2_2core_down_v6
@@ -1035,20 +1013,16 @@
ADD r0, r0, r13 ; Write to temp storage.
BL idct3_2core_v6
BL idct1core_v6
- ; Clear input data for next block (decoder only).
- SUB r0, r1, #3*16
- CMP r0, r8
- AND r1, r13,#4 ; Align the stack.
- BEQ oc_idct8x8_6_v6_cols
+ ; Clear input data for next block.
MOV r4, #0
MOV r5, #0
- STRD r4, [r0]
- STR r4, [r0,#16]
- STR r4, [r0,#32]
+ STRD r4, [r1,#-3*16]!
+ STR r4, [r1,#16]
+ STR r4, [r1,#32]
+ AND r1, r13,#4 ; Align the stack.
MOV r0, r8 ; Write to the final destination.
-oc_idct8x8_6_v6_cols
-; Column transforms
ADD r1, r1, r13 ; And read from temp storage.
+; Column transforms
BL idct3_3core_down_v6
BL idct3_3core_down_v6
BL idct3_3core_down_v6
@@ -1590,7 +1564,6 @@
VSWP D23,D30
; Column transforms
BL oc_idct8x8_stage123_neon
- CMP r0,r1
; We have to put the return address back in the LR, or the branch
; predictor will not recognize the function return and mis-predict the
; entire call stack.
@@ -1604,7 +1577,6 @@
VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
- BEQ oc_idct8x8_slow_neon_noclear
VMOV.I8 Q2,#0
VPOP {D8-D15}
VMOV.I8 Q3,#0
@@ -1622,19 +1594,6 @@
VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
-
-oc_idct8x8_slow_neon_noclear
- VPOP {D8-D15}
- VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
- VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
- VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
- VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
- VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
- VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
- VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
- VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
- VSTMIA r0, {D16-D31}
- MOV PC, r14
ENDP
oc_idct8x8_stage123_neon PROC
@@ -1865,7 +1824,6 @@
VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2]
VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2]
; Stage 4
- CMP r0, r1
VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]'
VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]''
VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]'
@@ -1874,7 +1832,6 @@
VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]'
VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]'
VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]''
- BEQ oc_idct8x8_10_neon_noclear
VMOV.I8 D2, #0
VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
VST1.64 {D2}, [r1@64], r12
@@ -1890,18 +1847,6 @@
VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
-
-oc_idct8x8_10_neon_noclear
- VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
- VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
- VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
- VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
- VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
- VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
- VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
- VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
- VSTMIA r0, {D16-D31}
- MOV PC, r14
ENDP
]
Modified: trunk/theora/lib/c64x/c64xidct.c
===================================================================
--- trunk/theora/lib/c64x/c64xidct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/c64x/c64xidct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -319,12 +319,10 @@
/*Transform rows of x into columns of w.*/
for(i=0;i<8;i+=2){
OC_IDCT8x2_LOAD8(_x+i*8);
- if(_x!=_y){
- _amem8(_x+i*8)=0LL;
- _amem8(_x+i*8+4)=0LL;
- _amem8(_x+i*8+8)=0LL;
- _amem8(_x+i*8+12)=0LL;
- }
+ _amem8(_x+i*8)=0LL;
+ _amem8(_x+i*8+4)=0LL;
+ _amem8(_x+i*8+8)=0LL;
+ _amem8(_x+i*8+12)=0LL;
OC_IDCT8x2();
OC_IDCT8x2_STORET(w+i);
}
@@ -357,12 +355,10 @@
OC_IDCT8x2_4();
OC_IDCT8x2_STORET(w);
OC_IDCT8x2_LOAD2(_x+16);
- if(_x!=_y){
- _amem8(_x)=0LL;
- _amem8(_x+8)=0LL;
- _amem4(_x+16)=0;
- _amem4(_x+24)=0;
- }
+ _amem8(_x)=0LL;
+ _amem8(_x+8)=0LL;
+ _amem4(_x+16)=0;
+ _amem4(_x+24)=0;
OC_IDCT8x2_2();
OC_IDCT8x2_STORET(w+2);
/*Transform rows of w into columns of y.*/
@@ -398,10 +394,8 @@
OC_IDCT8x2_2();
OC_IDCT8x2_STORE(w+i*8);
}
- if(_x!=_y){
- _amem4(_x)=0;
- _amem4(_x+8)=0;
- }
+ _amem4(_x)=0;
+ _amem4(_x+8)=0;
/*Transform columns of w into columns of y.*/
for(i=0;i<8;i+=2){
OC_IDCT8x2_LOAD2T(w+i);
Modified: trunk/theora/lib/encint.h
===================================================================
--- trunk/theora/lib/encint.h 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/encint.h 2010-12-07 10:28:07 UTC (rev 17728)
@@ -444,7 +444,7 @@
This is kept off the stack because a) gcc can't align things on the stack
reliably on ARM, and b) it avoids (unintentional) data hazards between
ARM and NEON code.*/
- OC_ALIGN16(ogg_int16_t dct_data[128]);
+ OC_ALIGN16(ogg_int16_t dct_data[64*3]);
OC_ALIGN16(signed char bounding_values[256]);
oc_fr_state fr[3];
oc_qii_state qs[3];
@@ -765,10 +765,12 @@
void oc_enc_tokenize_start(oc_enc_ctx *_enc);
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
const oc_token_checkpoint *_stack,int _n);
Modified: trunk/theora/lib/enquant.c
===================================================================
--- trunk/theora/lib/enquant.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/enquant.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -229,7 +229,7 @@
enquant=(const oc_iquant *)_enquant;
nonzero=0;
for(zzi=0;zzi<64;zzi++){
- val=_dct[OC_FZIG_ZAG[zzi]];
+ val=_dct[zzi];
d=_dequant[zzi];
val=val<<1;
if(abs(val)>=d){
Modified: trunk/theora/lib/fdct.c
===================================================================
--- trunk/theora/lib/fdct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/fdct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -147,7 +147,7 @@
/*Round the result back to the external working precision (which is still
scaled by four relative to the orthogonal result).
TODO: We should just update the external working precision.*/
- for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
+ for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2;
}
Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/idct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -241,8 +241,8 @@
for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
/*Adjust for the scale factor.*/
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
- /*Clear input data for next block (decoder only).*/
- if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
+ /*Clear input data for next block.*/
+ _x[0]=_x[1]=_x[8]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
@@ -272,8 +272,8 @@
for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
/*Adjust for the scale factor.*/
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
- /*Clear input data for next block (decoder only).*/
- if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
+ /*Clear input data for next block.*/
+ _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
@@ -291,7 +291,8 @@
for(i=0;i<8;i++)idct8(_y+i,w+i*8);
/*Adjust for the scale factor.*/
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
- if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
+ /*Clear input data for next block.*/
+ for(i=0;i<64;i++)_x[i]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
Modified: trunk/theora/lib/tokenize.c
===================================================================
--- trunk/theora/lib/tokenize.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/tokenize.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -454,9 +454,10 @@
/*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
dequantizes and de-zig-zags the result.
- The DC coefficient is not preserved; it should be restored by the caller.*/
+ The AC coefficients of _idct must be pre-initialized to zero.*/
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_idct,const ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
oc_token_checkpoint *stack;
ogg_int64_t zflags;
@@ -501,7 +502,7 @@
qc=_qdct[zzi];
s=-(qc<0);
qc_m=qc+s^s;
- c=_dct[OC_FZIG_ZAG[zzi]];
+ c=_dct[zzi];
/*The hard case: try a zero run.*/
if(qc_m<=1){
ogg_uint32_t sum_d2;
@@ -565,7 +566,7 @@
/*Try a +/- 1 combo token.*/
token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1];
eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-val_s];
- e=_dct[OC_FZIG_ZAG[zzj]]-(_dequant[zzj]+val_s^val_s);
+ e=_dct[zzj]-(_dequant[zzj]+val_s^val_s);
d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
bits=oc_token_bits(_enc,huffi,zzi,token);
cost=d2+_lambda*bits+tokens[zzk][tk].cost;
@@ -585,7 +586,7 @@
bits=oc_token_bits(_enc,huffi,zzi,token);
val=2+(val>2);
sval=val+val_s^val_s;
- e=_dct[OC_FZIG_ZAG[zzj]]-_dequant[zzj]*sval;
+ e=_dct[zzj]-_dequant[zzj]*sval;
d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
cost=d2+_lambda*bits+tokens[zzk][tk].cost;
if(cost<=best_cost){
@@ -701,9 +702,6 @@
}
/*Emit the tokens from the best path through the trellis.*/
stack=*_stack;
- /*We blow away the first entry here so that things vectorize better.
- The DC coefficient is not actually stored in the array yet.*/
- for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
zzi=1;
ti=best_flags>>1&1;
@@ -737,7 +735,7 @@
zzj=(next>>1)-1&63;
/*TODO: It may be worth saving the dequantized coefficient in the trellis
above; we had to compute it to measure the error anyway.*/
- _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+ _idct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
zzi=next>>1;
ti=next&1;
}
@@ -747,16 +745,15 @@
}
/*Simplistic R/D tokenizer.
+ The AC coefficients of _idct must be pre-initialized to zero.
This could be made more accurate by using more sophisticated
rate predictions for zeros.
It could be made faster by switching from R/D decisions to static
lambda-derived rounding biases.*/
int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_idct,const ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
- /*Note that gcc will not always respect this alignment.
- In this case it doesn't matter terribly much.*/
- OC_ALIGN16(ogg_int16_t coef[64]);
const unsigned char *dct_fzig_zag;
ogg_uint16_t *eob_run;
oc_token_checkpoint *stack;
@@ -779,9 +776,7 @@
eob_run=_enc->eob_run[_pli];
dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
- memcpy(coef,_qdct,_zzi*sizeof(*coef));
- for(zzj=0;zzj<64;zzj++)_qdct[zzj]=0;
- for(zzj=zzi=1;zzj<_zzi&&!coef[zzj];zzj++);
+ for(zzj=zzi=1;zzj<_zzi&&!_qdct[zzj];zzj++);
while(zzj<_zzi){
int v;
int d0;
@@ -797,10 +792,10 @@
int eob_bits;
int dct_fzig_zzj;
dct_fzig_zzj=dct_fzig_zag[zzj];
- v=_dct[OC_FZIG_ZAG[zzj]];
- d0=coef[zzj];
+ v=_dct[zzj];
+ d0=_qdct[zzj];
eob=eob_run[zzi];
- for(zzk=zzj+1;zzk<_zzi&&!coef[zzk];zzk++);
+ for(zzk=zzj+1;zzk<_zzi&&!_qdct[zzk];zzk++);
next_zero=zzk-zzj+62>>6;
dq0=d0*_dequant[zzj];
dd0=dq0-v;
@@ -840,7 +835,7 @@
cost=dd1+zr[next_zero];
}
if((dd0+(best_bits+eob_bits)*_lambda)>cost){
- _qdct[dct_fzig_zzj]=dq1;
+ _idct[dct_fzig_zzj]=dq1;
if(d1==0){
zzj=zzk;
continue;
@@ -851,7 +846,7 @@
}
else{
best_eb=*(OC_DCT_VALUE_EB_PTR+d0);
- _qdct[dct_fzig_zzj]=dq0;
+ _idct[dct_fzig_zzj]=dq0;
}
oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
if(eob>0){
@@ -927,7 +922,6 @@
}
best_cost=dd0+(best_bits+eob_bits)*_lambda;
if(d1==0&&(dd1+zr[2+next_zero])<=best_cost){
- _qdct[dct_fzig_zzj]=0;
zzj=zzk;
continue;
}
@@ -936,9 +930,9 @@
best_token=best_token1;
best_eb=best_eb1;
d=d1;
- _qdct[dct_fzig_zzj]=dq1;
+ _idct[dct_fzig_zzj]=dq1;
}
- else _qdct[dct_fzig_zzj]=dq0;
+ else _idct[dct_fzig_zzj]=dq0;
oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
if(eob){
oc_enc_eob_log(_enc,_pli,zzi,eob);
Modified: trunk/theora/lib/x86/mmxfdct.c
===================================================================
--- trunk/theora/lib/x86/mmxfdct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/mmxfdct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -12,6 +12,7 @@
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
+#include "x86zigzag.h"
#if defined(OC_X86_ASM)
@@ -462,8 +463,9 @@
mm7 = d3 c3 b3 a3*/ \
/*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+ ogg_int16_t buf[64] __attribute__((aligned(8)));
+ ptrdiff_t a;
__asm__ __volatile__(
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
@@ -586,78 +588,89 @@
"movq 0x30(%[y]),%%mm3\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
- OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
- /*mm0={-2}x4*/
- "pcmpeqw %%mm0,%%mm0\n\t"
- "paddw %%mm0,%%mm0\n\t"
- /*Round the results.*/
- "psubw %%mm0,%%mm1\n\t"
- "psubw %%mm0,%%mm2\n\t"
- "psraw $2,%%mm1\n\t"
- "psubw %%mm0,%%mm3\n\t"
- "movq %%mm1,0x18(%[y])\n\t"
- "psraw $2,%%mm2\n\t"
- "psubw %%mm0,%%mm4\n\t"
- "movq 0x08(%[y]),%%mm1\n\t"
- "psraw $2,%%mm3\n\t"
- "psubw %%mm0,%%mm5\n\t"
+ /*mm2={-2}x4*/
+ "pcmpeqw %%mm2,%%mm2\n\t"
+ "paddw %%mm2,%%mm2\n\t"
+ /*Round and store the results (no transpose).*/
+ "movq 0x10(%[y]),%%mm7\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
- "psubw %%mm0,%%mm6\n\t"
+ "psubw %%mm2,%%mm0\n\t"
+ "movq %%mm4,0x00(%[buf])\n\t"
+ "movq 0x30(%[y]),%%mm4\n\t"
+ "psraw $2,%%mm6\n\t"
+ "psubw %%mm2,%%mm5\n\t"
+ "movq %%mm6,0x20(%[buf])\n\t"
+ "psraw $2,%%mm0\n\t"
+ "psubw %%mm2,%%mm3\n\t"
+ "movq %%mm0,0x40(%[buf])\n\t"
"psraw $2,%%mm5\n\t"
- "psubw %%mm0,%%mm7\n\t"
- "psraw $2,%%mm6\n\t"
- "psubw %%mm0,%%mm1\n\t"
+ "psubw %%mm2,%%mm1\n\t"
+ "movq %%mm5,0x50(%[buf])\n\t"
+ "psraw $2,%%mm3\n\t"
+ "psubw %%mm2,%%mm7\n\t"
+ "movq %%mm3,0x60(%[buf])\n\t"
+ "psraw $2,%%mm1\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "movq %%mm1,0x70(%[buf])\n\t"
"psraw $2,%%mm7\n\t"
+ "movq %%mm7,0x10(%[buf])\n\t"
+ "psraw $2,%%mm4\n\t"
+ "movq %%mm4,0x30(%[buf])\n\t"
+ /*Load the next block.*/
"movq 0x40(%[y]),%%mm0\n\t"
- "psraw $2,%%mm1\n\t"
- "movq %%mm7,0x30(%[y])\n\t"
"movq 0x78(%[y]),%%mm7\n\t"
- "movq %%mm1,0x08(%[y])\n\t"
"movq 0x50(%[y]),%%mm1\n\t"
- "movq %%mm6,0x20(%[y])\n\t"
"movq 0x68(%[y]),%%mm6\n\t"
- "movq %%mm2,0x28(%[y])\n\t"
"movq 0x60(%[y]),%%mm2\n\t"
- "movq %%mm5,0x10(%[y])\n\t"
"movq 0x58(%[y]),%%mm5\n\t"
- "movq %%mm3,0x38(%[y])\n\t"
"movq 0x70(%[y]),%%mm3\n\t"
- "movq %%mm4,0x00(%[y])\n\t"
"movq 0x48(%[y]),%%mm4\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
- OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
- /*mm0={-2}x4*/
- "pcmpeqw %%mm0,%%mm0\n\t"
- "paddw %%mm0,%%mm0\n\t"
- /*Round the results.*/
- "psubw %%mm0,%%mm1\n\t"
- "psubw %%mm0,%%mm2\n\t"
- "psraw $2,%%mm1\n\t"
- "psubw %%mm0,%%mm3\n\t"
- "movq %%mm1,0x58(%[y])\n\t"
- "psraw $2,%%mm2\n\t"
- "psubw %%mm0,%%mm4\n\t"
- "movq 0x48(%[y]),%%mm1\n\t"
- "psraw $2,%%mm3\n\t"
- "psubw %%mm0,%%mm5\n\t"
- "movq %%mm2,0x68(%[y])\n\t"
+ /*mm2={-2}x4*/
+ "pcmpeqw %%mm2,%%mm2\n\t"
+ "paddw %%mm2,%%mm2\n\t"
+ /*Round and store the results (no transpose).*/
+ "movq 0x50(%[y]),%%mm7\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
- "psubw %%mm0,%%mm6\n\t"
- "movq %%mm3,0x78(%[y])\n\t"
+ "psubw %%mm2,%%mm0\n\t"
+ "movq %%mm4,0x08(%[buf])\n\t"
+ "movq 0x70(%[y]),%%mm4\n\t"
+ "psraw $2,%%mm6\n\t"
+ "psubw %%mm2,%%mm5\n\t"
+ "movq %%mm6,0x28(%[buf])\n\t"
+ "psraw $2,%%mm0\n\t"
+ "psubw %%mm2,%%mm3\n\t"
+ "movq %%mm0,0x48(%[buf])\n\t"
"psraw $2,%%mm5\n\t"
- "psubw %%mm0,%%mm7\n\t"
- "movq %%mm4,0x40(%[y])\n\t"
- "psraw $2,%%mm6\n\t"
- "psubw %%mm0,%%mm1\n\t"
- "movq %%mm5,0x50(%[y])\n\t"
+ "psubw %%mm2,%%mm1\n\t"
+ "movq %%mm5,0x58(%[buf])\n\t"
+ "psraw $2,%%mm3\n\t"
+ "psubw %%mm2,%%mm7\n\t"
+ "movq %%mm3,0x68(%[buf])\n\t"
+ "psraw $2,%%mm1\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "movq %%mm1,0x78(%[buf])\n\t"
"psraw $2,%%mm7\n\t"
- "movq %%mm6,0x60(%[y])\n\t"
- "psraw $2,%%mm1\n\t"
- "movq %%mm7,0x70(%[y])\n\t"
- "movq %%mm1,0x48(%[y])\n\t"
+ "movq %%mm7,0x18(%[buf])\n\t"
+ "psraw $2,%%mm4\n\t"
+ "movq %%mm4,0x38(%[buf])\n\t"
+ /*Final transpose and zig-zag.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+ "movq 0x"_row"0(%[buf]),"_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+ "movq 0x"_row"8(%[buf]),"_reg"\n\t" \
+
+ OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
:[a]"=&r"(a)
- :[y]"r"(_y),[x]"r"(_x)
+ :[y]"r"(_y),[x]"r"(_x),[buf]"r"(buf)
:"memory"
);
}
Modified: trunk/theora/lib/x86/mmxidct.c
===================================================================
--- trunk/theora/lib/x86/mmxidct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/mmxidct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -284,6 +284,7 @@
"#end OC_COLUMN_IDCT\n\t" \
static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ int i;
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm__ __volatile__(
@@ -313,18 +314,15 @@
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
);
- if(_x!=_y){
- int i;
- __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
- for(i=0;i<4;i++){
- __asm__ __volatile__(
- "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
- :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
- );
- }
+ __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+ :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+ );
}
}
@@ -514,16 +512,14 @@
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
);
- if(_x!=_y){
- __asm__ __volatile__(
- "pxor %%mm0,%%mm0\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
- :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
- );
- }
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+ );
}
/*Performs an inverse 8x8 Type-II DCT transform.
Modified: trunk/theora/lib/x86/sse2fdct.c
===================================================================
--- trunk/theora/lib/x86/sse2fdct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/sse2fdct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -13,6 +13,7 @@
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include <stddef.h>
#include "x86enc.h"
+#include "x86zigzag.h"
#include "sse2trans.h"
#if defined(OC_X86_64_ASM)
@@ -412,8 +413,6 @@
/*Transform rows.*/
OC_TRANSPOSE_8x8
OC_FDCT_8x8
- /*TODO: zig-zag ordering?*/
- OC_TRANSPOSE_8x8
/*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
"paddw %%xmm14,%%xmm14\n\t"
"psubw %%xmm14,%%xmm0\n\t"
@@ -432,15 +431,19 @@
"psubw %%xmm14,%%xmm7\n\t"
"psraw $2,%%xmm6\n\t"
"psraw $2,%%xmm7\n\t"
- /*Store the result.*/
- "movdqa %%xmm0,0x00(%[y])\n\t"
- "movdqa %%xmm1,0x10(%[y])\n\t"
- "movdqa %%xmm2,0x20(%[y])\n\t"
- "movdqa %%xmm3,0x30(%[y])\n\t"
- "movdqa %%xmm4,0x40(%[y])\n\t"
- "movdqa %%xmm5,0x50(%[y])\n\t"
- "movdqa %%xmm6,0x60(%[y])\n\t"
- "movdqa %%xmm7,0x70(%[y])\n\t"
+ /*Transpose, zig-zag, and store the result.*/
+ /*We could probably do better using SSSE3's palignr, but re-using the MMXEXT
+ version will do for now.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+ "movdq2q %%xmm"_row","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+ "punpckhqdq %%xmm"_row",%%xmm"_row"\n\t" \
+ "movdq2q %%xmm"_row","_reg"\n\t" \
+
+ OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
:[a]"=&r"(a)
:[y]"r"(_y),[x]"r"(_x)
:"memory"
Modified: trunk/theora/lib/x86/sse2idct.c
===================================================================
--- trunk/theora/lib/x86/sse2idct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/sse2idct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -208,6 +208,7 @@
static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
+ int i;
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
/*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
@@ -230,19 +231,16 @@
:[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
);
- if(_x!=_y){
- int i;
- __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
- /*Clear input data for next block (decoder only).*/
- for(i=0;i<2;i++){
- __asm__ __volatile__(
- "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
- "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
- "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
- "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
- :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
- );
- }
+ __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+ /*Clear input data for next block.*/
+ for(i=0;i<2;i++){
+ __asm__ __volatile__(
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+ );
}
}
@@ -411,17 +409,15 @@
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
);
- if(_x!=_y){
- /*Clear input data for next block (decoder only).*/
- __asm__ __volatile__(
- "pxor %%mm0,%%mm0\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
- "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
- :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
- );
- }
+ /*Clear input data for next block.*/
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+ );
}
/*Performs an inverse 8x8 Type-II DCT transform.
Modified: trunk/theora/lib/x86/x86enc.c
===================================================================
--- trunk/theora/lib/x86/x86enc.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/x86enc.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -28,7 +28,6 @@
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
- _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
}
if(cpu_flags&OC_CPU_X86_MMXEXT){
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
@@ -38,6 +37,7 @@
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+ _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
}
if(cpu_flags&OC_CPU_X86_SSE2){
# if defined(OC_X86_64_ASM)
Modified: trunk/theora/lib/x86/x86enc.h
===================================================================
--- trunk/theora/lib/x86/x86enc.h 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/x86enc.h 2010-12-07 10:28:07 UTC (rev 17728)
@@ -105,7 +105,7 @@
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
const ogg_uint16_t _dequant[64],const void *_enquant);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
# if defined(OC_X86_64_ASM)
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
Modified: trunk/theora/lib/x86/x86enquant.c
===================================================================
--- trunk/theora/lib/x86/x86enquant.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/x86enquant.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -56,125 +56,17 @@
}
}
-/*Convert DCT coefficients in %[dct] from natural order into zig-zag scan order
- and store them in %[qdct].
- The index of each output element in the original 64-element array should wind
- up in the following 8x8 matrix (the letters indicate the order we compute
- each 4-tuple below):
- A 0 1 8 16 9 2 3 10 B
- C 17 24 32 25 18 11 4 5 D
- E 12 19 26 33 40 48 41 34 I
- H 27 20 13 6 7 14 21 28 G
- K 35 42 49 56 57 50 43 36 J
- F 29 22 15 23 30 37 44 51 M
- P 58 59 52 45 38 31 39 46 L
- N 53 60 61 54 47 55 62 63 O
- The order of the coefficients within each tuple is reversed in the comments
- below to reflect the usual MSB to LSB notation.*/
-#define OC_ZIG_ZAG_MMXEXT \
- "movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
- "movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
- "movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
- "movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
- "movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
- "movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
- "movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
- "punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
- "movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
- "punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
- "movq %%mm0,0x00(%[qdct])\n\t" \
- "movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
- "punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
- "psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
- "punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
- "punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
- "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
- "movq %%mm6,0x08(%[qdct])\n\t" \
- "psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
- "movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
- "punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
- "movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
- "punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
- "por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
- "punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \
- "movq %%mm2,0x10(%[qdct])\n\t" \
- "movq %%mm3,0x18(%[qdct])\n\t" \
- "movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
- "movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
- "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
- "punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
- "punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
- "punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
- "punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
- "punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
- "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
- "psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
- "punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
- "punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
- "punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
- "movq %%mm0,0x20(%[qdct])\n\t" \
- "movq %%mm3,0x50(%[qdct])\n\t" \
- "movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
- "movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
- "movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
- "punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
- "psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
- "movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
- "punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
- "punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
- "movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
- "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
- "movq %%mm2,0x30(%[qdct])\n\t" \
- "movq %%mm6,0x38(%[qdct])\n\t" \
- "movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
- "punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
- "movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
- "punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
- "psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
- "punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
- "punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
- "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
- "movq %%mm0,0x28(%[qdct])\n\t" \
- "punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
- "punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
- "punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
- "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
- "movq %%mm4,0x40(%[qdct])\n\t" \
- "movq %%mm6,0x48(%[qdct])\n\t" \
- "movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
- "movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
- "psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
- "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
- "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
- "punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
- "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
- "punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
- "punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
- "movq %%mm2,0x68(%[qdct])\n\t" \
- "movq %%mm1,0x58(%[qdct])\n\t" \
- "punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
- "punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
- "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
- "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
- "punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
- "punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
- "movq %%mm6,0x70(%[qdct])\n\t" \
- "movq %%mm5,0x78(%[qdct])\n\t" \
- "movq %%mm7,0x60(%[qdct])\n\t" \
-
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
const ogg_uint16_t _dequant[64],const void *_enquant){
ptrdiff_t r;
__asm__ __volatile__(
- /*Put the input in zig-zag order.*/
- OC_ZIG_ZAG_MMXEXT
"xor %[r],%[r]\n\t"
/*Loop through two rows at a time.*/
".p2align 4\n\t"
"0:\n\t"
/*Load the first two rows of the data and the quant matrices.*/
- "movdqa 0x00(%[qdct],%[r]),%%xmm0\n\t"
- "movdqa 0x10(%[qdct],%[r]),%%xmm1\n\t"
+ "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
+ "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
"movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
"movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
"movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
Added: trunk/theora/lib/x86/x86zigzag.h
===================================================================
--- trunk/theora/lib/x86/x86zigzag.h (rev 0)
+++ trunk/theora/lib/x86/x86zigzag.h 2010-12-07 10:28:07 UTC (rev 17728)
@@ -0,0 +1,244 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86zigzag_H)
+# define _x86_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+ stores them in %[y].
+ This relies on two macros to load the contents of each row:
+ OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
+ first four and second four entries of each row into the specified register,
+ respectively.
+ OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+ (because when the rows are already in SSE2 registers, loading the high half
+ destructively modifies the register).
+ The index of each output element in the original 64-element array should wind
+ up in the following 8x8 matrix (the letters indicate the order we compute
+ each 4-tuple below):
+ A 0 8 1 2 9 16 24 17 B
+ C 10 3 4 11 18 25 32 40 E
+ F 33 26 19 12 5 6 13 20 D
+ G 27 34 41 48 56 49 42 35 I
+ L 28 21 14 7 15 22 29 36 M
+ H 43 50 57 58 51 44 37 30 O
+ N 23 31 38 45 52 59 60 53 J
+ P 46 39 47 54 61 62 55 63 K
+ The order of the coefficients within each tuple is reversed in the comments
+ below to reflect the usual MSB to LSB notation.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+ OC_ZZ_LOAD_ROW_LO("0","%%mm0") /*mm0=03 02 01 00*/ \
+ OC_ZZ_LOAD_ROW_LO("1","%%mm1") /*mm1=11 10 09 08*/ \
+ OC_ZZ_LOAD_ROW_LO("2","%%mm2") /*mm2=19 18 17 16*/ \
+ OC_ZZ_LOAD_ROW_LO("3","%%mm3") /*mm3=27 26 25 24*/ \
+ OC_ZZ_LOAD_ROW_HI("0","%%mm4") /*mm4=07 06 05 04*/ \
+ OC_ZZ_LOAD_ROW_HI("1","%%mm5") /*mm5=15 14 13 12*/ \
+ OC_ZZ_LOAD_ROW_HI("2","%%mm6") /*mm6=23 22 21 20*/ \
+ "movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \
+ "punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \
+ "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
+ "punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \
+ "pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
+ "punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \
+ "movq %%mm7,0x00(%[y])\n\t" \
+ "punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \
+ "movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \
+ "punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \
+ "punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \
+ "punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \
+ "punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \
+ "punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \
+ OC_ZZ_LOAD_ROW_LO("4","%%mm2") /*mm2=35 34 33 32*/ \
+ "movq %%mm1,0x08(%[y])\n\t" \
+ OC_ZZ_LOAD_ROW_LO("5","%%mm1") /*mm1=43 42 41 40*/ \
+ "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
+ "movq %%mm0,0x10(%[y])\n\t" \
+ "punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \
+ "punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \
+ "movq %%mm4,0x28(%[y])\n\t" \
+ "psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \
+ "pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
+ "movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \
+ "punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \
+ "punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \
+ "punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \
+ "punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \
+ "punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \
+ OC_ZZ_LOAD_ROW_LO("6","%%mm0") /*mm0=51 50 49 48*/ \
+ "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
+ "movq %%mm4,0x18(%[y])\n\t" \
+ OC_ZZ_LOAD_ROW_LO("7","%%mm4") /*mm4=59 58 57 56*/ \
+ "punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \
+ "movq %%mm2,0x20(%[y])\n\t" \
+ "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
+ "pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
+ "movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \
+ "punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \
+ "pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
+ "punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \
+ "movq %%mm3,0x30(%[y])\n\t" \
+ "punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \
+ "movq %%mm1,0x50(%[y])\n\t" \
+ OC_ZZ_LOAD_ROW_HI("7","%%mm1") /*mm1=63 62 61 60*/ \
+ "punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \
+ OC_ZZ_LOAD_ROW_HI("6","%%mm0") /*mm0=55 54 53 52*/ \
+ "psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \
+ "movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \
+ "punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \
+ OC_ZZ_LOAD_ROW_HI("3","%%mm2") /*mm2=31 30 29 28*/ \
+ "movq %%mm4,0x38(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \
+ "punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \
+ "movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \
+ "punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \
+ "punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \
+ OC_ZZ_LOAD_ROW_HI("4","%%mm0") /*mm0=39 38 37 36*/ \
+ "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
+ "movq %%mm3,0x68(%[y])\n\t" \
+ "movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \
+ "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
+ "punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \
+ OC_ZZ_LOAD_ROW_HI("5","%%mm1") /*mm1=47 46 45 44*/ \
+ "movq %%mm4,0x78(%[y])\n\t" \
+ "punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \
+ "punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \
+ "punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \
+ "pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
+ "pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
+ "movq %%mm5,0x40(%[y])\n\t" \
+ "punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \
+ "movq %%mm7,0x48(%[y])\n\t" \
+ "pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
+ "punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \
+ "punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \
+ "punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \
+ "movq %%mm6,0x60(%[y])\n\t" \
+ "punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \
+ "punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \
+ "movq %%mm3,0x58(%[y])\n\t" \
+ "pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
+ "movq %%mm0,0x70(%[y])\n\t" \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+ order and stores them in %[qdct].
+ The index of each output element in the original 64-element array should wind
+ up in the following 8x8 matrix (the letters indicate the order we compute
+ each 4-tuple below):
+ A 0 1 8 16 9 2 3 10 B
+ D 17 24 32 25 18 11 4 5 C
+ E 12 19 26 33 40 48 41 34 I
+ H 27 20 13 6 7 14 21 28 G
+ K 35 42 49 56 57 50 43 36 J
+ F 29 22 15 23 30 37 44 51 M
+ P 58 59 52 45 38 31 39 46 L
+ N 53 60 61 54 47 55 62 63 O
+ The order of the coefficients within each tuple is reversed in the comments
+ below to reflect the usual MSB to LSB notation.*/
+#define OC_ZIG_ZAG_MMXEXT \
+ "movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
+ "movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
+ "movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
+ "movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
+ "movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
+ "movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
+ "movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
+ "punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
+ "movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
+ "punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
+ "movq %%mm0,0x00(%[qdct])\n\t" \
+ "movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
+ "punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
+ "psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
+ "punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
+ "punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
+ "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+ "movq %%mm6,0x08(%[qdct])\n\t" \
+ "psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
+ "movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
+ "punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
+ "movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
+ "punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
+ "por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
+ "punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \
+ "movq %%mm2,0x10(%[qdct])\n\t" \
+ "movq %%mm3,0x18(%[qdct])\n\t" \
+ "movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
+ "movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
+ "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+ "punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
+ "punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
+ "punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
+ "punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
+ "punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
+ "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+ "psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
+ "punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
+ "punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
+ "punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
+ "movq %%mm0,0x20(%[qdct])\n\t" \
+ "movq %%mm3,0x50(%[qdct])\n\t" \
+ "movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
+ "movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
+ "movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
+ "punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
+ "psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
+ "movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
+ "punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
+ "punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
+ "movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
+ "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+ "movq %%mm2,0x30(%[qdct])\n\t" \
+ "movq %%mm6,0x38(%[qdct])\n\t" \
+ "movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
+ "punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
+ "movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
+ "punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
+ "psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
+ "punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
+ "punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
+ "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+ "movq %%mm0,0x28(%[qdct])\n\t" \
+ "punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
+ "punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
+ "punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
+ "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+ "movq %%mm4,0x40(%[qdct])\n\t" \
+ "movq %%mm6,0x48(%[qdct])\n\t" \
+ "movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
+ "movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
+ "psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
+ "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+ "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+ "punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
+ "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+ "punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
+ "punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
+ "movq %%mm2,0x68(%[qdct])\n\t" \
+ "movq %%mm1,0x58(%[qdct])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
+ "punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
+ "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+ "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
+ "punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
+ "punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
+ "movq %%mm6,0x70(%[qdct])\n\t" \
+ "movq %%mm5,0x78(%[qdct])\n\t" \
+ "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif
Modified: trunk/theora/lib/x86_vc/mmxfdct.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxfdct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86_vc/mmxfdct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -12,6 +12,7 @@
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
+#include "x86zigzag.h"
#if defined(OC_X86_ASM)
@@ -463,11 +464,13 @@
/*MMX implementation of the fDCT.*/
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+ __declspec (align(8)) ogg_int16_t buf[64];
ptrdiff_t a;
__asm{
+#define X edx
#define Y eax
#define A ecx
-#define X edx
+#define BUF esi
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
/*We also add biases to correct for some systematic error that remains in
@@ -591,79 +594,90 @@
movq mm3,[0x30+Y]
OC_FDCT_STAGE1_8x4
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
- OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
/*mm0={-2}x4*/
- pcmpeqw mm0,mm0
- paddw mm0,mm0
- /*Round the results.*/
- psubw mm1,mm0
- psubw mm2,mm0
- psraw mm1,2
- psubw mm3,mm0
- movq [0x18+Y],mm1
- psraw mm2,2
- psubw mm4,mm0
- movq mm1,[0x08+Y]
- psraw mm3,2
- psubw mm5,mm0
+ pcmpeqw mm2,mm2
+ paddw mm2,mm2
+ /*Round and store the results (no transpose).*/
+ movq mm7,[Y+0x10]
+ psubw mm4,mm2
+ psubw mm6,mm2
psraw mm4,2
- psubw mm6,mm0
+ psubw mm0,mm2
+ movq [BUF+0x00],mm4
+ movq mm4,[Y+0x30]
+ psraw mm6,2
+ psubw mm5,mm2
+ movq [BUF+0x20],mm6
+ psraw mm0,2
+ psubw mm3,mm2
+ movq [BUF+0x40],mm0
psraw mm5,2
- psubw mm7,mm0
- psraw mm6,2
- psubw mm1,mm0
+ psubw mm1,mm2
+ movq [BUF+0x50],mm5
+ psraw mm3,2
+ psubw mm7,mm2
+ movq [BUF+0x60],mm3
+ psraw mm1,2
+ psubw mm4,mm2
+ movq [BUF+0x70],mm1
psraw mm7,2
+ movq [BUF+0x10],mm7
+ psraw mm4,2
+ movq [BUF+0x30],mm4
+ /*Load the next block.*/
movq mm0,[0x40+Y]
- psraw mm1,2
- movq [0x30+Y],mm7
movq mm7,[0x78+Y]
- movq [0x08+Y],mm1
movq mm1,[0x50+Y]
- movq [0x20+Y],mm6
movq mm6,[0x68+Y]
- movq [0x28+Y],mm2
movq mm2,[0x60+Y]
- movq [0x10+Y],mm5
movq mm5,[0x58+Y]
- movq [0x38+Y],mm3
movq mm3,[0x70+Y]
- movq [0x00+Y],mm4
movq mm4,[0x48+Y]
OC_FDCT_STAGE1_8x4
OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
- OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
/*mm0={-2}x4*/
- pcmpeqw mm0,mm0
- paddw mm0,mm0
- /*Round the results.*/
- psubw mm1,mm0
- psubw mm2,mm0
- psraw mm1,2
- psubw mm3,mm0
- movq [0x58+Y],mm1
- psraw mm2,2
- psubw mm4,mm0
- movq mm1,[0x48+Y]
- psraw mm3,2
- psubw mm5,mm0
- movq [0x68+Y],mm2
+ pcmpeqw mm2,mm2
+ paddw mm2,mm2
+ /*Round and store the results (no transpose).*/
+ movq mm7,[Y+0x50]
+ psubw mm4,mm2
+ psubw mm6,mm2
psraw mm4,2
- psubw mm6,mm0
- movq [0x78+Y],mm3
+ psubw mm0,mm2
+ movq [BUF+0x08],mm4
+ movq mm4,[Y+0x70]
+ psraw mm6,2
+ psubw mm5,mm2
+ movq [BUF+0x28],mm6
+ psraw mm0,2
+ psubw mm3,mm2
+ movq [BUF+0x48],mm0
psraw mm5,2
- psubw mm7,mm0
- movq [0x40+Y],mm4
- psraw mm6,2
- psubw mm1,mm0
- movq [0x50+Y],mm5
+ psubw mm1,mm2
+ movq [BUF+0x58],mm5
+ psraw mm3,2
+ psubw mm7,mm2
+ movq [BUF+0x68],mm3
+ psraw mm1,2
+ psubw mm4,mm2
+ movq [BUF+0x78],mm1
psraw mm7,2
- movq [0x60+Y],mm6
- psraw mm1,2
- movq [0x70+Y],mm7
- movq [0x48+Y],mm1
+ movq [BUF+0x18],mm7
+ psraw mm4,2
+ movq [BUF+0x38],mm4
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+ __asm movq _reg,[BUF+16*(_row)] \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+ __asm movq _reg,[BUF+16*(_row)+8] \
+
+ OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+#undef X
#undef Y
#undef A
-#undef X
+#undef BUF
}
}
Modified: trunk/theora/lib/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxidct.c 2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86_vc/mmxidct.c 2010-12-07 10:28:07 UTC (rev 17728)
@@ -339,22 +339,19 @@
#undef Y
#undef X
}
- if(_x!=_y){
- int i;
- __asm pxor mm0,mm0;
- for(i=0;i<4;i++){
- ogg_int16_t *x;
- x=_x+16*i;
+ __asm pxor mm0,mm0;
+ for(i=0;i<4;i++){
+ ogg_int16_t *x;
+ x=_x+16*i;
#define X ecx
- __asm{
- mov X,x
- movq [X+0x00],mm0
- movq [X+0x08],mm0
- movq [X+0x10],mm0
- movq [X+0x18],mm0
- }
-#undef X
+ __asm{
+ mov X,x
+ movq [X+0x00],mm0
+ movq [X+0x08],mm0
+ movq [X+0x10],mm0
+ movq [X+0x18],mm0
}
+#undef X
}
}
@@ -547,18 +544,16 @@
#undef Y
#undef X
}
- if(_x!=_y){
#define X ecx
- __asm{
- pxor mm0,mm0;
- mov X,_x
- movq [X+0x00],mm0
- movq [X+0x10],mm0
- movq [X+0x20],mm0
- movq [X+0x30],mm0
- }
-#undef X
+ __asm{
+ pxor mm0,mm0;
+ mov X,_x
+ movq [X+0x00],mm0
+ movq [X+0x10],mm0
+ movq [X+0x20],mm0
+ movq [X+0x30],mm0
}
+#undef X
}
/*Performs an inverse 8x8 Type-II DCT transform.
Added: trunk/theora/lib/x86_vc/x86zigzag.h
===================================================================
--- trunk/theora/lib/x86_vc/x86zigzag.h (rev 0)
+++ trunk/theora/lib/x86_vc/x86zigzag.h 2010-12-07 10:28:07 UTC (rev 17728)
@@ -0,0 +1,244 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86zigzag_H)
+# define _x86_vc_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+ stores them as sixteen quadwords at [Y+0x00] through [Y+0x78].
+ Y is supplied by the including code (presumably #defined to a register name).
+ Every MMX register, mm0 through mm7, is overwritten.
+ Rows are read via OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg),
+ also supplied by the including code, which load the first four and second
+ four entries of a row into the specified register, respectively.
+ OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+ (because when the rows are already in SSE2 registers, loading the high half
+ destructively modifies the register).
+ Each output position holds the element with the following index in the
+ original 64-element array (letters give the order the 4-tuples are computed):
+ A 0 8 1 2 9 16 24 17 B
+ C 10 3 4 11 18 25 32 40 E
+ F 33 26 19 12 5 6 13 20 D
+ G 27 34 41 48 56 49 42 35 I
+ L 28 21 14 7 15 22 29 36 M
+ H 43 50 57 58 51 44 37 30 O
+ N 23 31 38 45 52 59 60 53 J
+ P 46 39 47 54 61 62 55 63 K
+ The order of the coefficients within each tuple is reversed in the comments
+ below to reflect the usual MSB to LSB notation.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+ OC_ZZ_LOAD_ROW_LO(0,mm0) /*mm0=03 02 01 00*/ \
+ OC_ZZ_LOAD_ROW_LO(1,mm1) /*mm1=11 10 09 08*/ \
+ OC_ZZ_LOAD_ROW_LO(2,mm2) /*mm2=19 18 17 16*/ \
+ OC_ZZ_LOAD_ROW_LO(3,mm3) /*mm3=27 26 25 24*/ \
+ OC_ZZ_LOAD_ROW_HI(0,mm4) /*mm4=07 06 05 04*/ \
+ OC_ZZ_LOAD_ROW_HI(1,mm5) /*mm5=15 14 13 12*/ \
+ OC_ZZ_LOAD_ROW_HI(2,mm6) /*mm6=23 22 21 20*/ \
+ __asm movq mm7,mm0 /*mm7=03 02 01 00*/ \
+ __asm punpckhdq mm0,mm1 /*mm0=11 10 03 02*/ \
+ __asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
+ __asm punpcklwd mm1,mm0 /*mm1=03 09 02 08*/ \
+ __asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
+ __asm punpcklwd mm7,mm1 /*mm7=02 01 08 00 *A*/ \
+ __asm movq [Y+0x00],mm7 \
+ __asm punpckhwd mm1,mm4 /*mm1=04 03 07 09*/ \
+ __asm movq mm7,mm2 /*mm7=19 18 17 16*/ \
+ __asm punpckhdq mm0,mm1 /*mm0=04 03 11 10*/ \
+ __asm punpckhwd mm7,mm5 /*mm7=12 19 15 18*/ \
+ __asm punpcklwd mm1,mm3 /*mm1=25 07 24 09*/ \
+ __asm punpcklwd mm5,mm6 /*mm5=21 14 20 13*/ \
+ __asm punpcklwd mm1,mm2 /*mm1=17 24 16 09 *B*/ \
+ OC_ZZ_LOAD_ROW_LO(4,mm2) /*mm2=35 34 33 32*/ \
+ __asm movq [Y+0x08],mm1 \
+ OC_ZZ_LOAD_ROW_LO(5,mm1) /*mm1=43 42 41 40*/ \
+ __asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
+ __asm movq [Y+0x10],mm0 \
+ __asm punpckhdq mm6,mm4 /*mm6=?? 07 23 22*/ \
+ __asm punpckldq mm4,mm5 /*mm4=20 13 06 05 *D*/ \
+ __asm movq [Y+0x28],mm4 \
+ __asm psrlq mm3,16 /*mm3=.. 27 26 25*/ \
+ __asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
+ __asm movq mm4,mm7 /*mm4=12 19 15 18*/ \
+ __asm punpcklwd mm2,mm3 /*mm2=26 33 25 32*/ \
+ __asm punpcklwd mm4,mm1 /*mm4=41 15 40 18*/ \
+ __asm punpckhwd mm3,mm1 /*mm3=43 .. 42 27*/ \
+ __asm punpckldq mm4,mm2 /*mm4=25 32 40 18*/ \
+ __asm punpcklwd mm3,mm0 /*mm3=35 42 34 27*/ \
+ OC_ZZ_LOAD_ROW_LO(6,mm0) /*mm0=51 50 49 48*/ \
+ __asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
+ __asm movq [Y+0x18],mm4 \
+ OC_ZZ_LOAD_ROW_LO(7,mm4) /*mm4=59 58 57 56*/ \
+ __asm punpckhdq mm2,mm7 /*mm2=12 19 26 33 *F*/ \
+ __asm movq [Y+0x20],mm2 \
+ __asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
+ __asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
+ __asm movq mm2,mm3 /*mm2=35 42 34 27*/ \
+ __asm punpckhwd mm1,mm0 /*mm1=50 43 48 41*/ \
+ __asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
+ __asm punpckldq mm3,mm1 /*mm3=48 41 34 27 *G*/ \
+ __asm movq [Y+0x30],mm3 \
+ __asm punpckhdq mm1,mm4 /*mm1=58 57 50 43 *H*/ \
+ __asm movq [Y+0x50],mm1 \
+ OC_ZZ_LOAD_ROW_HI(7,mm1) /*mm1=63 62 61 60*/ \
+ __asm punpcklwd mm4,mm0 /*mm4=49 56 51 59*/ \
+ OC_ZZ_LOAD_ROW_HI(6,mm0) /*mm0=55 54 53 52*/ \
+ __asm psllq mm6,16 /*mm6=07 23 22 ..*/ \
+ __asm movq mm3,mm4 /*mm3=49 56 51 59*/ \
+ __asm punpckhdq mm4,mm2 /*mm4=35 42 49 56 *I*/ \
+ OC_ZZ_LOAD_ROW_HI(3,mm2) /*mm2=31 30 29 28*/ \
+ __asm movq [Y+0x38],mm4 \
+ __asm punpcklwd mm3,mm1 /*mm3=61 51 60 59*/ \
+ __asm punpcklwd mm7,mm6 /*mm7=22 15 .. ??*/ \
+ __asm movq mm4,mm3 /*mm4=61 51 60 59*/ \
+ __asm punpcklwd mm3,mm0 /*mm3=53 60 52 59*/ \
+ __asm punpckhwd mm4,mm0 /*mm4=55 61 54 51*/ \
+ OC_ZZ_LOAD_ROW_HI(4,mm0) /*mm0=39 38 37 36*/ \
+ __asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
+ __asm movq [Y+0x68],mm3 \
+ __asm movq mm3,mm4 /*mm3=?? ?? 54 51*/ \
+ __asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
+ __asm punpckhwd mm4,mm1 /*mm4=63 55 62 61 *K*/ \
+ OC_ZZ_LOAD_ROW_HI(5,mm1) /*mm1=47 46 45 44*/ \
+ __asm movq [Y+0x78],mm4 \
+ __asm punpckhwd mm6,mm2 /*mm6=28 07 31 23*/ \
+ __asm punpcklwd mm2,mm0 /*mm2=37 30 36 29*/ \
+ __asm punpckhdq mm5,mm6 /*mm5=28 07 21 14*/ \
+ __asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
+ __asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
+ __asm movq [Y+0x40],mm5 \
+ __asm punpckhdq mm7,mm2 /*mm7=36 29 22 15 *M*/ \
+ __asm movq [Y+0x48],mm7 \
+ __asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
+ __asm punpckhwd mm0,mm1 /*mm0=46 39 45 38*/ \
+ __asm punpcklwd mm3,mm1 /*mm3=47 54 44 51*/ \
+ __asm punpckldq mm6,mm0 /*mm6=45 38 31 23 *N*/ \
+ __asm movq [Y+0x60],mm6 \
+ __asm punpckhdq mm0,mm3 /*mm0=47 54 46 39*/ \
+ __asm punpckldq mm3,mm2 /*mm3=30 37 44 51 *O*/ \
+ __asm movq [Y+0x58],mm3 \
+ __asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
+ __asm movq [Y+0x70],mm0 \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+ order and stores them in %[qdct]; mm0 through mm7 are all overwritten.
+ NOTE(review): this body is GCC-style asm string text, not the MSVC __asm form
+ used by OC_TRANSPOSE_ZIG_ZAG_MMXEXT in this file; confirm it is usable here.
+ Index in the original array of each output element (letters: compute order):
+ A 0 1 8 16 9 2 3 10 B
+ D 17 24 32 25 18 11 4 5 C
+ E 12 19 26 33 40 48 41 34 I
+ H 27 20 13 6 7 14 21 28 G
+ K 35 42 49 56 57 50 43 36 J
+ F 29 22 15 23 30 37 44 51 M
+ P 58 59 52 45 38 31 39 46 L
+ N 53 60 61 54 47 55 62 63 O
+ The order of the coefficients within each tuple is reversed in the comments
+ below to reflect the usual MSB to LSB notation.*/
+#define OC_ZIG_ZAG_MMXEXT \
+ "movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
+ "movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
+ "movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
+ "movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
+ "movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
+ "movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
+ "movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
+ "punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
+ "movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
+ "punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
+ "movq %%mm0,0x00(%[qdct])\n\t" \
+ "movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
+ "punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
+ "psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
+ "punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
+ "punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
+ "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+ "movq %%mm6,0x08(%[qdct])\n\t" \
+ "psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
+ "movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
+ "punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
+ "movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
+ "punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
+ "por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
+ "punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D*/ \
+ "movq %%mm2,0x10(%[qdct])\n\t" \
+ "movq %%mm3,0x18(%[qdct])\n\t" \
+ "movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
+ "movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
+ "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+ "punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
+ "punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
+ "punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
+ "punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
+ "punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
+ "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+ "psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
+ "punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
+ "punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
+ "punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
+ "movq %%mm0,0x20(%[qdct])\n\t" \
+ "movq %%mm3,0x50(%[qdct])\n\t" \
+ "movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
+ "movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
+ "movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
+ "punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
+ "psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
+ "movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
+ "punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
+ "punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
+ "movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
+ "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+ "movq %%mm2,0x30(%[qdct])\n\t" \
+ "movq %%mm6,0x38(%[qdct])\n\t" \
+ "movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
+ "punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
+ "movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
+ "punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
+ "psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
+ "punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
+ "punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
+ "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+ "movq %%mm0,0x28(%[qdct])\n\t" \
+ "punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
+ "punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
+ "punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
+ "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+ "movq %%mm4,0x40(%[qdct])\n\t" \
+ "movq %%mm6,0x48(%[qdct])\n\t" \
+ "movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
+ "movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
+ "psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
+ "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+ "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+ "punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
+ "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+ "punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
+ "punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
+ "movq %%mm2,0x68(%[qdct])\n\t" \
+ "movq %%mm1,0x58(%[qdct])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
+ "punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
+ "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+ "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
+ "punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
+ "punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
+ "movq %%mm6,0x70(%[qdct])\n\t" \
+ "movq %%mm5,0x78(%[qdct])\n\t" \
+ "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif
More information about the commits
mailing list