[xiph-commits] r14783 - in branches/theora-thusnelda/lib/enc: . x86_64
xiphmont at svn.xiph.org
xiphmont at svn.xiph.org
Tue Apr 22 09:23:11 PDT 2008
Author: xiphmont
Date: 2008-04-22 09:23:11 -0700 (Tue, 22 Apr 2008)
New Revision: 14783
Modified:
branches/theora-thusnelda/lib/enc/codec_internal.h
branches/theora-thusnelda/lib/enc/encoder_quant.c
branches/theora-thusnelda/lib/enc/mode.c
branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c
Log:
Minor rho collection optimization, add MMX idct for x86_64
Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h 2008-04-21 02:24:43 UTC (rev 14782)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h 2008-04-22 16:23:11 UTC (rev 14783)
@@ -153,10 +153,10 @@
} superblock_t;
typedef ogg_int16_t quant_table[64];
-typedef quant_table quant_tables[64];
+typedef quant_table quant_tables[64]; /* [zigzag][qi] */
typedef ogg_int32_t iquant_table[64];
-typedef iquant_table iquant_tables[64];
+typedef iquant_table iquant_tables[64]; /* [qi][coeff] */
typedef struct {
const unsigned char *mode_bits[8];
@@ -255,7 +255,6 @@
/********************************************************************/
/* Fragment SAD->bitrate estimation tracking metrics */
- unsigned char rho_lookup[2][3][64][OC_QUANT_MAX>>2];
ogg_uint32_t rho_count[65];
#ifdef COLLECT_METRICS
@@ -300,11 +299,6 @@
extern void ReconRefFrames (CP_INSTANCE *cpi);
-extern void quantize( CP_INSTANCE *cpi,
- ogg_int32_t *iquant_table,
- ogg_int16_t *DCT_block,
- ogg_int16_t *quantized_list);
-
extern void fdct_short ( ogg_int16_t *InputData, ogg_int16_t *OutputData );
extern void DPCMTokenize (CP_INSTANCE *cpi);
Modified: branches/theora-thusnelda/lib/enc/encoder_quant.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_quant.c 2008-04-21 02:24:43 UTC (rev 14782)
+++ branches/theora-thusnelda/lib/enc/encoder_quant.c 2008-04-22 16:23:11 UTC (rev 14783)
@@ -163,14 +163,14 @@
/*Scale DC the coefficient from the proper table.*/
q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- cpi->quant_tables[qti][pli][qi][0]=(ogg_uint16_t)q;
+ cpi->quant_tables[qti][pli][0][qi]=(ogg_uint16_t)q;
cpi->iquant_tables[qti][pli][qi][0]=(ogg_int32_t)(((1<<31))/q+1);
/*Now scale AC coefficients from the proper table.*/
for(ci=1;ci<64;ci++){
q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- cpi->quant_tables[qti][pli][qi][zigzag_index[ci]]=(ogg_uint16_t)q;
+ cpi->quant_tables[qti][pli][zigzag_index[ci]][qi]=(ogg_uint16_t)q;
cpi->iquant_tables[qti][pli][qi][ci]=(ogg_int32_t)(((1<<31))/q+1);
}
@@ -189,28 +189,3 @@
}
}
}
-
-void quantize( CP_INSTANCE *cpi,
- ogg_int32_t *q,
- ogg_int16_t *in,
- ogg_int16_t *out){
- int i;
-
- /* Set the quantized_list to default to 0 */
- memset(out, 0, 64 * sizeof(*out) );
-
- /* Note that we add half divisor to effect rounding on positive number */
- for( i = 0; i < 64; i++) {
- // the extra precision version to perfectly match dequant and thus rho metrics. It's about a 2% speed penalty.
- int val = (((q[i]>>15)*in[i]) + (1<<15) + (((q[i]&0x7fff)*in[i])>>15)) >>16;
- if(val==0){
- out[zigzag_index[i]] = 0;
- }else if(val>511){
- out[zigzag_index[i]] = 511;
- }else if (val<-511){
- out[zigzag_index[i]] = -511;
- }else{
- out[zigzag_index[i]] = val;
- }
- }
-}
Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c 2008-04-21 02:24:43 UTC (rev 14782)
+++ branches/theora-thusnelda/lib/enc/mode.c 2008-04-22 16:23:11 UTC (rev 14783)
@@ -249,7 +249,7 @@
return y[0] + ((ydel*xdel)>>OC_SAD_SHIFT);
}
-static signed char mvmap[2][63] = {
+static const int mvmap[2][63] = {
{ -15,-15,-14, -14,-13,-13,-12, -12,-11,-11,-10, -10, -9, -9, -8,
-8, -7, -7, -6, -6, -5, -5, -4, -4, -3, -3, -2, -2, -1, -1, 0,
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
@@ -260,7 +260,7 @@
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7 }
};
-static signed char mvmap2[2][63] = {
+static const int mvmap2[2][63] = {
{ -1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1,
0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
@@ -488,187 +488,19 @@
}
#include "quant_lookup.h"
-static int find_nonzero_transition(quant_tables *q, int pos, ogg_int16_t val){
+static int find_nonzero_transition(ogg_int16_t *q, ogg_int16_t in){
int i;
-
- val = (abs(val)<<1);
-
- if( val < (*q)[32][pos]){
- if( val < (*q)[48][pos]){
- if( val < (*q)[56][pos]){
- if( val < (*q)[60][pos]){
- if( val < (*q)[62][pos]){
- if( val < (*q)[63][pos])return 64;
- return 63;
- }else{
- if( val < (*q)[61][pos])return 62;
- return 61;
- }
- }else{
- if( val < (*q)[58][pos]){
- if( val < (*q)[59][pos])return 60;
- return 59;
- }else{
- if( val < (*q)[57][pos])return 58;
- return 57;
- }
- }
- }else{
- if( val < (*q)[52][pos]){
- if( val < (*q)[54][pos]){
- if( val < (*q)[55][pos])return 56;
- return 55;
- }else{
- if( val < (*q)[53][pos])return 54;
- return 53;
- }
- }else{
- if( val < (*q)[50][pos]){
- if( val < (*q)[51][pos])return 52;
- return 51;
- }else{
- if( val < (*q)[49][pos])return 50;
- return 49;
- }
- }
- }
- }else{
- if( val < (*q)[40][pos]){
- if( val < (*q)[44][pos]){
- if( val < (*q)[46][pos]){
- if( val < (*q)[47][pos])return 48;
- return 47;
- }else{
- if( val < (*q)[45][pos])return 46;
- return 45;
- }
- }else{
- if( val < (*q)[42][pos]){
- if( val < (*q)[43][pos])return 44;
- return 43;
- }else{
- if( val < (*q)[41][pos])return 42;
- return 41;
- }
- }
- }else{
- if( val < (*q)[36][pos]){
- if( val < (*q)[38][pos]){
- if( val < (*q)[39][pos])return 40;
- return 39;
- }else{
- if( val < (*q)[37][pos])return 38;
- return 37;
- }
- }else{
- if( val < (*q)[34][pos]){
- if( val < (*q)[35][pos])return 36;
- return 35;
- }else{
- if( val < (*q)[33][pos])return 34;
- return 33;
- }
- }
- }
- }
- }else{
- if( val < (*q)[16][pos]){
- if( val < (*q)[24][pos]){
- if( val < (*q)[28][pos]){
- if( val < (*q)[30][pos]){
- if( val < (*q)[31][pos])return 32;
- return 31;
- }else{
- if( val < (*q)[29][pos])return 30;
- return 29;
- }
- }else{
- if( val < (*q)[26][pos]){
- if( val < (*q)[27][pos])return 28;
- return 27;
- }else{
- if( val < (*q)[25][pos])return 26;
- return 25;
- }
- }
- }else{
- if( val < (*q)[20][pos]){
- if( val < (*q)[22][pos]){
- if( val < (*q)[23][pos])return 24;
- return 23;
- }else{
- if( val < (*q)[21][pos])return 22;
- return 21;
- }
- }else{
- if( val < (*q)[18][pos]){
- if( val < (*q)[19][pos])return 20;
- return 19;
- }else{
- if( val < (*q)[17][pos])return 18;
- return 17;
- }
- }
- }
- }else{
- if( val < (*q)[8][pos]){
- if( val < (*q)[12][pos]){
- if( val < (*q)[14][pos]){
- if( val < (*q)[15][pos])return 16;
- return 15;
- }else{
- if( val < (*q)[13][pos])return 14;
- return 13;
- }
- }else{
- if( val < (*q)[10][pos]){
- if( val < (*q)[11][pos])return 12;
- return 11;
- }else{
- if( val < (*q)[9][pos])return 10;
- return 9;
- }
- }
- }else{
- if( val < (*q)[4][pos]){
- if( val < (*q)[6][pos]){
- if( val < (*q)[7][pos])return 8;
- return 7;
- }else{
- if( val < (*q)[5][pos])return 6;
- return 5;
- }
- }else{
- if( val < (*q)[2][pos]){
- if( val < (*q)[3][pos])return 4;
- return 3;
- }else{
- if( val < (*q)[1][pos])return 2;
- if( val < (*q)[0][pos])return 1;
- }
- }
- }
- }
- }
-
- return 0;
+ int val = (abs((int)in)<<1);
+ for(i=63;i>=0;i--)
+ if( val < q[i])break;
+ return i+1;
}
-/* rho computation and quant/dequant should be in bed together. They're not... yet */
-static void collect_rho(CP_INSTANCE *cpi, int mode, int plane, ogg_int16_t *buffer){
- int pos[64];
- int i;
- int interp = (mode != CODE_INTRA);
- quant_tables *q = &cpi->quant_tables[interp][plane];
-
- for(i=0;i<64;i++){
- int ii = zigzag_index[i];
- pos[ii] = find_nonzero_transition(q,ii,buffer[i]);
- }
-}
-
-static void TQB (CP_INSTANCE *cpi, int mode, int fi, ogg_int32_t *iq, ogg_int16_t *q, mv_t mv, int plane){
+static void TQB (CP_INSTANCE *cpi, int mode, int fi, mv_t mv, int plane, ogg_int16_t re_q[2][3][64], int *rho_count){
if ( cpi->frag_coded[fi] ) {
+ int qi = cpi->BaseQ; /* temporary */;
+ int inter = (mode != CODE_INTRA);
+ ogg_int32_t *iq = cpi->iquant_tables[inter][plane][qi];
ogg_int16_t buffer[64];
ogg_int16_t *data = cpi->frag_dct[fi].data;
int bi = cpi->frag_buffer_index[fi];
@@ -723,27 +555,39 @@
/* transform */
dsp_fdct_short(cpi->dsp, data, buffer);
- /* collect rho metrics */
- collect_rho(cpi, mode, plane, buffer);
+ /* collect rho metrics, quantize */
+ {
+ int i;
+ for(i=0;i<64;i++){
+ int ii = zigzag_index[i];
+ int pos = find_nonzero_transition(cpi->quant_tables[inter][plane][ii],buffer[i]);
+ rho_count[pos]++;
+
+ if(qi<pos){
+ data[ii] = 0;
+ }else{
+ int val = (((iq[i]>>15)*buffer[i]) + (1<<15) + (((iq[i]&0x7fff)*buffer[i])>>15)) >>16;
+ data[ii] = (val>511?511:(val<-511?-511:val));
+ }
+ }
+ }
- /* quantize */
- quantize (cpi, iq, buffer, data);
- cpi->frag_dc[fi] = cpi->frag_dct[fi].data[0];
+ cpi->frag_dc[fi] = data[0];
/* reconstruct */
while(!data[nonzero] && --nonzero);
switch(nonzero){
case 0:
- IDct1( data, q, buffer );
+ IDct1( data, re_q[inter][plane], buffer );
break;
case 1: case 2:
- dsp_IDct3(cpi->dsp, data, q, buffer );
+ dsp_IDct3(cpi->dsp, data, re_q[inter][plane], buffer );
break;
case 3:case 4:case 5:case 6:case 7:case 8: case 9:
- dsp_IDct10(cpi->dsp, data, q, buffer );
+ dsp_IDct10(cpi->dsp, data, re_q[inter][plane], buffer );
break;
default:
- dsp_IDctSlow(cpi->dsp, data, q, buffer );
+ dsp_IDctSlow(cpi->dsp, data, re_q[inter][plane], buffer );
}
dsp_recon8x8 (cpi->dsp, thisrecon, buffer, stride);
@@ -751,16 +595,13 @@
}
}
-static void TQMB ( CP_INSTANCE *cpi, macroblock_t *mb, int qi){
+static void TQMB ( CP_INSTANCE *cpi, macroblock_t *mb, int qi, ogg_int16_t req[2][3][64], int *rc){
int pf = cpi->info.pixelformat;
int mode = mb->mode;
- int inter = (mode != CODE_INTRA);
- ogg_int32_t *iq = cpi->iquant_tables[inter][0][qi];
- ogg_int16_t *q = cpi->quant_tables[inter][0][qi];
int i;
for(i=0;i<4;i++)
- TQB(cpi,mode,mb->Ryuv[0][i],iq,q,mb->mv[i],0);
+ TQB(cpi,mode,mb->Ryuv[0][i],mb->mv[i],0,req,rc);
switch(pf){
case OC_PF_420:
@@ -773,19 +614,11 @@
mv.x = ( mv.x >= 0 ? (mv.x + 2) / 4 : (mv.x - 2) / 4);
mv.y = ( mv.y >= 0 ? (mv.y + 2) / 4 : (mv.y - 2) / 4);
- iq = cpi->iquant_tables[inter][1][qi];
- q = cpi->quant_tables[inter][1][qi];
- TQB(cpi,mode,mb->Ryuv[1][0],iq,q,mv,1);
- iq = cpi->iquant_tables[inter][2][qi];
- q = cpi->quant_tables[inter][2][qi];
- TQB(cpi,mode,mb->Ryuv[2][0],iq,q,mv,2);
+ TQB(cpi,mode,mb->Ryuv[1][0],mv,1,req,rc);
+ TQB(cpi,mode,mb->Ryuv[2][0],mv,2,req,rc);
}else{
- iq = cpi->iquant_tables[inter][1][qi];
- q = cpi->quant_tables[inter][1][qi];
- TQB(cpi,mode,mb->Ryuv[1][0],iq,q,mb->mv[0],1);
- iq = cpi->iquant_tables[inter][2][qi];
- q = cpi->quant_tables[inter][2][qi];
- TQB(cpi,mode,mb->Ryuv[2][0],iq,q,mb->mv[0],2);
+ TQB(cpi,mode,mb->Ryuv[1][0],mb->mv[0],1,req,rc);
+ TQB(cpi,mode,mb->Ryuv[2][0],mb->mv[0],2,req,rc);
}
break;
@@ -803,37 +636,24 @@
mvB.x = ( mvB.x >= 0 ? (mvB.x + 1) / 2 : (mvB.x - 1) / 2);
mvB.y = ( mvB.y >= 0 ? (mvB.y + 1) / 2 : (mvB.y - 1) / 2);
- iq = cpi->iquant_tables[inter][1][qi];
- q = cpi->quant_tables[inter][1][qi];
- TQB(cpi,mode,mb->Ryuv[1][0],iq,q,mvA,1);
- TQB(cpi,mode,mb->Ryuv[1][1],iq,q,mvB,1);
+ TQB(cpi,mode,mb->Ryuv[1][0],mvA,1,req,rc);
+ TQB(cpi,mode,mb->Ryuv[1][1],mvB,1,req,rc);
+ TQB(cpi,mode,mb->Ryuv[2][0],mvA,2,req,rc);
+ TQB(cpi,mode,mb->Ryuv[2][1],mvB,2,req,rc);
- iq = cpi->iquant_tables[inter][2][qi];
- q = cpi->quant_tables[inter][2][qi];
- TQB(cpi,mode,mb->Ryuv[2][0],iq,q,mvA,2);
- TQB(cpi,mode,mb->Ryuv[2][1],iq,q,mvB,2);
-
}else{
- iq = cpi->iquant_tables[inter][1][qi];
- q = cpi->quant_tables[inter][1][qi];
- TQB(cpi,mode,mb->Ryuv[1][0],iq,q,mb->mv[0],1);
- TQB(cpi,mode,mb->Ryuv[1][1],iq,q,mb->mv[0],1);
- iq = cpi->iquant_tables[inter][2][qi];
- q = cpi->quant_tables[inter][2][qi];
- TQB(cpi,mode,mb->Ryuv[2][0],iq,q,mb->mv[0],2);
- TQB(cpi,mode,mb->Ryuv[2][1],iq,q,mb->mv[0],2);
+ TQB(cpi,mode,mb->Ryuv[1][0],mb->mv[0],1,req,rc);
+ TQB(cpi,mode,mb->Ryuv[1][1],mb->mv[0],1,req,rc);
+ TQB(cpi,mode,mb->Ryuv[2][0],mb->mv[0],2,req,rc);
+ TQB(cpi,mode,mb->Ryuv[2][1],mb->mv[0],2,req,rc);
}
break;
-
+
case OC_PF_444:
- iq = cpi->iquant_tables[inter][1][qi];
- q = cpi->quant_tables[inter][1][qi];
for(i=0;i<4;i++)
- TQB(cpi,mode,mb->Ryuv[1][i],iq,q,mb->mv[i],1);
- iq = cpi->iquant_tables[inter][2][qi];
- q = cpi->quant_tables[inter][2][qi];
+ TQB(cpi,mode,mb->Ryuv[1][i],mb->mv[i],1,req,rc);
for(i=0;i<4;i++)
- TQB(cpi,mode,mb->Ryuv[2][i],iq,q,mb->mv[i],2);
+ TQB(cpi,mode,mb->Ryuv[2][i],mb->mv[i],2,req,rc);
break;
}
}
@@ -842,18 +662,26 @@
unsigned char qi = cpi->BaseQ; // temporary
superblock_t *sb = cpi->super[0];
superblock_t *sb_end = sb + cpi->super_n[0];
- int i,j;
+ int i,j,k;
ogg_uint32_t interbits = 0;
ogg_uint32_t intrabits = 0;
mc_state mcenc;
mv_t last_mv = {0,0};
mv_t prior_mv = {0,0};
unsigned char *cp = cpi->frag_coded;
+ ogg_int16_t req[2][3][64];
+ int rho_count[65];
#ifdef COLLECT_METRICS
int sad[8][3][4];
#endif
oc_mode_scheme_chooser_init(cpi);
+ memset(rho_count,0,sizeof(rho_count));
+ for(i=0;i<2;i++)
+ for(j=0;j<3;j++)
+ for(k=0;k<64;k++)
+ req[i][j][k]=cpi->quant_tables[i][j][k][qi];
+
cpi->MVBits_0 = 0;
cpi->MVBits_1 = 0;
@@ -1004,7 +832,7 @@
#endif
/* Transform, quantize, collect rho metrics */
- TQMB(cpi, mb, qi);
+ TQMB(cpi, mb, qi, req, rho_count);
}
}
Modified: branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c 2008-04-21 02:24:43 UTC (rev 14782)
+++ branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c 2008-04-22 16:23:11 UTC (rev 14783)
@@ -5,7 +5,7 @@
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
********************************************************************
@@ -19,9 +19,1110 @@
#if defined(USE_ASM)
-/* nothing implemented right now */
+#define MaskOffset 0 // 4 masks come in order low word to high
+#define CosineOffset 32 // 7 cosines come in order pi/16 * (1 ... 7)
+#define EightOffset 88
+#define IdctAdjustBeforeShift 8
+
+ogg_uint16_t idctconstants[(4+7+1) * 4] = {
+ 65535, 0, 0, 0, 0, 65535, 0, 0,
+ 0, 0, 65535, 0, 0, 0, 0, 65535,
+ 64277, 64277, 64277, 64277, 60547, 60547, 60547, 60547,
+ 54491, 54491, 54491, 54491, 46341, 46341, 46341, 46341,
+ 36410, 36410, 36410, 36410, 25080, 25080, 25080, 25080,
+ 12785, 12785, 12785, 12785, 8, 8, 8, 8,
+};
+
+/**************************************************************************************
+ *
+ * Routine: BeginIDCT
+ *
+ * Description: The Macro does IDct on 4 1-D Dcts
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+
+#define MtoSTR(s) #s
+
+#define BeginIDCT "#BeginIDCT\n" \
+ \
+ " movq "I(3)",%%mm2\n" \
+ \
+ " movq "C(3)",%%mm6\n" \
+ " movq %%mm2,%%mm4\n" \
+ " movq "J(5)",%%mm7\n" \
+ " pmulhw %%mm6,%%mm4\n" \
+ " movq "C(5)",%%mm1\n" \
+ " pmulhw %%mm7,%%mm6\n" \
+ " movq %%mm1,%%mm5\n" \
+ " pmulhw %%mm2,%%mm1\n" \
+ " movq "I(1)",%%mm3\n" \
+ " pmulhw %%mm7,%%mm5\n" \
+ " movq "C(1)",%%mm0\n" \
+ " paddw %%mm2,%%mm4\n" \
+ " paddw %%mm7,%%mm6\n" \
+ " paddw %%mm1,%%mm2\n" \
+ " movq "J(7)",%%mm1\n" \
+ " paddw %%mm5,%%mm7\n" \
+ " movq %%mm0,%%mm5\n" \
+ " pmulhw %%mm3,%%mm0\n" \
+ " paddsw %%mm7,%%mm4\n" \
+ " pmulhw %%mm1,%%mm5\n" \
+ " movq "C(7)",%%mm7\n" \
+ " psubsw %%mm2,%%mm6\n" \
+ " paddw %%mm3,%%mm0\n" \
+ " pmulhw %%mm7,%%mm3\n" \
+ " movq "I(2)",%%mm2\n" \
+ " pmulhw %%mm1,%%mm7\n" \
+ " paddw %%mm1,%%mm5\n" \
+ " movq %%mm2,%%mm1\n" \
+ " pmulhw "C(2)",%%mm2\n" \
+ " psubsw %%mm5,%%mm3\n" \
+ " movq "J(6)",%%mm5\n" \
+ " paddsw %%mm7,%%mm0\n" \
+ " movq %%mm5,%%mm7\n" \
+ " psubsw %%mm4,%%mm0\n" \
+ " pmulhw "C(2)",%%mm5\n" \
+ " paddw %%mm1,%%mm2\n" \
+ " pmulhw "C(6)",%%mm1\n" \
+ " paddsw %%mm4,%%mm4\n" \
+ " paddsw %%mm0,%%mm4\n" \
+ " psubsw %%mm6,%%mm3\n" \
+ " paddw %%mm7,%%mm5\n" \
+ " paddsw %%mm6,%%mm6\n" \
+ " pmulhw "C(6)",%%mm7\n" \
+ " paddsw %%mm3,%%mm6\n" \
+ " movq %%mm4,"I(1)"\n" \
+ " psubsw %%mm5,%%mm1\n" \
+ " movq "C(4)",%%mm4\n" \
+ " movq %%mm3,%%mm5\n" \
+ " pmulhw %%mm4,%%mm3\n" \
+ " paddsw %%mm2,%%mm7\n" \
+ " movq %%mm6,"I(2)"\n" \
+ " movq %%mm0,%%mm2\n" \
+ " movq "I(0)",%%mm6\n" \
+ " pmulhw %%mm4,%%mm0\n" \
+ " paddw %%mm3,%%mm5\n" \
+ "\n" \
+ " movq "J(4)",%%mm3\n" \
+ " psubsw %%mm1,%%mm5\n" \
+ " paddw %%mm0,%%mm2\n" \
+ " psubsw %%mm3,%%mm6\n" \
+ " movq %%mm6,%%mm0\n" \
+ " pmulhw %%mm4,%%mm6\n" \
+ " paddsw %%mm3,%%mm3\n" \
+ " paddsw %%mm1,%%mm1\n" \
+ " paddsw %%mm0,%%mm3\n" \
+ " paddsw %%mm5,%%mm1\n" \
+ " pmulhw %%mm3,%%mm4\n" \
+ " paddsw %%mm0,%%mm6\n" \
+ " psubsw %%mm2,%%mm6\n" \
+ " paddsw %%mm2,%%mm2\n" \
+ " movq "I(1)",%%mm0\n" \
+ " paddsw %%mm6,%%mm2\n" \
+ " paddw %%mm3,%%mm4\n" \
+ " psubsw %%mm1,%%mm2\n" \
+ "#end BeginIDCT\n"
+// end BeginIDCT macro (38 cycles).
+
+
+// Two versions of the end of the idct depending on whether we're feeding
+// into a transpose or dividing the final results by 16 and storing them.
+
+/**************************************************************************************
+ *
+ * Routine: RowIDCT
+ *
+ * Description: The Macro does 1-D IDct on 4 Rows
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+
+// RowIDCT gets ready to transpose.
+
+#define RowIDCT "#RowIDCT\n" \
+ BeginIDCT \
+ "\n" \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw %%mm3,%%mm3\n" \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " paddsw %%mm5,%%mm5\n" \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw %%mm0,%%mm0\n" \
+ " movq %%mm1,"I(1)"\n" /* save R1 */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ "#end RowIDCT"
+
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+
+/**************************************************************************************
+ *
+ * Routine: ColumnIDCT
+ *
+ * Description: The Macro does 1-D IDct on 4 columns
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+// Column IDCT normalizes and stores final results.
+
+#define ColumnIDCT "#ColumnIDCT\n" \
+ BeginIDCT \
+ "\n" \
+ " paddsw "Eight",%%mm2\n" \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " psraw ""$4"",%%mm2\n" /* r2 = NR2 */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " psraw ""$4"",%%mm1\n" /* r1 = NR1 */ \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " movq %%mm2,"I(2)"\n" /* store NR2 at I2 */ \
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " movq %%mm1,"I(1)"\n" /* store NR1 at I1 */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "Eight",%%mm4\n" \
+ " paddsw %%mm3,%%mm3\n" /* r3 = D. + D. */ \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " psraw ""$4"",%%mm4\n" /* r4 = NR4 */ \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " psraw ""$4"",%%mm3\n" /* r3 = NR3 */ \
+ " paddsw "Eight",%%mm6\n" \
+ " paddsw %%mm5,%%mm5\n" /* r5 = B.. + B.. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psraw ""$4"",%%mm6\n" /* r6 = NR6 */ \
+ " movq %%mm4,"J(4)"\n" /* store NR4 at J4 */ \
+ " psraw ""$4"",%%mm5\n" /* r5 = NR5 */ \
+ " movq %%mm3,"I(3)"\n" /* store NR3 at I3 */ \
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "Eight",%%mm7\n" \
+ " paddsw %%mm0,%%mm0\n" /* r0 = C. + C. */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ " psraw ""$4"",%%mm7\n" /* r7 = NR7 */ \
+ " movq %%mm6,"J(6)"\n" /* store NR6 at J6 */ \
+ " psraw ""$4"",%%mm0\n" /* r0 = NR0 */ \
+ " movq %%mm5,"J(5)"\n" /* store NR5 at J5 */ \
+ " movq %%mm7,"J(7)"\n" /* store NR7 at J7 */ \
+ " movq %%mm0,"I(0)"\n" /* store NR0 at I0 */ \
+ "#end ColumnIDCT\n"
+
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+
+/**************************************************************************************
+ *
+ * Routine: Transpose
+ *
+ * Description: The Macro does two 4x4 transposes in place.
+ *
+ * Input: None
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: None
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+
+/* Following macro does two 4x4 transposes in place.
+
+ At entry (we assume):
+
+ r0 = a3 a2 a1 a0
+ I(1) = b3 b2 b1 b0
+ r2 = c3 c2 c1 c0
+ r3 = d3 d2 d1 d0
+
+ r4 = e3 e2 e1 e0
+ r5 = f3 f2 f1 f0
+ r6 = g3 g2 g1 g0
+ r7 = h3 h2 h1 h0
+
+ At exit, we have:
+
+ I(0) = d0 c0 b0 a0
+ I(1) = d1 c1 b1 a1
+ I(2) = d2 c2 b2 a2
+ I(3) = d3 c3 b3 a3
+
+ J(4) = h0 g0 f0 e0
+ J(5) = h1 g1 f1 e1
+ J(6) = h2 g2 f2 e2
+ J(7) = h3 g3 f3 e3
+
+ I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+ J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
+
+ Since r1 is free at entry, we calculate the Js first. */
+
+
+#define Transpose "#Transpose\n" \
+ " movq %%mm4,%%mm1\n" \
+ " punpcklwd %%mm5,%%mm4\n" \
+ " movq %%mm0,"I(0)"\n" \
+ " punpckhwd %%mm5,%%mm1\n" \
+ " movq %%mm6,%%mm0\n" \
+ " punpcklwd %%mm7,%%mm6\n" \
+ " movq %%mm4,%%mm5\n" \
+ " punpckldq %%mm6,%%mm4\n" \
+ " punpckhdq %%mm6,%%mm5\n" \
+ " movq %%mm1,%%mm6\n" \
+ " movq %%mm4,"J(4)"\n" \
+ " punpckhwd %%mm7,%%mm0\n" \
+ " movq %%mm5,"J(5)"\n" \
+ " punpckhdq %%mm0,%%mm6\n" \
+ " movq "I(0)",%%mm4\n" \
+ " punpckldq %%mm0,%%mm1\n" \
+ " movq "I(1)",%%mm5\n" \
+ " movq %%mm4,%%mm0\n" \
+ " movq %%mm6,"J(7)"\n" \
+ " punpcklwd %%mm5,%%mm0\n" \
+ " movq %%mm1,"J(6)"\n" \
+ " punpckhwd %%mm5,%%mm4\n" \
+ " movq %%mm2,%%mm5\n" \
+ " punpcklwd %%mm3,%%mm2\n" \
+ " movq %%mm0,%%mm1\n" \
+ " punpckldq %%mm2,%%mm0\n" \
+ " punpckhdq %%mm2,%%mm1\n" \
+ " movq %%mm4,%%mm2\n" \
+ " movq %%mm0,"I(0)"\n" \
+ " punpckhwd %%mm3,%%mm5\n" \
+ " movq %%mm1,"I(1)"\n" \
+ " punpckhdq %%mm5,%%mm4\n" \
+ " punpckldq %%mm5,%%mm2\n" \
+ \
+ " movq %%mm4,"I(3)"\n" \
+ \
+ " movq %%mm2,"I(2)"\n" \
+ "#end Transpose\n"
+// end Transpose macro (19 cycles).
+
+/**************************************************************************************
+ *
+ * Routine: MMX_idct
+ *
+ * Description: Perform IDCT on a 8x8 block
+ *
+ * Input: Pointer to input and output buffer
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: The input coefficients are in ZigZag order
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+void IDctSlow__mmx(ogg_int16_t *in,
+ ogg_int16_t *q,
+ ogg_int16_t *out ) {
+
+# define MID(M,I) MtoSTR(M+(I)*8)"(%[c])"
+# define M(I) MID( MaskOffset , I )
+# define C(I) MID( CosineOffset , I-1 )
+# define Eight MID(EightOffset,0)
+
+ /* eax = quantized input */
+ /* esi = quantization table */
+ /* edx = destination (= idct buffer) */
+ /* ecx = idctconstants */
+
+
+ __asm__ __volatile__ (
+ "# dequantize, de-zigzag\n"
+ "movq (%[i]), %%mm0\n"
+ "pmullw (%[q]), %%mm0\n" /* r0 = 03 02 01 00 */
+ "movq 16(%[i]), %%mm1\n"
+ "pmullw 16(%[q]), %%mm1\n" /* r1 = 13 12 11 10 */
+ "movq "M(0)", %%mm2\n" /* r2 = __ __ __ FF */
+ "movq %%mm0, %%mm3\n" /* r3 = 03 02 01 00 */
+ "movq 8(%[i]), %%mm4\n"
+ "psrlq $16, %%mm0\n" /* r0 = __ 03 02 01 */
+ "pmullw 8(%[q]), %%mm4\n" /* r4 = 07 06 05 04 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 00 */
+ "movq %%mm0, %%mm5\n" /* r5 = __ 03 02 01 */
+ "movq %%mm1, %%mm6\n" /* r6 = 13 12 11 10 */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 01 */
+ "psllq $32, %%mm6\n" /* r6 = 11 10 __ __ */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pxor %%mm5, %%mm0\n" /* r0 = __ 03 02 __ */
+ "pand %%mm6, %%mm7\n" /* r7 = 11 __ __ __ */
+ "por %%mm3, %%mm0\n" /* r0 = __ 03 02 00 */
+ "pxor %%mm7, %%mm6\n" /* r6 = __ 10 __ __ */
+ "por %%mm7, %%mm0\n" /* r0 = 11 03 02 00 = R0 */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "movq %%mm4, %%mm3\n" /* r3 = 07 06 05 04 */
+ "movq %%mm0, (%[o])\n" /* write R0 = r0 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 04 */
+ "movq 32(%[i]), %%mm0\n"
+ "psllq $16, %%mm3\n" /* r3 = __ __ 04 __ */
+ "pmullw 32(%[q]), %%mm0\n" /* r0 = 23 22 21 20 */
+ "pand %%mm1, %%mm7\n" /* r7 = 13 __ __ __ */
+ "por %%mm3, %%mm5\n" /* r5 = __ __ 04 01 */
+ "por %%mm6, %%mm7\n" /* r7 = 13 10 __ __ */
+ "movq 24(%[i]), %%mm3\n"
+ "por %%mm5, %%mm7\n" /* r7 = 13 10 04 01 = R1 */
+ "pmullw 24(%[q]), %%mm3\n" /* r3 = 17 16 15 14 */
+ "psrlq $16, %%mm4\n" /* r4 = __ 07 06 05 */
+ "movq %%mm7, 16(%[o])\n" /* write R1 = r7 */
+ "movq %%mm4, %%mm5\n" /* r5 = __ 07 06 05 */
+ "movq %%mm0, %%mm7\n" /* r7 = 23 22 21 20 */
+ "psrlq $16, %%mm4\n" /* r4 = __ __ 07 06 */
+ "psrlq $48, %%mm7\n" /* r7 = __ __ __ 23 */
+ "movq %%mm2, %%mm6\n" /* r6 = __ __ __ FF */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 05 */
+ "pand %%mm4, %%mm6\n" /* r6 = __ __ __ 06 */
+ "movq %%mm7, 80(%[o])\n" /* partial R9 = __ __ __ 23 */
+ "pxor %%mm6, %%mm4\n" /* r4 = __ __ 07 __ */
+ "psrlq $32, %%mm1\n" /* r1 = __ __ 13 12 */
+ "por %%mm5, %%mm4\n" /* r4 = __ __ 07 05 */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pand %%mm2, %%mm1\n" /* r1 = __ __ __ 12 */
+ "movq 48(%[i]), %%mm5\n"
+ "psllq $16, %%mm0\n" /* r0 = 22 21 20 __ */
+ "pmullw 48(%[q]), %%mm5\n" /* r5 = 33 32 31 30 */
+ "pand %%mm0, %%mm7\n" /* r7 = 22 __ __ __ */
+ "movq %%mm1, 64(%[o])\n" /* partial R8 = __ __ __ 12 */
+ "por %%mm4, %%mm7\n" /* r7 = 22 __ 07 05 */
+ "movq %%mm3, %%mm4\n" /* r4 = 17 16 15 14 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 14 */
+ "movq "M(2)", %%mm1\n" /* r1 = __ FF __ __ */
+ "psllq $32, %%mm3\n" /* r3 = __ 14 __ __ */
+ "por %%mm3, %%mm7\n" /* r7 = 22 14 07 05 = R2 */
+ "movq %%mm5, %%mm3\n" /* r3 = 33 32 31 30 */
+ "psllq $48, %%mm3\n" /* r3 = 30 __ __ __ */
+ "pand %%mm0, %%mm1\n" /* r1 = __ 21 __ __ */
+ "movq %%mm7, 32(%[o])\n" /* write R2 = r7 */
+ "por %%mm3, %%mm6\n" /* r6 = 30 __ __ 06 */
+ "movq "M(1)", %%mm7\n" /* r7 = __ __ FF __ */
+ "por %%mm1, %%mm6\n" /* r6 = 30 21 __ 06 */
+ "movq 56(%[i]), %%mm1\n"
+ "pand %%mm4, %%mm7\n" /* r7 = __ __ 15 __ */
+ "pmullw 56(%[q]), %%mm1\n" /* r1 = 37 36 35 34 */
+ "por %%mm6, %%mm7\n" /* r7 = 30 21 15 06 = R3 */
+ "pand "M(1)", %%mm0\n" /* r0 = __ __ 20 __ */
+ "psrlq $32, %%mm4\n" /* r4 = __ __ 17 16 */
+ "movq %%mm7, 48(%[o])\n" /* write R3 = r7 */
+ "movq %%mm4, %%mm6\n" /* r6 = __ __ 17 16 */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pand %%mm2, %%mm4\n" /* r4 = __ __ __ 16 */
+ "movq "M(1)", %%mm3\n" /* r3 = __ __ FF __ */
+ "pand %%mm1, %%mm7\n" /* r7 = 37 __ __ __ */
+ "pand %%mm5, %%mm3\n" /* r3 = __ __ 31 __ */
+ "por %%mm4, %%mm0\n" /* r0 = __ __ 20 16 */
+ "psllq $16, %%mm3\n" /* r3 = __ 31 __ __ */
+ "por %%mm0, %%mm7\n" /* r7 = 37 __ 20 16 */
+ "movq "M(2)", %%mm4\n" /* r4 = __ FF __ __ */
+ "por %%mm3, %%mm7\n" /* r7 = 37 31 20 16 = R4 */
+ "movq 80(%[i]), %%mm0\n"
+ "movq %%mm4, %%mm3\n" /* r3 = __ __ FF __ */
+ "pmullw 80(%[q]), %%mm0\n" /* r0 = 53 52 51 50 */
+ "pand %%mm5, %%mm4\n" /* r4 = __ 32 __ __ */
+ "movq %%mm7, 8(%[o])\n" /* write R4 = r7 */
+ "por %%mm4, %%mm6\n" /* r6 = __ 32 17 16 */
+ "movq %%mm3, %%mm4\n" /* r4 = __ FF __ __ */
+ "psrlq $16, %%mm6\n" /* r6 = __ __ 32 17 */
+ "movq %%mm0, %%mm7\n" /* r7 = 53 52 51 50 */
+ "pand %%mm1, %%mm4\n" /* r4 = __ 36 __ __ */
+ "psllq $48, %%mm7\n" /* r7 = 50 __ __ __ */
+ "por %%mm4, %%mm6\n" /* r6 = __ 36 32 17 */
+ "movq 88(%[i]), %%mm4\n"
+ "por %%mm6, %%mm7\n" /* r7 = 50 36 32 17 = R5 */
+ "pmullw 88(%[q]), %%mm4\n" /* r4 = 57 56 55 54 */
+ "psrlq $16, %%mm3\n" /* r3 = __ __ FF __ */
+ "movq %%mm7, 24(%[o])\n" /* write R5 = r7 */
+ "pand %%mm1, %%mm3\n" /* r3 = __ __ 35 __ */
+ "psrlq $48, %%mm5\n" /* r5 = __ __ __ 33 */
+ "pand %%mm2, %%mm1\n" /* r1 = __ __ __ 34 */
+ "movq 104(%[i]), %%mm6\n"
+ "por %%mm3, %%mm5\n" /* r5 = __ __ 35 33 */
+ "pmullw 104(%[q]), %%mm6\n" /* r6 = 67 66 65 64 */
+ "psrlq $16, %%mm0\n" /* r0 = __ 53 52 51 */
+ "movq %%mm4, %%mm7\n" /* r7 = 57 56 55 54 */
+ "movq %%mm2, %%mm3\n" /* r3 = __ __ __ FF */
+ "psllq $48, %%mm7\n" /* r7 = 54 __ __ __ */
+ "pand %%mm0, %%mm3\n" /* r3 = __ __ __ 51 */
+ "pxor %%mm3, %%mm0\n" /* r0 = __ 53 52 __ */
+ "psllq $32, %%mm3\n" /* r3 = __ 51 __ __ */
+ "por %%mm5, %%mm7\n" /* r7 = 54 __ 35 33 */
+ "movq %%mm6, %%mm5\n" /* r5 = 67 66 65 64 */
+ "pand "M(1)", %%mm6\n" /* r6 = __ __ 65 __ */
+ "por %%mm3, %%mm7\n" /* r7 = 54 51 35 33 = R6 */
+ "psllq $32, %%mm6\n" /* r6 = 65 __ __ __ */
+ "por %%mm1, %%mm0\n" /* r0 = __ 53 52 34 */
+ "movq %%mm7, 40(%[o])\n" /* write R6 = r7 */
+ "por %%mm6, %%mm0\n" /* r0 = 65 53 52 34 = R7 */
+ "movq 120(%[i]), %%mm7\n"
+ "movq %%mm5, %%mm6\n" /* r6 = 67 66 65 64 */
+ "pmullw 120(%[q]), %%mm7\n" /* r7 = 77 76 75 74 */
+ "psrlq $32, %%mm5\n" /* r5 = __ __ 67 66 */
+ "pand %%mm2, %%mm6\n" /* r6 = __ __ __ 64 */
+ "movq %%mm5, %%mm1\n" /* r1 = __ __ 67 66 */
+ "movq %%mm0, 56(%[o])\n" /* write R7 = r0 */
+ "pand %%mm2, %%mm1\n" /* r1 = __ __ __ 66 */
+ "movq 112(%[i]), %%mm0\n"
+ "movq %%mm7, %%mm3\n" /* r3 = 77 76 75 74 */
+ "pmullw 112(%[q]), %%mm0\n" /* r0 = 73 72 71 70 */
+ "psllq $16, %%mm3\n" /* r3 = 76 75 74 __ */
+ "pand "M(3)", %%mm7\n" /* r7 = 77 __ __ __ */
+ "pxor %%mm1, %%mm5\n" /* r5 = __ __ 67 __ */
+ "por %%mm5, %%mm6\n" /* r6 = __ __ 67 64 */
+ "movq %%mm3, %%mm5\n" /* r5 = 76 75 74 __ */
+ "pand "M(3)", %%mm5\n" /* r5 = 76 __ __ __ */
+ "por %%mm1, %%mm7\n" /* r7 = 77 __ __ 66 */
+ "movq 96(%[i]), %%mm1\n"
+ "pxor %%mm5, %%mm3\n" /* r3 = __ 75 74 __ */
+ "pmullw 96(%[q]), %%mm1\n" /* r1 = 63 62 61 60 */
+ "por %%mm3, %%mm7\n" /* r7 = 77 75 74 66 = R15 */
+ "por %%mm5, %%mm6\n" /* r6 = 76 __ 67 64 */
+ "movq %%mm0, %%mm5\n" /* r5 = 73 72 71 70 */
+ "movq %%mm7, 120(%[o])\n" /* store R15 = r7 */
+ "psrlq $16, %%mm5\n" /* r5 = __ 73 72 71 */
+ "pand "M(2)", %%mm5\n" /* r5 = __ 73 __ __ */
+ "movq %%mm0, %%mm7\n" /* r7 = 73 72 71 70 */
+ "por %%mm5, %%mm6\n" /* r6 = 76 73 67 64 = R14 */
+ "pand %%mm2, %%mm0\n" /* r0 = __ __ __ 70 */
+ "pxor %%mm0, %%mm7\n" /* r7 = 73 72 71 __ */
+ "psllq $32, %%mm0\n" /* r0 = __ 70 __ __ */
+ "movq %%mm6, 104(%[o])\n" /* write R14 = r6 */
+ "psrlq $16, %%mm4\n" /* r4 = __ 57 56 55 */
+ "movq 72(%[i]), %%mm5\n"
+ "psllq $16, %%mm7\n" /* r7 = 72 71 __ __ */
+ "pmullw 72(%[q]), %%mm5\n" /* r5 = 47 46 45 44 */
+ "movq %%mm7, %%mm6\n" /* r6 = 72 71 __ __ */
+ "movq "M(2)", %%mm3\n" /* r3 = __ FF __ __ */
+ "psllq $16, %%mm6\n" /* r6 = 71 __ __ __ */
+ "pand "M(3)", %%mm7\n" /* r7 = 72 __ __ __ */
+ "pand %%mm1, %%mm3\n" /* r3 = __ 62 __ __ */
+ "por %%mm0, %%mm7\n" /* r7 = 72 70 __ __ */
+ "movq %%mm1, %%mm0\n" /* r0 = 63 62 61 60 */
+ "pand "M(3)", %%mm1\n" /* r1 = 63 __ __ __ */
+ "por %%mm3, %%mm6\n" /* r6 = 71 62 __ __ */
+ "movq %%mm4, %%mm3\n" /* r3 = __ 57 56 55 */
+ "psrlq $32, %%mm1\n" /* r1 = __ __ 63 __ */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 55 */
+ "por %%mm1, %%mm7\n" /* r7 = 72 70 63 __ */
+ "por %%mm3, %%mm7\n" /* r7 = 72 70 63 55 = R13 */
+ "movq %%mm4, %%mm3\n" /* r3 = __ 57 56 55 */
+ "pand "M(1)", %%mm3\n" /* r3 = __ __ 56 __ */
+ "movq %%mm5, %%mm1\n" /* r1 = 47 46 45 44 */
+ "movq %%mm7, 88(%[o])\n" /* write R13 = r7 */
+ "psrlq $48, %%mm5\n" /* r5 = __ __ __ 47 */
+ "movq 64(%[i]), %%mm7\n"
+ "por %%mm3, %%mm6\n" /* r6 = 71 62 56 __ */
+ "pmullw 64(%[q]), %%mm7\n" /* r7 = 43 42 41 40 */
+ "por %%mm5, %%mm6\n" /* r6 = 71 62 56 47 = R12 */
+ "pand "M(2)", %%mm4\n" /* r4 = __ 57 __ __ */
+ "psllq $32, %%mm0\n" /* r0 = 61 60 __ __ */
+ "movq %%mm6, 72(%[o])\n" /* write R12 = r6 */
+ "movq %%mm0, %%mm6\n" /* r6 = 61 60 __ __ */
+ "pand "M(3)", %%mm0\n" /* r0 = 61 __ __ __ */
+ "psllq $16, %%mm6\n" /* r6 = 60 __ __ __ */
+ "movq 40(%[i]), %%mm5\n"
+ "movq %%mm1, %%mm3\n" /* r3 = 47 46 45 44 */
+ "pmullw 40(%[q]), %%mm5\n" /* r5 = 27 26 25 24 */
+ "psrlq $16, %%mm1\n" /* r1 = __ 47 46 45 */
+ "pand "M(1)", %%mm1\n" /* r1 = __ __ 46 __ */
+ "por %%mm4, %%mm0\n" /* r0 = 61 57 __ __ */
+ "pand %%mm7, %%mm2\n" /* r2 = __ __ __ 40 */
+ "por %%mm1, %%mm0\n" /* r0 = 61 57 46 __ */
+ "por %%mm2, %%mm0\n" /* r0 = 61 57 46 40 = R11 */
+ "psllq $16, %%mm3\n" /* r3 = 46 45 44 __ */
+ "movq %%mm3, %%mm4\n" /* r4 = 46 45 44 __ */
+ "movq %%mm5, %%mm2\n" /* r2 = 27 26 25 24 */
+ "movq %%mm0, 112(%[o])\n" /* write R11 = r0 */
+ "psrlq $48, %%mm2\n" /* r2 = __ __ __ 27 */
+ "pand "M(2)", %%mm4\n" /* r4 = __ 45 __ __ */
+ "por %%mm2, %%mm6\n" /* r6 = 60 __ __ 27 */
+ "movq "M(1)", %%mm2\n" /* r2 = __ __ FF __ */
+ "por %%mm4, %%mm6\n" /* r6 = 60 45 __ 27 */
+ "pand %%mm7, %%mm2\n" /* r2 = __ __ 41 __ */
+ "psllq $32, %%mm3\n" /* r3 = 44 __ __ __ */
+ "por 80(%[o]), %%mm3\n" /* r3 = 44 __ __ 23 */
+ "por %%mm2, %%mm6\n" /* r6 = 60 45 41 27 = R10 */
+ "movq "M(3)", %%mm2\n" /* r2 = FF __ __ __ */
+ "psllq $16, %%mm5\n" /* r5 = 26 25 24 __ */
+ "movq %%mm6, 96(%[o])\n" /* store R10 = r6 */
+ "pand %%mm5, %%mm2\n" /* r2 = 26 __ __ __ */
+ "movq "M(2)", %%mm6\n" /* r6 = __ FF __ __ */
+ "pxor %%mm2, %%mm5\n" /* r5 = __ 25 24 __ */
+ "pand %%mm7, %%mm6\n" /* r6 = __ 42 __ __ */
+ "psrlq $32, %%mm2\n" /* r2 = __ __ 26 __ */
+ "pand "M(3)", %%mm7\n" /* r7 = 43 __ __ __ */
+ "por %%mm2, %%mm3\n" /* r3 = 44 __ 26 23 */
+ "por 64(%[o]), %%mm7\n" /* r7 = 43 __ __ 12 */
+ "por %%mm3, %%mm6\n" /* r6 = 44 42 26 23 = R9 */
+ "por %%mm5, %%mm7\n" /* r7 = 43 25 24 12 = R8 */
+ "movq %%mm6, 80(%[o])\n" /* store R9 = r6 */
+ "movq %%mm7, 64(%[o])\n" /* store R8 = r7 */
+
+ /* 123c ( / 64 coeffs < 2c / coeff) */
+
+/* Done w/dequant + descramble + partial transpose; now do the idct itself. */
+
+# define I( K) MtoSTR((K*16))"(%[o])"
+# define J( K) MtoSTR(((K - 4)*16)+8)"(%[o])"
+
+ RowIDCT /* 46 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K*16)+64)"(%[o])"
+# define J( K) MtoSTR(((K-4)*16)+72)"(%[o])"
+
+ RowIDCT /* 46 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K * 16))"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT /* 57 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K*16)+8)"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT /* 57 c */
+
+# undef I
+# undef J
+ /* 368 cycles ( / 64 coeff < 6 c / coeff) */
+
+ "emms\n"
+ :
+ :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
+ );
+}
+
+/**************************************************************************************
+ *
+ * Routine: MMX_idct10
+ *
+ * Description: Perform IDCT on a 8x8 block with at most 10 nonzero coefficients
+ *
+ * Input: Pointer to input and output buffer
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: The input coefficients are in transposed ZigZag order
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+/* --------------------------------------------------------------- */
+// This macro does four 4-sample one-dimensional idcts in parallel. Inputs
+// 4 thru 7 are assumed to be zero.
+#define BeginIDCT_10 "#BeginIDCT_10\n" \
+ " movq "I(3)",%%mm2\n" \
+ \
+ " movq "C(3)",%%mm6\n" \
+ " movq %%mm2,%%mm4\n" \
+ \
+ " movq "C(5)",%%mm1\n" \
+ " pmulhw %%mm6,%%mm4\n" \
+ \
+ " movq "I(1)",%%mm3\n" \
+ " pmulhw %%mm2,%%mm1\n" \
+ \
+ " movq "C(1)",%%mm0\n" \
+ " paddw %%mm2,%%mm4\n" \
+ \
+ " pxor %%mm6,%%mm6\n" \
+ " paddw %%mm1,%%mm2\n" \
+ \
+ " movq "I(2)",%%mm5\n" \
+ " pmulhw %%mm3,%%mm0\n" \
+ \
+ " movq %%mm5,%%mm1\n" \
+ " paddw %%mm3,%%mm0\n" \
+ \
+ " pmulhw "C(7)",%%mm3\n" \
+ " psubsw %%mm2,%%mm6\n" \
+ \
+ " pmulhw "C(2)",%%mm5\n" \
+ " psubsw %%mm4,%%mm0\n" \
+ \
+ " movq "I(2)",%%mm7\n" \
+ " paddsw %%mm4,%%mm4\n" \
+ \
+ " paddw %%mm5,%%mm7\n" \
+ " paddsw %%mm0,%%mm4\n" \
+ \
+ " pmulhw "C(6)",%%mm1\n" \
+ " psubsw %%mm6,%%mm3\n" \
+ \
+ " movq %%mm4,"I(1)"\n" \
+ " paddsw %%mm6,%%mm6\n" \
+ \
+ " movq "C(4)",%%mm4\n" \
+ " paddsw %%mm3,%%mm6\n" \
+ \
+ " movq %%mm3,%%mm5\n" \
+ " pmulhw %%mm4,%%mm3\n" \
+ \
+ " movq %%mm6,"I(2)"\n" \
+ " movq %%mm0,%%mm2\n" \
+ \
+ " movq "I(0)",%%mm6\n" \
+ " pmulhw %%mm4,%%mm0\n" \
+ \
+ " paddw %%mm3,%%mm5\n" \
+ " paddw %%mm0,%%mm2\n" \
+ \
+ " psubsw %%mm1,%%mm5\n" \
+ " pmulhw %%mm4,%%mm6\n" \
+ \
+ " paddw "I(0)",%%mm6\n" \
+ " paddsw %%mm1,%%mm1\n" \
+ \
+ " movq %%mm6,%%mm4\n" \
+ " paddsw %%mm5,%%mm1\n" \
+ \
+ " psubsw %%mm2,%%mm6\n" \
+ " paddsw %%mm2,%%mm2\n" \
+ \
+ " movq "I(1)",%%mm0\n" \
+ " paddsw %%mm6,%%mm2\n" \
+ \
+ " psubsw %%mm1,%%mm2\n" \
+ "#end BeginIDCT_10\n"
+// end BeginIDCT_10 macro (25 cycles).
+
+#define RowIDCT_10 "#RowIDCT_10\n" \
+ BeginIDCT_10 \
+ "\n" \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw %%mm3,%%mm3\n" \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " paddsw %%mm5,%%mm5\n" \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw %%mm0,%%mm0\n" \
+ " movq %%mm1,"I(1)"\n" /* save R1 */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ "#end RowIDCT_10\n"
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+// Column IDCT normalizes and stores final results.
+
+#define ColumnIDCT_10 "#ColumnIDCT_10\n" \
+ BeginIDCT_10 \
+ "\n" \
+ " paddsw "Eight",%%mm2\n" \
+ " paddsw %%mm1,%%mm1\n" /* r1 = H. + H. */ \
+ " paddsw %%mm2,%%mm1\n" /* r1 = R1 = A.. + H. */\
+ " psraw ""$4"",%%mm2\n" /* r2 = NR2 */ \
+ " psubsw %%mm7,%%mm4\n" /* r4 = E. = E - G */ \
+ " psraw ""$4"",%%mm1\n" /* r1 = NR1 */ \
+ " movq "I(2)",%%mm3\n" /* r3 = D. */ \
+ " paddsw %%mm7,%%mm7\n" /* r7 = G + G */ \
+ " movq %%mm2,"I(2)"\n" /* store NR2 at I2 */ \
+ " paddsw %%mm4,%%mm7\n" /* r7 = G. = E + G */ \
+ " movq %%mm1,"I(1)"\n" /* store NR1 at I1 */ \
+ " psubsw %%mm3,%%mm4\n" /* r4 = R4 = E. - D. */ \
+ " paddsw "Eight",%%mm4\n" \
+ " paddsw %%mm3,%%mm3\n" /* r3 = D. + D. */ \
+ " paddsw %%mm4,%%mm3\n" /* r3 = R3 = E. + D. */ \
+ " psraw ""$4"",%%mm4\n" /* r4 = NR4 */ \
+ " psubsw %%mm5,%%mm6\n" /* r6 = R6 = F. - B.. */\
+ " psraw ""$4"",%%mm3\n" /* r3 = NR3 */ \
+ " paddsw "Eight",%%mm6\n" \
+ " paddsw %%mm5,%%mm5\n" /* r5 = B.. + B.. */ \
+ " paddsw %%mm6,%%mm5\n" /* r5 = R5 = F. + B.. */\
+ " psraw ""$4"",%%mm6\n" /* r6 = NR6 */ \
+ " movq %%mm4,"J(4)"\n" /* store NR4 at J4 */ \
+ " psraw ""$4"",%%mm5\n" /* r5 = NR5 */ \
+ " movq %%mm3,"I(3)"\n" /* store NR3 at I3 */ \
+ " psubsw %%mm0,%%mm7\n" /* r7 = R7 = G. - C. */ \
+ " paddsw "Eight",%%mm7\n" \
+ " paddsw %%mm0,%%mm0\n" /* r0 = C. + C. */ \
+ " paddsw %%mm7,%%mm0\n" /* r0 = R0 = G. + C. */ \
+ " psraw ""$4"",%%mm7\n" /* r7 = NR7 */ \
+ " movq %%mm6,"J(6)"\n" /* store NR6 at J6 */ \
+ " psraw ""$4"",%%mm0\n" /* r0 = NR0 */ \
+ " movq %%mm5,"J(5)"\n" /* store NR5 at J5 */ \
+ \
+ " movq %%mm7,"J(7)"\n" /* store NR7 at J7 */ \
+ \
+ " movq %%mm0,"I(0)"\n" /* store NR0 at I0 */ \
+ "#end ColumnIDCT_10\n"
+
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+/* --------------------------------------------------------------- */
+
+
+/* --------------------------------------------------------------- */
+/* IDCT 10 */
+void IDct10__mmx( ogg_int16_t *in,
+ ogg_int16_t *q,
+ ogg_int16_t *out ) {
+
+ __asm__ __volatile__ (
+
+ "movq (%[i]), %%mm0\n"
+ "pmullw (%[q]), %%mm0\n" /* r0 = 03 02 01 00 */
+ "movq 16(%[i]), %%mm1\n"
+ "pmullw 16(%[q]), %%mm1\n" /* r1 = 13 12 11 10 */
+ "movq "M(0)", %%mm2\n" /* r2 = __ __ __ FF */
+ "movq %%mm0, %%mm3\n" /* r3 = 03 02 01 00 */
+ "movq 8(%[i]), %%mm4\n"
+ "psrlq $16, %%mm0\n" /* r0 = __ 03 02 01 */
+ "pmullw 8(%[q]), %%mm4\n" /* r4 = 07 06 05 04 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 00 */
+ "movq %%mm0, %%mm5\n" /* r5 = __ 03 02 01 */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 01 */
+ "psllq $32, %%mm1\n" /* r1 = 11 10 __ __ */
+ "movq "M(3)", %%mm7\n" /* r7 = FF __ __ __ */
+ "pxor %%mm5, %%mm0\n" /* r0 = __ 03 02 __ */
+ "pand %%mm1, %%mm7\n" /* r7 = 11 __ __ __ */
+ "por %%mm3, %%mm0\n" /* r0 = __ 03 02 00 */
+ "pxor %%mm7, %%mm1\n" /* r1 = __ 10 __ __ */
+ "por %%mm7, %%mm0\n" /* r0 = 11 03 02 00 = R0 */
+ "movq %%mm4, %%mm3\n" /* r3 = 07 06 05 04 */
+ "movq %%mm0, (%[o])\n" /* write R0 = r0 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 04 */
+ "psllq $16, %%mm3\n" /* r3 = __ __ 04 __ */
+ "por %%mm3, %%mm5\n" /* r5 = __ __ 04 01 */
+ "por %%mm5, %%mm1\n" /* r1 = __ 10 04 01 = R1 */
+ "psrlq $16, %%mm4\n" /* r4 = __ 07 06 05 */
+ "movq %%mm1, 16(%[o])\n" /* write R1 = r1 */
+ "movq %%mm4, %%mm5\n" /* r5 = __ 07 06 05 */
+ "psrlq $16, %%mm4\n" /* r4 = __ __ 07 06 */
+ "movq %%mm2, %%mm6\n" /* r6 = __ __ __ FF */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 05 */
+ "pand %%mm4, %%mm6\n" /* r6 = __ __ __ 06 */
+ "pxor %%mm6, %%mm4\n" /* r4 = __ __ 07 __ */
+ "por %%mm5, %%mm4\n" /* r4 = __ __ 07 05 */
+ "movq %%mm4, 32(%[o])\n" /* write R2 = r4 */
+ "movq %%mm6, 48(%[o])\n" /* write R3 = r6 */
+
+# define I( K) MtoSTR((K*16))"(%[o])"
+# define J( K) MtoSTR(((K - 4) * 16)+8)"(%[o])"
+
+ RowIDCT_10 /* 33 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+
+# define I( K) MtoSTR((K * 16))"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_10 /* 44 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K * 16)+8)"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_10 /* 44 c */
+
+# undef I
+# undef J
+
+ "emms\n"
+ :
+ :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
+ );
+}
+
+/**************************************************************************************
+ *
+ * Routine: MMX_idct3
+ *
+ * Description: Perform IDCT on a 8x8 block with at most 3 nonzero coefficients
+ *
+ * Input: Pointer to input and output buffer
+ *
+ * Output: None
+ *
+ * Return: None
+ *
+ * Special Note: Only works for three nonzero coefficients.
+ *
+ * Error: None
+ *
+ ***************************************************************************************
+ */
+/***************************************************************************************
+ In IDCT 3, we are dealing with only three Non-Zero coefficients in the 8x8 block.
+ In the case that we work in the fashion RowIDCT -> ColumnIDCT, we only have to
+ do 1-D row idcts on the first two rows, the rest six rows remain zero anyway.
+ After row IDCTs, since every column could have nonzero coefficients, we need do
+ eight 1-D column IDCT. However, for each column, there are at most two nonzero
+ coefficients, coefficient 0 and coefficient 1. Same for the coefficents for the
+ two 1-d row idcts. For this reason, the process of a 1-D IDCT is simplified
+
+ from a full version:
+
+ A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
+ C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
+ A. = C4 * (A - C) B. = C4 * (B - D)
+ C. = A + C D. = B + D
+
+ E = C4 * (I0 + I4) F = C4 * (I0 - I4)
+ G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
+ E. = E - G
+ G. = E + G
+
+ A.. = F + A. B.. = B. - H
+ F. = F - A. H. = B. + H
+
+ R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
+ R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
+
+ To:
+
+
+ A = (C1 * I1) B = (C7 * I1)
+ C = 0 D = 0
+ A. = C4 * A B. = C4 * B
+ C. = A D. = B
+
+ E = C4 * I0 F = E
+ G = 0 H = 0
+ E. = E
+ G. = E
+
+ A.. = E + A. B.. = B.
+ F. = E - A. H. = B.
+
+ R0 = E + A R1 = E + A. + B. R3 = E + B R5 = E - A. + B.
+ R7 = E - A R2 = E + A. - B. R4 = E - B R6 = F - A. - B.
+
+******************************************************************************************/
+
+#define RowIDCT_3 "#RowIDCT_3\n"\
+ " movq "I(1)",%%mm7\n" /* r7 = I1 */ \
+ " movq "C(1)",%%mm0\n" /* r0 = C1 */ \
+ " movq "C(7)",%%mm3\n" /* r3 = C7 */ \
+ " pmulhw %%mm7,%%mm0\n" /* r0 = C1 * I1 - I1 */ \
+ " pmulhw %%mm7,%%mm3\n" /* r3 = C7 * I1 = B, D. */ \
+ " movq "I(0)",%%mm6\n" /* r6 = I0 */ \
+ " movq "C(4)",%%mm4\n" /* r4 = C4 */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = C1 * I1 = A, C. */ \
+ " movq %%mm6,%%mm1\n" /* make a copy of I0 */ \
+ " pmulhw %%mm4,%%mm6\n" /* r2 = C4 * I0 - I0 */ \
+ " movq %%mm0,%%mm2\n" /* make a copy of A */ \
+ " movq %%mm3,%%mm5\n" /* make a copy of B */ \
+ " pmulhw %%mm4,%%mm2\n" /* r2 = C4 * A - A */ \
+ " pmulhw %%mm4,%%mm5\n" /* r5 = C4 * B - B */ \
+ " paddw %%mm1,%%mm6\n" /* r2 = C4 * I0 = E, F */ \
+ " movq %%mm6,%%mm4\n" /* r4 = E */ \
+ " paddw %%mm0,%%mm2\n" /* r2 = A. */ \
+ " paddw %%mm3,%%mm5\n" /* r5 = B. */ \
+ " movq %%mm6,%%mm7\n" /* r7 = E */ \
+ " movq %%mm5,%%mm1\n" /* r1 = B. */ \
+ /* r0 = A */ \
+ /* r3 = B */ \
+ /* r2 = A. */ \
+ /* r5 = B. */ \
+ /* r6 = E */ \
+ /* r4 = E */ \
+ /* r7 = E */ \
+ /* r1 = B. */ \
+ " psubw %%mm2,%%mm6\n" /* r6 = E - A. */ \
+ " psubw %%mm3,%%mm4\n" /* r4 = E - B ----R4 */ \
+ " psubw %%mm0,%%mm7\n" /* r7 = E - A ----R7 */ \
+ " paddw %%mm2,%%mm2\n" /* r2 = A. + A. */ \
+ " paddw %%mm3,%%mm3\n" /* r3 = B + B */ \
+ " paddw %%mm0,%%mm0\n" /* r0 = A + A */ \
+ " paddw %%mm6,%%mm2\n" /* r2 = E + A. */ \
+ " paddw %%mm4,%%mm3\n" /* r3 = E + B ----R3 */ \
+ " psubw %%mm1,%%mm2\n" /* r2 = E + A. - B. ----R2 */ \
+ " psubw %%mm5,%%mm6\n" /* r6 = E - A. - B. ----R6 */ \
+ " paddw %%mm1,%%mm1\n" /* r1 = B. + B. */ \
+ " paddw %%mm5,%%mm5\n" /* r5 = B. + B. */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = E + A ----R0 */ \
+ " paddw %%mm2,%%mm1\n" /* r1 = E + A. + B. -----R1 */ \
+ " movq %%mm1,"I(1)"\n" /* save r1 */ \
+ " paddw %%mm6,%%mm5\n" /* r5 = E - A. + B. -----R5 */ \
+ "#end RowIDCT_3\n"
+//End of RowIDCT_3
+
+#define ColumnIDCT_3 "#ColumnIDCT_3\n"\
+ " movq "I(1)",%%mm7\n" /* r7 = I1 */ \
+ " movq "C(1)",%%mm0\n" /* r0 = C1 */ \
+ " movq "C(7)",%%mm3\n" /* r3 = C7 */ \
+ " pmulhw %%mm7,%%mm0\n" /* r0 = C1 * I1 - I1 */ \
+ " pmulhw %%mm7,%%mm3\n" /* r3 = C7 * I1 = B, D. */ \
+ " movq "I(0)",%%mm6\n" /* r6 = I0 */ \
+ " movq "C(4)",%%mm4\n" /* r4 = C4 */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = C1 * I1 = A, C. */ \
+ " movq %%mm6,%%mm1\n" /* make a copy of I0 */ \
+ " pmulhw %%mm4,%%mm6\n" /* r2 = C4 * I0 - I0 */ \
+ " movq %%mm0,%%mm2\n" /* make a copy of A */ \
+ " movq %%mm3,%%mm5\n" /* make a copy of B */ \
+ " pmulhw %%mm4,%%mm2\n" /* r2 = C4 * A - A */ \
+ " pmulhw %%mm4,%%mm5\n" /* r5 = C4 * B - B */ \
+ " paddw %%mm1,%%mm6\n" /* r2 = C4 * I0 = E, F */ \
+ " movq %%mm6,%%mm4\n" /* r4 = E */ \
+ " paddw "Eight",%%mm6\n" /* +8 for shift */ \
+ " paddw "Eight",%%mm4\n" /* +8 for shift */ \
+ " paddw %%mm0,%%mm2\n" /* r2 = A. */ \
+ " paddw %%mm3,%%mm5\n" /* r5 = B. */ \
+ " movq %%mm6,%%mm7\n" /* r7 = E */ \
+ " movq %%mm5,%%mm1\n" /* r1 = B. */ \
+/* r0 = A */ \
+/* r3 = B */ \
+/* r2 = A. */ \
+/* r5 = B. */ \
+/* r6 = E */ \
+/* r4 = E */ \
+/* r7 = E */ \
+/* r1 = B. */ \
+ " psubw %%mm2,%%mm6\n" /* r6 = E - A. */ \
+ " psubw %%mm3,%%mm4\n" /* r4 = E - B ----R4 */ \
+ " psubw %%mm0,%%mm7\n" /* r7 = E - A ----R7 */ \
+ " paddw %%mm2,%%mm2\n" /* r2 = A. + A. */ \
+ " paddw %%mm3,%%mm3\n" /* r3 = B + B */ \
+ " paddw %%mm0,%%mm0\n" /* r0 = A + A */ \
+ " paddw %%mm6,%%mm2\n" /* r2 = E + A. */ \
+ " paddw %%mm4,%%mm3\n" /* r3 = E + B ----R3 */ \
+ " psraw $4,%%mm4\n" /* shift */ \
+ " movq %%mm4,"J(4)"\n" /* store R4 at J4 */ \
+ " psraw $4,%%mm3\n" /* shift */ \
+ " movq %%mm3,"I(3)"\n" /* store R3 at I3 */ \
+ " psubw %%mm1,%%mm2\n" /* r2 = E + A. - B. ----R2 */ \
+ " psubw %%mm5,%%mm6\n" /* r6 = E - A. - B. ----R6 */ \
+ " paddw %%mm1,%%mm1\n" /* r1 = B. + B. */ \
+ " paddw %%mm5,%%mm5\n" /* r5 = B. + B. */ \
+ " paddw %%mm7,%%mm0\n" /* r0 = E + A ----R0 */ \
+ " paddw %%mm2,%%mm1\n" /* r1 = E + A. + B. -----R1 */ \
+ " psraw $4,%%mm7\n" /* shift */ \
+ " psraw $4,%%mm2\n" /* shift */ \
+ " psraw $4,%%mm0\n" /* shift */ \
+ " psraw $4,%%mm1\n" /* shift */ \
+ " movq %%mm7,"J(7)"\n" /* store R7 to J7 */ \
+ " movq %%mm0,"I(0)"\n" /* store R0 to I0 */ \
+ " movq %%mm1,"I(1)"\n" /* store R1 to I1 */ \
+ " movq %%mm2,"I(2)"\n" /* store R2 to I2 */ \
+ " movq %%mm1,"I(1)"\n" /* save r1 */ \
+ " paddw %%mm6,%%mm5\n" /* r5 = E - A. + B. -----R5 */ \
+ " psraw $4,%%mm5\n" /* shift */ \
+ " movq %%mm5,"J(5)"\n" /* store R5 at J5 */ \
+ " psraw $4,%%mm6\n" /* shift */ \
+ " movq %%mm6,"J(6)"\n" /* store R6 at J6 */ \
+ "#end ColumnIDCT_3\n"
+//End of ColumnIDCT_3
+
+void IDct3__mmx( ogg_int16_t *in,
+ ogg_int16_t *q,
+ ogg_int16_t *out ) {
+
+ __asm__ __volatile__ (
+
+ "movq (%[i]), %%mm0\n"
+ "pmullw (%[q]), %%mm0\n" /* r0 = 03 02 01 00 */
+ "movq "M(0)", %%mm2\n" /* r2 = __ __ __ FF */
+ "movq %%mm0, %%mm3\n" /* r3 = 03 02 01 00 */
+ "psrlq $16, %%mm0\n" /* r0 = __ 03 02 01 */
+ "pand %%mm2, %%mm3\n" /* r3 = __ __ __ 00 */
+ "movq %%mm0, %%mm5\n" /* r5 = __ 03 02 01 */
+ "pand %%mm2, %%mm5\n" /* r5 = __ __ __ 01 */
+ "pxor %%mm5, %%mm0\n" /* r0 = __ 03 02 __ */
+ "por %%mm3, %%mm0\n" /* r0 = __ 03 02 00 */
+ "movq %%mm0, (%[o])\n" /* write R0 = r0 */
+ "movq %%mm5, 16(%[o])\n" /* write R1 = r5 */
+
+/* Done partial transpose; now do the idct itself. */
+
+# define I( K) MtoSTR((K*16))"(%[o])"
+# define J( K) MtoSTR(((K - 4)*16)+8)"(%[o])"
+
+ RowIDCT_3 /* 33 c */
+ Transpose /* 19 c */
+
+# undef I
+# undef J
+
+# define I( K) MtoSTR((K * 16))"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_3 /* 44 c */
+
+# undef I
+# undef J
+# define I( K) MtoSTR((K*16)+8)"(%[o])"
+# define J( K) I( K)
+
+ ColumnIDCT_3 /* 44 c */
+
+# undef I
+# undef J
+
+ "emms\n"
+ :
+ :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
+ );
+
+}
+
+/* install our implementation in the function table */
void dsp_mmx_idct_init(DspFunctions *funcs)
{
+ funcs->IDctSlow = IDctSlow__mmx;
+ funcs->IDct10 = IDct10__mmx;
+ funcs->IDct3 = IDct3__mmx;
}
#endif /* USE_ASM */
+
+
More information about the commits
mailing list