[xiph-commits] r15153 - in trunk/theora: examples include/theora lib lib/dec lib/dec/x86 lib/dec/x86_vc lib/enc lib/enc/x86_32 lib/enc/x86_32_vs lib/enc/x86_64
tterribe at svn.xiph.org
Mon Aug 4 11:37:56 PDT 2008
Author: tterribe
Date: 2008-08-04 11:37:55 -0700 (Mon, 04 Aug 2008)
New Revision: 15153
Modified:
trunk/theora/examples/player_example.c
trunk/theora/include/theora/codec.h
trunk/theora/lib/cpu.c
trunk/theora/lib/dec/bitwise.c
trunk/theora/lib/dec/bitwise.h
trunk/theora/lib/dec/decapiwrapper.c
trunk/theora/lib/dec/decint.h
trunk/theora/lib/dec/decode.c
trunk/theora/lib/dec/dequant.c
trunk/theora/lib/dec/idct.c
trunk/theora/lib/dec/quant.c
trunk/theora/lib/dec/quant.h
trunk/theora/lib/dec/state.c
trunk/theora/lib/dec/x86/mmxfrag.c
trunk/theora/lib/dec/x86/mmxstate.c
trunk/theora/lib/dec/x86_vc/mmxidct.c
trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
trunk/theora/lib/dec/x86_vc/mmxstate.c
trunk/theora/lib/dec/x86_vc/x86state.c
trunk/theora/lib/enc/codec_internal.h
trunk/theora/lib/enc/dct_decode.c
trunk/theora/lib/enc/dct_encode.c
trunk/theora/lib/enc/dsp.c
trunk/theora/lib/enc/dsp.h
trunk/theora/lib/enc/encode.c
trunk/theora/lib/enc/encoder_quant.c
trunk/theora/lib/enc/encoder_toplevel.c
trunk/theora/lib/enc/frarray.c
trunk/theora/lib/enc/frinit.c
trunk/theora/lib/enc/mcomp.c
trunk/theora/lib/enc/x86_32/dct_decode_mmx.c
trunk/theora/lib/enc/x86_32/dsp_mmx.c
trunk/theora/lib/enc/x86_32/dsp_mmxext.c
trunk/theora/lib/enc/x86_32/fdct_mmx.c
trunk/theora/lib/enc/x86_32/idct_mmx.c
trunk/theora/lib/enc/x86_32/recon_mmx.c
trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c
trunk/theora/lib/enc/x86_32_vs/recon_mmx.c
trunk/theora/lib/enc/x86_64/dct_decode_mmx.c
trunk/theora/lib/enc/x86_64/dsp_mmx.c
trunk/theora/lib/enc/x86_64/dsp_mmxext.c
trunk/theora/lib/enc/x86_64/fdct_mmx.c
trunk/theora/lib/enc/x86_64/recon_mmx.c
trunk/theora/lib/internal.h
Log:
Remove all TH_DEBUG statements.
They required variadic macros, which are not standard in C90.
They also cluttered up the code, and were unlikely to be maintained properly
anyway.
Also, remove all the tabs and trailing whitespace, etc., that xiphmont gunked
up my code with.
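
For context (not part of the patch): variadic macros such as the removed TH_DEBUG were only added to the language in C99, so a strictly C90 build has to route this kind of logging through a real variadic function instead. A minimal sketch of the trade-off the log message describes, where the exact TH_DEBUG definition is assumed rather than shown in this diff, and debugout is the file handle from the removed decapiwrapper.c code:

#include <stdarg.h>
#include <stdio.h>

/*Assumed shape of the removed macro; this line itself requires C99.*/
#define TH_DEBUG(...) fprintf(debugout,__VA_ARGS__)

/*The closest C90-clean substitute: a real variadic function, which is
  exactly the extra plumbing the debug statements were not worth.*/
static FILE *debugout;
static void th_debug(const char *_fmt,...){
  va_list ap;
  va_start(ap,_fmt);
  vfprintf(debugout,_fmt,ap);
  va_end(ap);
}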
Modified: trunk/theora/examples/player_example.c
===================================================================
--- trunk/theora/examples/player_example.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/examples/player_example.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -567,6 +567,10 @@
/* and now we have it all. initialize decoders */
if(theora_p){
+ ti.offset_x=0;
+ ti.offset_y=0;
+ ti.frame_width=ti.width;
+ ti.frame_height=ti.height;
theora_decode_init(&td,&ti);
printf("Ogg logical stream %lx is Theora %dx%d %.02f fps",
to.serialno,ti.width,ti.height,
Modified: trunk/theora/include/theora/codec.h
===================================================================
--- trunk/theora/include/theora/codec.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/include/theora/codec.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -112,13 +112,18 @@
* specification</a>, Section 4.4, for details on the precise sample
* locations.*/
typedef enum{
- /**Chroma decimation by 2 in both the X and Y directions (4:2:0).*/
+ /**Chroma decimation by 2 in both the X and Y directions (4:2:0).
+ The Cb and Cr chroma planes are half the width and half the height of the
+ luma plane.*/
TH_PF_420,
/**Currently reserved.*/
TH_PF_RSVD,
- /**Chroma decimation by 2 in the X direction (4:2:2).*/
+ /**Chroma decimation by 2 in the X direction (4:2:2).
+ The Cb and Cr chroma planes are half the width of the luma plane, but full
+ height.*/
TH_PF_422,
- /**No chroma decimation (4:4:4).*/
+ /**No chroma decimation (4:4:4).
+ The Cb and Cr chroma planes are full width and full height.*/
TH_PF_444,
/**The total number of currently defined pixel formats.*/
TH_PF_NFORMATS
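
The expanded comments pin down the chroma plane geometry exactly. A minimal sketch (not part of the patch; chroma_plane_size is an illustrative helper name) of the plane sizes they imply, using the TH_PF_* values documented above; Theora frame dimensions are multiples of 16, so the shifts are exact:

/*Chroma plane dimensions implied by the documentation above.*/
static void chroma_plane_size(int _pixel_fmt,
 int _frame_width,int _frame_height,int *_cw,int *_ch){
  *_cw=_frame_width>>(_pixel_fmt!=TH_PF_444);  /*Halved for 4:2:0 and 4:2:2.*/
  *_ch=_frame_height>>(_pixel_fmt==TH_PF_420); /*Halved for 4:2:0 only.*/
}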
Modified: trunk/theora/lib/cpu.c
===================================================================
--- trunk/theora/lib/cpu.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/cpu.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -220,24 +220,6 @@
/*Implement me.*/
flags=0;
}
-# if defined(DEBUG)
- if(flags){
- TH_DEBUG("vectorized instruction sets supported:");
- if(flags&OC_CPU_X86_MMX)TH_DEBUG(" mmx");
- if(flags&OC_CPU_X86_MMXEXT)TH_DEBUG(" mmxext");
- if(flags&OC_CPU_X86_SSE)TH_DEBUG(" sse");
- if(flags&OC_CPU_X86_SSE2)TH_DEBUG(" sse2");
- if(flags&OC_CPU_X86_3DNOW)TH_DEBUG(" 3dnow");
- if(flags&OC_CPU_X86_3DNOWEXT)TH_DEBUG(" 3dnowext");
- if(flags&OC_CPU_X86_PNI)TH_DEBUG(" pni");
- if(flags&OC_CPU_X86_SSSE3)TH_DEBUG(" ssse3");
- if(flags&OC_CPU_X86_SSE4_1)TH_DEBUG(" sse4_1");
- if(flags&OC_CPU_X86_SSE4_2)TH_DEBUG(" sse4_2");
- if(flags&OC_CPU_X86_SSE4A)TH_DEBUG(" sse4a");
- if(flags&OC_CPU_X86_SSE5)TH_DEBUG(" sse5");
- TH_DEBUG("\n");
- }
-# endif
return flags;
}
#endif
Modified: trunk/theora/lib/dec/bitwise.c
===================================================================
--- trunk/theora/lib/dec/bitwise.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/bitwise.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -15,112 +15,107 @@
********************************************************************/
-/* We're 'MSb' endian; if we write a word but read individual bits,
- then we'll read the msb first */
+/*We're 'MSb' endian; if we write a word but read individual bits,
+ then we'll read the MSb first.*/
#include <string.h>
#include <stdlib.h>
#include "bitwise.h"
-void theorapackB_reset(oggpack_buffer *b){
- b->ptr=b->buffer;
- b->buffer[0]=0;
- b->endbit=b->endbyte=0;
+void theorapackB_readinit(oggpack_buffer *_b,unsigned char *_buf,int _bytes){
+ memset(_b,0,sizeof(*_b));
+ _b->buffer=_b->ptr=_buf;
+ _b->storage=_bytes;
}
-void theorapackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes){
- memset(b,0,sizeof(*b));
- b->buffer=b->ptr=buf;
- b->storage=bytes;
-}
-
-int theorapackB_look1(oggpack_buffer *b,long *_ret){
- if(b->endbyte>=b->storage){
+int theorapackB_look1(oggpack_buffer *_b,long *_ret){
+ if(_b->endbyte>=_b->storage){
*_ret=0L;
return -1;
}
- *_ret=((b->ptr[0]>>(7-b->endbit))&1);
+ *_ret=(_b->ptr[0]>>7-_b->endbit)&1;
return 0;
}
-void theorapackB_adv1(oggpack_buffer *b){
- if(++(b->endbit)>7){
- b->endbit=0;
- b->ptr++;
- b->endbyte++;
+void theorapackB_adv1(oggpack_buffer *_b){
+ if(++(_b->endbit)>7){
+ _b->endbit=0;
+ _b->ptr++;
+ _b->endbyte++;
}
}
-/* bits <= 32 */
-int theorapackB_read(oggpack_buffer *b,int bits,long *_ret){
+/*Here we assume that 0<=_bits&&_bits<=32.*/
+int theorapackB_read(oggpack_buffer *_b,int _bits,long *_ret){
long ret;
long m;
+ long d;
int fail;
- m=32-bits;
- bits+=b->endbit;
- if(b->endbyte+4>=b->storage){
- /* not the main path */
- if(b->endbyte*8+bits>b->storage*8){
+ m=32-_bits;
+ _bits+=_b->endbit;
+ d=_b->storage-_b->endbyte;
+ if(d<=4){
+ /*Not the main path.*/
+ if(d*8<_bits){
*_ret=0L;
fail=-1;
goto overflow;
}
- /* special case to avoid reading b->ptr[0], which might be past the end of
- the buffer; also skips some useless accounting */
- else if(!bits){
+ /*Special case to avoid reading _b->ptr[0], which might be past the end of
+ the buffer; also skips some useless accounting.*/
+ else if(!_bits){
*_ret=0L;
return 0;
}
}
- ret=b->ptr[0]<<(24+b->endbit);
- if(bits>8){
- ret|=b->ptr[1]<<(16+b->endbit);
- if(bits>16){
- ret|=b->ptr[2]<<(8+b->endbit);
- if(bits>24){
- ret|=b->ptr[3]<<(b->endbit);
- if(bits>32 && b->endbit)
- ret|=b->ptr[4]>>(8-b->endbit);
+ ret=_b->ptr[0]<<24+_b->endbit;
+ if(_bits>8){
+ ret|=_b->ptr[1]<<16+_b->endbit;
+ if(_bits>16){
+ ret|=_b->ptr[2]<<8+_b->endbit;
+ if(_bits>24){
+ ret|=_b->ptr[3]<<_b->endbit;
+ if(_bits>32)ret|=_b->ptr[4]>>8-_b->endbit;
}
}
}
- *_ret=((ret&0xffffffffUL)>>(m>>1))>>((m+1)>>1);
+ *_ret=((ret&0xFFFFFFFFUL)>>(m>>1))>>(m+1>>1);
fail=0;
overflow:
- b->ptr+=bits/8;
- b->endbyte+=bits/8;
- b->endbit=bits&7;
+ _b->ptr+=_bits>>3;
+ _b->endbyte+=_bits>>3;
+ _b->endbit=_bits&7;
return fail;
}
-int theorapackB_read1(oggpack_buffer *b,long *_ret){
+int theorapackB_read1(oggpack_buffer *_b,long *_ret){
int fail;
- if(b->endbyte>=b->storage){
- /* not the main path */
+ if(_b->endbyte>=_b->storage){
+ /*Not the main path.*/
*_ret=0L;
fail=-1;
- goto overflow;
}
- *_ret=(b->ptr[0]>>(7-b->endbit))&1;
- fail=0;
-overflow:
- b->endbit++;
- if(b->endbit>7){
- b->endbit=0;
- b->ptr++;
- b->endbyte++;
+ else{
+ *_ret=(_b->ptr[0]>>7-_b->endbit)&1;
+ fail=0;
}
+ _b->endbit++;
+ if(_b->endbit>7){
+ _b->endbit=0;
+ _b->ptr++;
+ _b->endbyte++;
+ }
return fail;
}
-long theorapackB_bytes(oggpack_buffer *b){
- return(b->endbyte+(b->endbit+7)/8);
+long theorapackB_bytes(oggpack_buffer *_b){
+ return _b->endbyte+(_b->endbit+7>>3);
}
-long theorapackB_bits(oggpack_buffer *b){
- return(b->endbyte*8+b->endbit);
+long theorapackB_bits(oggpack_buffer *_b){
+ return _b->endbyte*8+_b->endbit;
}
-unsigned char *theorapackB_get_buffer(oggpack_buffer *b){
- return(b->buffer);
+unsigned char *theorapackB_get_buffer(oggpack_buffer *_b){
+ return _b->buffer;
}
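
The rewritten read path packs up to five bytes into a 32-bit accumulator MSb first and then peels the top _bits off with a pair of shifts; the shift is split into (m>>1) and (m+1>>1) so a shift count of 32 never occurs when _bits is 0. A minimal sketch (not part of the patch; msb_peek is an illustrative name) of that fast path, assuming the five byte reads stay inside the buffer:

/*MSb-first extraction of _bits bits starting _endbit bits into _ptr[0],
  without the end-of-buffer bookkeeping theorapackB_read() performs.*/
static long msb_peek(const unsigned char *_ptr,int _endbit,int _bits){
  unsigned long ret;
  int           m;
  m=32-_bits;
  _bits+=_endbit;
  ret=(unsigned long)_ptr[0]<<24+_endbit;
  if(_bits>8)ret|=(unsigned long)_ptr[1]<<16+_endbit;
  if(_bits>16)ret|=(unsigned long)_ptr[2]<<8+_endbit;
  if(_bits>24)ret|=(unsigned long)_ptr[3]<<_endbit;
  if(_bits>32)ret|=(unsigned long)_ptr[4]>>8-_endbit;
  return (long)(((ret&0xFFFFFFFFUL)>>(m>>1))>>(m+1>>1));
}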
Modified: trunk/theora/lib/dec/bitwise.h
===================================================================
--- trunk/theora/lib/dec/bitwise.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/bitwise.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -18,59 +18,61 @@
# define _bitwise_H (1)
# include <ogg/ogg.h>
-void theorapackB_reset(oggpack_buffer *b);
-void theorapackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes);
-/* Read in bits without advancing the bitptr; bits <= 32 */
-static int theorapackB_look(oggpack_buffer *b,int bits,long *_ret);
-int theorapackB_look1(oggpack_buffer *b,long *_ret);
-static void theorapackB_adv(oggpack_buffer *b,int bits);
-void theorapackB_adv1(oggpack_buffer *b);
-/* bits <= 32 */
-int theorapackB_read(oggpack_buffer *b,int bits,long *_ret);
-int theorapackB_read1(oggpack_buffer *b,long *_ret);
-long theorapackB_bytes(oggpack_buffer *b);
-long theorapackB_bits(oggpack_buffer *b);
-unsigned char *theorapackB_get_buffer(oggpack_buffer *b);
+void theorapackB_readinit(oggpack_buffer *_b,unsigned char *_buf,int _bytes);
+/*Read in bits without advancing the bitptr.
+ Here we assume 0<=_bits&&_bits<=32.*/
+static int theorapackB_look(oggpack_buffer *_b,int _bits,long *_ret);
+int theorapackB_look1(oggpack_buffer *_b,long *_ret);
+static void theorapackB_adv(oggpack_buffer *_b,int _bits);
+void theorapackB_adv1(oggpack_buffer *_b);
+/*Here we assume 0<=_bits&&_bits<=32.*/
+int theorapackB_read(oggpack_buffer *_b,int _bits,long *_ret);
+int theorapackB_read1(oggpack_buffer *_b,long *_ret);
+long theorapackB_bytes(oggpack_buffer *_b);
+long theorapackB_bits(oggpack_buffer *_b);
+unsigned char *theorapackB_get_buffer(oggpack_buffer *_b);
/*These two functions are only used in one place, and declaring them static so
they can be inlined saves considerable function call overhead.*/
-/* Read in bits without advancing the bitptr; bits <= 32 */
-static int theorapackB_look(oggpack_buffer *b,int bits,long *_ret){
+/*Read in bits without advancing the bitptr.
+ Here we assume 0<=_bits&&_bits<=32.*/
+static int theorapackB_look(oggpack_buffer *_b,int _bits,long *_ret){
long ret;
long m;
- m=32-bits;
- bits+=b->endbit;
- if(b->endbyte+4>=b->storage){
- /* not the main path */
- if(b->endbyte>=b->storage){
+ long d;
+ m=32-_bits;
+ _bits+=_b->endbit;
+ d=_b->storage-_b->endbyte;
+ if(d<=4){
+ /*Not the main path.*/
+ if(d<=0){
*_ret=0L;
- return -1;
+ return -(_bits>d*8);
}
/*If we have some bits left, but not enough, return the ones we have.*/
- if((b->storage-b->endbyte)*8<bits)bits=(b->storage-b->endbyte)*8;
+ if(d*8<_bits)_bits=d*8;
}
- ret=b->ptr[0]<<(24+b->endbit);
- if(bits>8){
- ret|=b->ptr[1]<<(16+b->endbit);
- if(bits>16){
- ret|=b->ptr[2]<<(8+b->endbit);
- if(bits>24){
- ret|=b->ptr[3]<<(b->endbit);
- if(bits>32&&b->endbit)
- ret|=b->ptr[4]>>(8-b->endbit);
+ ret=_b->ptr[0]<<24+_b->endbit;
+ if(_bits>8){
+ ret|=_b->ptr[1]<<16+_b->endbit;
+ if(_bits>16){
+ ret|=_b->ptr[2]<<8+_b->endbit;
+ if(_bits>24){
+ ret|=_b->ptr[3]<<_b->endbit;
+ if(_bits>32)ret|=_b->ptr[4]>>8-_b->endbit;
}
}
}
- *_ret=((ret&0xffffffff)>>(m>>1))>>((m+1)>>1);
+ *_ret=((ret&0xFFFFFFFF)>>(m>>1))>>(m+1>>1);
return 0;
}
-static void theorapackB_adv(oggpack_buffer *b,int bits){
- bits+=b->endbit;
- b->ptr+=bits/8;
- b->endbyte+=bits/8;
- b->endbit=bits&7;
+static void theorapackB_adv(oggpack_buffer *_b,int _bits){
+ _bits+=_b->endbit;
+ _b->ptr+=_bits>>3;
+ _b->endbyte+=_bits>>3;
+ _b->endbit=_bits&7;
}
#endif
Modified: trunk/theora/lib/dec/decapiwrapper.c
===================================================================
--- trunk/theora/lib/dec/decapiwrapper.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/decapiwrapper.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -29,10 +29,6 @@
static void theora_decode_clear(theora_state *_td){
if(_td->i!=NULL)theora_info_clear(_td->i);
-#ifdef _TH_DEBUG_
- fclose(debugout);
- debugout=NULL;
-#endif
memset(_td,0,sizeof(*_td));
}
@@ -92,7 +88,6 @@
th_api_info *apiinfo;
th_api_wrapper *api;
th_info info;
-
api=(th_api_wrapper *)_ci->codec_setup;
/*Allocate our own combined API wrapper/theora_info struct.
We put them both in one malloc'd block so that when the API wrapper is
@@ -130,11 +125,6 @@
th_api_wrapper *api;
th_info info;
int ret;
-
-#ifdef _TH_DEBUG_
- debugout = fopen("theoradec-debugout.txt","w");
-#endif
-
api=(th_api_wrapper *)_ci->codec_setup;
/*Allocate an API wrapper struct on demand, since it will not also include a
theora_info struct like the ones that are used in a theora_state struct.*/
@@ -167,16 +157,9 @@
th_api_wrapper *api;
ogg_int64_t gp;
int ret;
-
- if(!_td || !_td->i || !_td->i->codec_setup)return OC_FAULT;
+ if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
api=(th_api_wrapper *)_td->i->codec_setup;
- if(!api || !api->decode)return OC_FAULT;
ret=th_decode_packetin(api->decode,_op,&gp);
-
-#ifdef _TH_DEBUG_
- dframe++;
-#endif
-
if(ret<0)return OC_BADPACKET;
_td->granulepos=gp;
return 0;
@@ -186,10 +169,9 @@
th_api_wrapper *api;
th_ycbcr_buffer buf;
int ret;
-
- if(!_td || !_td->i || !_td->i->codec_setup)return OC_FAULT;
+ if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
api=(th_api_wrapper *)_td->i->codec_setup;
- if(!api || !api->decode)return OC_FAULT;
+ if(!api->decode)return OC_FAULT;
ret=th_decode_ycbcr_out(api->decode,buf);
if(ret>=0){
_yuv->y_width=buf[0].width;
@@ -202,6 +184,5 @@
_yuv->u=buf[1].data;
_yuv->v=buf[2].data;
}
-
return ret;
}
Modified: trunk/theora/lib/dec/decint.h
===================================================================
--- trunk/theora/lib/dec/decint.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/decint.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -47,45 +47,45 @@
struct th_dec_ctx{
/*Shared encoder/decoder state.*/
- oc_theora_state state;
+ oc_theora_state state;
/*Whether or not packets are ready to be emitted.
This takes on negative values while there are remaining header packets to
be emitted, reaches 0 when the codec is ready for input, and goes to 1
when a frame has been processed and a data packet is ready.*/
- int packet_state;
+ int packet_state;
/*Buffer in which to assemble packets.*/
- oggpack_buffer opb;
+ oggpack_buffer opb;
/*Huffman decode trees.*/
- oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
+ oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
/*The index of one past the last token in each plane for each coefficient.
The final entries are the total number of tokens for each coefficient.*/
- int ti0[3][64];
+ int ti0[3][64];
/*The index of one past the last extra bits entry in each plane for each
coefficient.
The final entries are the total number of extra bits entries for each
coefficient.*/
- int ebi0[3][64];
+ int ebi0[3][64];
/*The number of outstanding EOB runs at the start of each coefficient in each
plane.*/
- int eob_runs[3][64];
+ int eob_runs[3][64];
/*The DCT token lists.*/
- unsigned char **dct_tokens;
+ unsigned char **dct_tokens;
/*The extra bits associated with DCT tokens.*/
- ogg_uint16_t **extra_bits;
+ ogg_uint16_t **extra_bits;
/*The out-of-loop post-processing level.*/
- int pp_level;
+ int pp_level;
/*The DC scale used for out-of-loop deblocking.*/
- int pp_dc_scale[64];
+ int pp_dc_scale[64];
/*The sharpen modifier used for out-of-loop deringing.*/
- int pp_sharp_mod[64];
+ int pp_sharp_mod[64];
/*The DC quantization index of each block.*/
- unsigned char *dc_qis;
+ unsigned char *dc_qis;
/*The variance of each block.*/
- int *variances;
+ int *variances;
/*The storage for the post-processed frame buffer.*/
- unsigned char *pp_frame_data;
+ unsigned char *pp_frame_data;
/*Whether or not the post-processsed frame buffer has space for chroma.*/
- int pp_frame_has_chroma;
+ int pp_frame_has_chroma;
/*The buffer used for the post-processed frame.*/
th_ycbcr_buffer pp_frame_buf;
/*The striped decode callback function.*/
Modified: trunk/theora/lib/dec/decode.c
===================================================================
--- trunk/theora/lib/dec/decode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/decode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -12,7 +12,7 @@
function:
last mod: $Id$
-
+
********************************************************************/
#include <stdlib.h>
@@ -170,7 +170,7 @@
_dec->state.dequant_table_data[qti][pli];
}
oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
- &_setup->qinfo);
+ &_setup->qinfo);
for(qi=0;qi<64;qi++){
int qsum;
qsum=0;
@@ -210,38 +210,28 @@
static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
long val;
-
- TH_DEBUG("\n>>>> beginning frame %ld\n\n",dframe);
-
/*Check to make sure this is a data packet.*/
theorapackB_read1(&_dec->opb,&val);
- TH_DEBUG("frame type = %s, ",val==0?"video":"unknown");
if(val!=0)return TH_EBADPACKET;
/*Read in the frame type (I or P).*/
theorapackB_read1(&_dec->opb,&val);
_dec->state.frame_type=(int)val;
- TH_DEBUG("%s\n",val?"predicted":"key");
/*Read in the current qi.*/
theorapackB_read(&_dec->opb,6,&val);
_dec->state.qis[0]=(int)val;
- TH_DEBUG("frame quality = { %ld ",val);
theorapackB_read1(&_dec->opb,&val);
if(!val)_dec->state.nqis=1;
else{
theorapackB_read(&_dec->opb,6,&val);
_dec->state.qis[1]=(int)val;
- TH_DEBUG("%ld ",val);
theorapackB_read1(&_dec->opb,&val);
if(!val)_dec->state.nqis=2;
else{
theorapackB_read(&_dec->opb,6,&val);
- TH_DEBUG("%ld ",val);
_dec->state.qis[2]=(int)val;
_dec->state.nqis=3;
}
}
- TH_DEBUG("}\n");
-
if(_dec->state.frame_type==OC_INTRA_FRAME){
/*Keyframes have 3 unused configuration bits, holdovers from VP3 days.
Most of the other unused bits in the VP3 headers were eliminated.
@@ -305,7 +295,6 @@
int run_count;
theorapackB_read1(&_dec->opb,&val);
flag=(int)val;
-
sb=_dec->state.sbs;
sb_end=sb+_dec->state.nsbs;
run_count=npartial=0;
@@ -319,7 +308,6 @@
npartial+=flag;
sb++;
}
-
while(--run_count>0&&sb<sb_end);
if(full_run&&sb<sb_end){
theorapackB_read1(&_dec->opb,&val);
@@ -349,7 +337,6 @@
for(;sb->coded_partially;sb++);
theorapackB_read1(&_dec->opb,&val);
flag=(int)val;
-
while(sb<sb_end){
int full_run;
run_count=oc_sb_run_unpack(&_dec->opb);
@@ -428,71 +415,6 @@
}
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
-
-
-#ifdef _TH_DEBUG_
- // assuming 4:2:0 right now; THIS IS WRONG but only an issue if dumping debug info
- TH_DEBUG("predicted (partially coded frame)\n");
- TH_DEBUG("superblock coded flags = {");
- int x,y,i;
- int w = _dec->state.info.frame_width;
- int h = _dec->state.info.frame_height;
-
- i=0;
- for(y=0;y< (h+31)/32;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+31)/32;x++,i++)
- TH_DEBUG("%x", (_dec->state.sbs[i].coded_partially!=0)|
- (_dec->state.sbs[i].coded_fully));
- }
-
- TH_DEBUG("\n ");
- for(y=0;y< (h+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+63)/64;x++,i++)
- TH_DEBUG("%x", (_dec->state.sbs[i].coded_partially!=0)|
- (_dec->state.sbs[i].coded_fully));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (h+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+63)/64;x++,i++)
- TH_DEBUG("%x", (_dec->state.sbs[i].coded_partially!=0)|
- (_dec->state.sbs[i].coded_fully));
- }
- TH_DEBUG("\n}\n");
-
- if(i!=_dec->state.nsbs)
- TH_DEBUG("WARNING! superblock count, raster %d != flat %d\n",
- i,_dec->state.nsbs);
-
- TH_DEBUG("block coded flags = {");
-
- i=0;
- for(y=0;y< (h+7)/8;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+7)/8;x++,i++)
- TH_DEBUG("%x", (_dec->state.frags[i].coded!=0));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (h+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+15)/16;x++,i++)
- TH_DEBUG("%x", (_dec->state.frags[i].coded!=0));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (h+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+15)/16;x++,i++)
- TH_DEBUG("%x", (_dec->state.frags[i].coded!=0));
- }
- TH_DEBUG("\n}\n");
-
- if(i!=_dec->state.nfrags)
- TH_DEBUG("WARNING! block count, raster %d != flat %d\n",
- i,_dec->state.nfrags);
-#endif
-
}
@@ -526,57 +448,37 @@
int mode_scheme;
theorapackB_read(&_dec->opb,3,&val);
mode_scheme=(int)val;
- TH_DEBUG("mode encode scheme = %d\n",(int)val);
-
if(mode_scheme==0){
int mi;
/*Just in case, initialize the modes to something.
If the bitstream doesn't contain each index exactly once, it's likely
corrupt and the rest of the packet is garbage anyway, but this way we
won't crash, and we'll decode SOMETHING.*/
- TH_DEBUG("mode scheme list = { ");
/*LOOP VECTORIZES.*/
for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
for(mi=0;mi<OC_NMODES;mi++){
theorapackB_read(&_dec->opb,3,&val);
scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
- TH_DEBUG("%d ",(int)val);
}
- TH_DEBUG("}\n");
alphabet=scheme0_alphabet;
- }else
- alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
- if(mode_scheme==7)
- mode_unpack=oc_clc_mode_unpack;
- else
- mode_unpack=oc_vlc_mode_unpack;
+ }
+ else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
+ if(mode_scheme==7)mode_unpack=oc_clc_mode_unpack;
+ else mode_unpack=oc_vlc_mode_unpack;
mb=_dec->state.mbs;
mb_end=mb+_dec->state.nmbs;
-
- TH_DEBUG("mode list = { ");
- for(j=0;mb<mb_end;mb++){
+ for(;mb<mb_end;mb++){
if(mb->mode!=OC_MODE_INVALID){
int bi;
for(bi=0;bi<4;bi++){
- int fragi;
- fragi=mb->map[0][bi];
- if(fragi>=0&&_dec->state.frags[fragi].coded)break;
+ int fragi;
+ fragi=mb->map[0][bi];
+ if(fragi>=0&&_dec->state.frags[fragi].coded)break;
}
- if(bi<4){
- mb->mode=alphabet[(*mode_unpack)(&_dec->opb)];
-
-#ifdef _TH_DEBUG_
- if((j&0x1f)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%d ",mb->mode);
- j++;
-#endif
-
- }else
- mb->mode=OC_MODE_INTER_NOMV;
+ if(bi<4)mb->mode=alphabet[(*mode_unpack)(&_dec->opb)];
+ else mb->mode=OC_MODE_INTER_NOMV;
}
}
- TH_DEBUG("\n}\n");
}
@@ -629,23 +531,16 @@
const int *map_idxs;
long val;
int map_nidxs;
-#ifdef _TH_DEBUG_
- int j=0;
-#endif
oc_mv last_mv[2];
oc_mv cbmvs[4];
set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
theorapackB_read1(&_dec->opb,&val);
- TH_DEBUG("motion vector table = %d\n",(int)val);
mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
memset(last_mv,0,sizeof(last_mv));
mb=_dec->state.mbs;
mb_end=mb+_dec->state.nmbs;
-
- TH_DEBUG("motion vectors = {");
-
for(;mb<mb_end;mb++)if(mb->mode!=OC_MODE_INVALID){
oc_fragment *frag;
oc_mv mbmv;
@@ -667,98 +562,62 @@
if(ncoded<=0)continue;
mb_mode=mb->mode;
switch(mb_mode){
- case OC_MODE_INTER_MV_FOUR:
- {
- oc_mv lbmvs[4];
- int bi;
- /*Mark the tail of the list, so we don't accidentally go past it.*/
- coded[ncoded]=-1;
- for(bi=codedi=0;bi<4;bi++){
- if(coded[codedi]==bi){
- codedi++;
- frag=_dec->state.frags+mb->map[0][bi];
- frag->mbmode=mb_mode;
- frag->mv[0]=lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- frag->mv[1]=lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-
-#ifdef _TH_DEBUG_
- if((j&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",frag->mv[0],frag->mv[1]);
- j++;
-#endif
-
- }
- else lbmvs[bi][0]=lbmvs[bi][1]=0;
- }
- if(codedi>0){
- last_mv[1][0]=last_mv[0][0];
- last_mv[1][1]=last_mv[0][1];
- last_mv[0][0]=lbmvs[coded[codedi-1]][0];
- last_mv[0][1]=lbmvs[coded[codedi-1]][1];
- }
- if(codedi<ncoded){
- (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
- for(;codedi<ncoded;codedi++){
- mapi=coded[codedi];
- bi=mapi&3;
- frag=_dec->state.frags+mb->map[mapi>>2][bi];
- frag->mbmode=mb_mode;
- frag->mv[0]=cbmvs[bi][0];
- frag->mv[1]=cbmvs[bi][1];
- }
- }
- }
- break;
- case OC_MODE_INTER_MV:
- {
- last_mv[1][0]=last_mv[0][0];
- last_mv[1][1]=last_mv[0][1];
- mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-
-#ifdef _TH_DEBUG_
- if((j&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",mbmv[0],mbmv[1]);
- j++;
-#endif
-
- }
- break;
- case OC_MODE_INTER_MV_LAST:
- {
+ case OC_MODE_INTER_MV_FOUR:{
+ oc_mv lbmvs[4];
+ int bi;
+ /*Mark the tail of the list, so we don't accidentally go past it.*/
+ coded[ncoded]=-1;
+ for(bi=codedi=0;bi<4;bi++){
+ if(coded[codedi]==bi){
+ codedi++;
+ frag=_dec->state.frags+mb->map[0][bi];
+ frag->mbmode=mb_mode;
+ frag->mv[0]=lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ frag->mv[1]=lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ }
+ else lbmvs[bi][0]=lbmvs[bi][1]=0;
+ }
+ if(codedi>0){
+ last_mv[1][0]=last_mv[0][0];
+ last_mv[1][1]=last_mv[0][1];
+ last_mv[0][0]=lbmvs[coded[codedi-1]][0];
+ last_mv[0][1]=lbmvs[coded[codedi-1]][1];
+ }
+ if(codedi<ncoded){
+ (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+ for(;codedi<ncoded;codedi++){
+ mapi=coded[codedi];
+ bi=mapi&3;
+ frag=_dec->state.frags+mb->map[mapi>>2][bi];
+ frag->mbmode=mb_mode;
+ frag->mv[0]=cbmvs[bi][0];
+ frag->mv[1]=cbmvs[bi][1];
+ }
+ }
+ }break;
+ case OC_MODE_INTER_MV:{
+ last_mv[1][0]=last_mv[0][0];
+ last_mv[1][1]=last_mv[0][1];
+ mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ }break;
+ case OC_MODE_INTER_MV_LAST:{
mbmv[0]=last_mv[0][0];
mbmv[1]=last_mv[0][1];
- }
- break;
- case OC_MODE_INTER_MV_LAST2:
- {
+ }break;
+ case OC_MODE_INTER_MV_LAST2:{
mbmv[0]=last_mv[1][0];
mbmv[1]=last_mv[1][1];
last_mv[1][0]=last_mv[0][0];
last_mv[1][1]=last_mv[0][1];
last_mv[0][0]=mbmv[0];
last_mv[0][1]=mbmv[1];
- }
- break;
- case OC_MODE_GOLDEN_MV:
- {
+ }break;
+ case OC_MODE_GOLDEN_MV:{
mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-
-#ifdef _TH_DEBUG_
- if((j&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",mbmv[0],mbmv[1]);
- j++;
-#endif
-
- }
- break;
- default:
- mbmv[0]=mbmv[1]=0;
- break;
+ }break;
+ default:mbmv[0]=mbmv[1]=0;break;
}
/*4MV mode fills in the fragments itself.
For all other modes we can use this common code.*/
@@ -773,9 +632,6 @@
}
}
}
-
- TH_DEBUG("\n}\n");
-
}
static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
@@ -1362,7 +1218,7 @@
if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
_dec->variances=(int *)_ogg_realloc(_dec->variances,
_dec->state.fplanes[0].nfrags*sizeof(_dec->variances[0]));
- _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
+ _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
_dec->pp_frame_data,frame_sz*sizeof(_dec->pp_frame_data[0]));
_dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
_dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
@@ -1382,7 +1238,7 @@
c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
c_sz=c_w*c_h;
frame_sz+=c_sz<<1;
- _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
+ _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
_dec->pp_frame_data,frame_sz*sizeof(_dec->pp_frame_data[0]));
_dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
_dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
@@ -1503,9 +1359,6 @@
for(fragy=fragy0;fragy<fragy_end;fragy++){
for(fragx=0;fragx<fplane->nhfrags;fragx++,frag++){
if(!frag->coded)continue;
-#ifdef _TH_DEBUG_
- frag->quant[0] = frag->dc; /* stash un-predicted dc for debug output */
-#endif
pred_last[OC_FRAME_FOR_MODE[frag->mbmode]]=frag->dc+=
oc_frag_pred_dc(frag,fplane,fragx,fragy,pred_last);
ncoded_fragis++;
@@ -1597,40 +1450,6 @@
_pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
oc_state_frag_copy(&_dec->state,_pipe->uncoded_fragis[_pli],
_pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
-
-#ifdef _TH_DEBUG_
- {
- int i,j,k;
- int framei=_dec->state.ref_frame_idx[OC_FRAME_SELF];
- int ystride=_dec->state.ref_frame_bufs[framei][_pli].stride;
- int *fragi_end = _pipe->coded_fragis[_pli];
- int *fragi = fragi_end-_pipe->ncoded_fragis[_pli];
-
- for(;fragi<fragi_end;fragi++){
- oc_fragment *frag=_dec->state.frags+*fragi;
- unsigned char *src=frag->buffer[framei];
- for(i=0,j=0;j<8;j++){
- for(k=0;k<8;k++,i++)
- frag->recon[i] = src[k];
- src+=ystride;
- }
- }
-
- fragi = _pipe->uncoded_fragis[_pli];
- fragi_end = fragi+_pipe->nuncoded_fragis[_pli];
-
- for(;fragi<fragi_end;fragi++){
- oc_fragment *frag=_dec->state.frags+*fragi;
- unsigned char *src=frag->buffer[framei];
- for(i=0,j=0;j<8;j++){
- for(k=0;k<8;k++,i++)
- frag->recon[i] = src[k];
- src+=ystride;
- }
- }
- }
-#endif
-
}
/*Filter a horizontal block edge.*/
@@ -2039,7 +1858,6 @@
/*A completely empty packet indicates a dropped frame and is treated exactly
like an inter frame with no coded blocks.
Only proceed if we have a non-empty packet.*/
-
if(_op->bytes!=0){
oc_dec_pipeline_state pipe;
th_ycbcr_buffer stripe_buf;
@@ -2093,7 +1911,6 @@
}
oc_dec_block_qis_unpack(_dec);
oc_dec_residual_tokens_unpack(_dec);
-
/*Update granule position.
This must be done before the striped decode callbacks so that the
application knows what to do with the frame data.*/
@@ -2203,91 +2020,6 @@
}
notstart=1;
}
-
-#ifdef _TH_DEBUG_
- {
- int x,y,i,j,k,xn,yn;
- int plane;
- int buf;
-
- /* dump fragment DCT components */
- for(plane=0;plane<3;plane++){
- char *plstr;
- int offset;
- switch(plane){
- case 0:
- plstr="Y";
- xn = _dec->state.info.frame_width>>3;
- yn = _dec->state.info.frame_height>>3;
- offset = 0;
- break;
- case 1:
- plstr="U";
- xn = _dec->state.info.frame_width>>4;
- yn = _dec->state.info.frame_height>>4;
- offset = xn*yn*4;
- break;
- case 2:
- plstr="V";
- xn = _dec->state.info.frame_width>>4;
- yn = _dec->state.info.frame_height>>4;
- offset = xn*yn*5;
- break;
- }
- for(y=0;y<yn;y++){
- for(x=0;x<xn;x++,i++){
-
- for(buf=0;buf<4;buf++){
- int *ptr;
- char *bufn;
- int codecheck=0;
-
- i = offset + y*xn + x;
-
- switch(buf){
- case 0:
- codecheck=1;
- bufn = "coded";
- ptr = _dec->state.frags[i].quant;
- break;
- case 1:
- codecheck=1;
- bufn = "coeff";
- ptr = _dec->state.frags[i].freq;
- break;
- case 2:
- codecheck=1;
- bufn = "idct";
- ptr = _dec->state.frags[i].time;
- break;
- case 3:
- bufn = "recon";
- ptr = _dec->state.frags[i].loop;
- break;
- }
-
-
- TH_DEBUG("%s %s [%d][%d] = {",bufn,plstr,x,y);
- if(codecheck && !_dec->state.frags[i].coded)
- TH_DEBUG(" not coded }\n");
- else{
- int l=0;
- for(j=0;j<8;j++){
- TH_DEBUG("\n ");
- for(k=0;k<8;k++,l++){
- TH_DEBUG("%d ",ptr[l]);
- }
- }
- TH_DEBUG(" }\n");
- }
- }
- TH_DEBUG("\n");
- }
- }
- }
- }
-#endif
-
/*Finish filling in the reference frame borders.*/
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
/*Update the reference frame indices.*/
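
For reference (not part of the patch): oc_dec_frame_header_unpack() above reads between one and three quantizer indices, each a 6-bit qi followed by a single continuation bit that says whether another follows. A minimal sketch of just that layout (read_qi_list is an illustrative name), using the theorapackB calls from bitwise.h:

/*Reads the 1-3 entry qi list from a frame header; returns the count.*/
static int read_qi_list(oggpack_buffer *_opb,int _qis[3]){
  long val;
  int  nqis;
  nqis=0;
  for(;;){
    theorapackB_read(_opb,6,&val);
    _qis[nqis++]=(int)val;
    if(nqis>=3)break;
    theorapackB_read1(_opb,&val);
    if(!val)break;
  }
  return nqis;
}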
Modified: trunk/theora/lib/dec/dequant.c
===================================================================
--- trunk/theora/lib/dec/dequant.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/dequant.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -127,65 +127,6 @@
}
while(qri-->0);
}
-
-#ifdef _TH_DEBUG_
- /* dump the tables */
- {
- int i, j, k, l, m;
- TH_DEBUG("loop filter limits = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",_qinfo->loop_filter_limits[i]);
- }
- TH_DEBUG("\n}\n\n");
-
- TH_DEBUG("ac scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",_qinfo->ac_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
-
- TH_DEBUG("dc scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",_qinfo->dc_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
-
- for(k=0;k<2;k++)
- for(l=0;l<3;l++){
- char *name[2][3]={
- {"intra Y bases","intra U bases", "intra V bases"},
- {"inter Y bases","inter U bases", "inter V bases"}
- };
-
- th_quant_ranges *r = &_qinfo->qi_ranges[k][l];
- TH_DEBUG("%s = {\n",name[k][l]);
- TH_DEBUG(" ranges = %d\n",r->nranges);
- TH_DEBUG(" intervals = { ");
- for(i=0;i<r->nranges;i++)
- TH_DEBUG("%3d ",r->sizes[i]);
- TH_DEBUG("}\n");
- TH_DEBUG("\n matricies = { ");
- for(m=0;m<r->nranges+1;m++){
- TH_DEBUG("\n { ");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<8;i++,j++)
- TH_DEBUG("%3d ",r->base_matrices[m][i]);
- }
- TH_DEBUG("\n }");
- }
- TH_DEBUG("\n }\n");
- }
- }
-
-#endif
-
_ogg_free(base_mats);
return 0;
}
@@ -227,4 +168,3 @@
_ogg_free((void *)_qinfo->qi_ranges[qti][pli].base_matrices);
}
}
-
Modified: trunk/theora/lib/dec/idct.c
===================================================================
--- trunk/theora/lib/dec/idct.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/idct.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -169,7 +169,6 @@
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
}
-
/*Performs an inverse 8 point Type-II DCT transform.
The output is scaled by a factor of 2 relative to the orthonormal version of
the transform.
@@ -204,7 +203,6 @@
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
}
-
/*Performs an inverse 8 point Type-II DCT transform.
The output is scaled by a factor of 2 relative to the orthonormal version of
the transform.
Modified: trunk/theora/lib/dec/quant.c
===================================================================
--- trunk/theora/lib/dec/quant.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/quant.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -39,114 +39,84 @@
qi values change between frames (this is what VP3 did).*/
void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
int _pp_dc_scale[64],const th_quant_info *_qinfo){
- int qti; /* coding mode: intra or inter */
- int pli; /* Y U V */
+ /*coding mode: intra or inter.*/
+ int qti;
+ /*Y', C_b, C_r*/
+ int pli;
for(qti=0;qti<2;qti++){
for(pli=0;pli<3;pli++){
oc_quant_tables stage;
-
- int qi; /* quality index */
- int qri; /* range iterator */
-
+ /*Quality index.*/
+ int qi;
+ /*Range iterator.*/
+ int qri;
for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges; qri++){
- th_quant_base base;
-
- ogg_uint32_t q;
- int qi_start;
- int qi_end;
- int ci;
- memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
- sizeof(base));
-
- qi_start=qi;
- if(qri==_qinfo->qi_ranges[qti][pli].nranges)
- qi_end=qi+1;
- else
- qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
-
- /* Iterate over quality indicies in this range */
- for(;;){
-
- /*In the original VP3.2 code, the rounding offset and the size of the
- dead zone around 0 were controlled by a "sharpness" parameter.
- The size of our dead zone is now controlled by the per-coefficient
- quality thresholds returned by our HVS module.
- We round down from a more accurate value when the quality of the
- reconstruction does not fall below our threshold and it saves bits.
- Hence, all of that VP3.2 code is gone from here, and the remaining
- floating point code has been implemented as equivalent integer code
- with exact precision.*/
-
- /* for postprocess, not dequant */
- if(_pp_dc_scale!=NULL)
- _pp_dc_scale[qi]=(int)((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/160);
-
- /*Scale DC the coefficient from the proper table.*/
- q=((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/100)<<2;
- q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- stage[qi][0]=(ogg_uint16_t)q;
-
- /*Now scale AC coefficients from the proper table.*/
- for(ci=1;ci<64;ci++){
- q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
- q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- stage[qi][ci]=(ogg_uint16_t)q;
- }
-
- if(++qi>=qi_end)break;
-
- /*Interpolate the next base matrix.*/
- for(ci=0;ci<64;ci++){
- base[ci]=(unsigned char)
- ((2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
- (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
- +_qinfo->qi_ranges[qti][pli].sizes[qri])/
- (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
- }
- }
+ th_quant_base base;
+ ogg_uint32_t q;
+ int qi_start;
+ int qi_end;
+ int ci;
+ memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
+ sizeof(base));
+ qi_start=qi;
+ if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
+ else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
+ /*Iterate over quality indicies in this range.*/
+ for(;;){
+ ogg_uint32_t qfac;
+ /*In the original VP3.2 code, the rounding offset and the size of the
+ dead zone around 0 were controlled by a "sharpness" parameter.
+ The size of our dead zone is now controlled by the per-coefficient
+ quality thresholds returned by our HVS module.
+ We round down from a more accurate value when the quality of the
+ reconstruction does not fall below our threshold and it saves bits.
+ Hence, all of that VP3.2 code is gone from here, and the remaining
+ floating point code has been implemented as equivalent integer code
+ with exact precision.*/
+ qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
+ /*For postprocessing, not dequantization.*/
+ if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
+ /*Scale DC the coefficient from the proper table.*/
+ q=(qfac/100)<<2;
+ q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ stage[qi][0]=(ogg_uint16_t)q;
+ /*Now scale AC coefficients from the proper table.*/
+ for(ci=1;ci<64;ci++){
+ q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
+ q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ stage[qi][ci]=(ogg_uint16_t)q;
+ }
+ if(++qi>=qi_end)break;
+ /*Interpolate the next base matrix.*/
+ for(ci=0;ci<64;ci++){
+ base[ci]=(unsigned char)(
+ (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+ (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+ +_qinfo->qi_ranges[qti][pli].sizes[qri])/
+ (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
+ }
+ }
}
-
- /* Staging matricies complete; commit to memory only if this
- isn't a duplicate of a preceeding plane. This simple check
- helps us improve cache coherency later.*/
+ /*Staging matrices complete; commit to memory only if this isn't a
+ duplicate of a preceeding plane.
+ This simple check helps us improve cache coherency later.*/
{
- int dupe = 0;
- int i,j;
- for(i=0;i<=qti;i++){
- for(j=0;j<(i<qti?3:pli);j++){
- if(!memcmp(stage,_dequant[i][j],sizeof(stage))){
- dupe = 1;
- break;
- }
- }
- if(dupe)break;
- }
- if(dupe){
- _dequant[qti][pli]=_dequant[i][j];
- }else{
- memcpy(_dequant[qti][pli],stage,sizeof(stage));
- }
+ int dupe;
+ int qtj;
+ int plj;
+ dupe=0;
+ for(qtj=0;qtj<=qti;qtj++){
+ for(plj=0;plj<(qtj<qti?3:pli);plj++){
+ if(!memcmp(stage,_dequant[qtj][plj],sizeof(stage))){
+ dupe=1;
+ break;
+ }
+ }
+ if(dupe)break;
+ }
+ if(dupe)_dequant[qti][pli]=_dequant[qtj][plj];
+ else memcpy(_dequant[qti][pli],stage,sizeof(stage));
}
}
}
-
-#ifdef _TH_DEBUG_
- int i, j, k, l;
- /* dump the calculated quantizer tables */
- for(i=0;i<2;i++){
- for(j=0;j<3;j++){
- for(k=0;k<64;k++){
- TH_DEBUG("quantizer table [%s][%s][Q%d] = {",
- (i==0?"intra":"inter"),(j==0?"Y":(j==1?"U":"V")),k);
- for(l=0;l<64;l++){
- if((l&7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%4d ",_dequant[i][j][k][l]);
- }
- TH_DEBUG("}\n");
- }
- }
- }
-#endif
-
}
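
Between the base matrices that bracket each qi range, the loop above fills in the intermediate matrices with a rounded linear interpolation. A minimal sketch (not part of the patch; interp_base and its parameters are illustrative names) of the per-coefficient formula, where _size is the range length qi_end-qi_start, and _lo/_hi are the bracketing base-matrix entries:

/*Rounded linear blend of the two base matrices bounding the range.*/
static unsigned char interp_base(int _qi,int _qi_start,int _qi_end,
 int _size,unsigned char _lo,unsigned char _hi){
  return (unsigned char)(
   (2*((_qi_end-_qi)*_lo+(_qi-_qi_start)*_hi)+_size)/(2*_size));
}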
Modified: trunk/theora/lib/dec/quant.h
===================================================================
--- trunk/theora/lib/dec/quant.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/quant.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -40,7 +40,6 @@
void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
- int _pp_dc_scale[64],
- const th_quant_info *_qinfo);
+ int _pp_dc_scale[64],const th_quant_info *_qinfo);
#endif
Modified: trunk/theora/lib/dec/state.c
===================================================================
--- trunk/theora/lib/dec/state.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/state.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -831,37 +831,11 @@
ogg_int16_t p;
/*Why is the iquant product rounded in this case and no others?
Who knows.*/
-
p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
/*LOOP VECTORIZES.*/
for(ci=0;ci<64;ci++)res_buf[ci]=p;
-
-#ifdef _TH_DEBUG_
- {
- int i;
- _frag->freq[0] = _frag->dc*_dc_iquant;
- _frag->time[0] = p;
- for(i=1;i<64;i++){
- _frag->quant[i] = 0;
- _frag->freq[i] = 0;
- _frag->time[i] = p;
- }
- }
-#endif
-
}
else{
-
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=1;i<_ncoefs;i++)
- _frag->quant[i] = _dct_coeffs[i];
- for(;i<64;i++)
- _frag->quant[i] = 0;
- }
-#endif
-
/*First, dequantize the coefficients.*/
dct_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
for(zzi=1;zzi<_ncoefs;zzi++){
@@ -869,21 +843,6 @@
ci=OC_FZIG_ZAG[zzi];
dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
}
-
-#ifdef _TH_DEBUG_
- for(;zzi<64;zzi++){
- int ci;
- ci=OC_FZIG_ZAG[zzi];
- dct_buf[ci]=0;
- }
-
- {
- int i;
- for(i=0;i<64;i++)
- _frag->freq[i] = dct_buf[i];
- }
-#endif
-
/*Then, fill in the remainder of the coefficients with 0's, and perform
the iDCT.*/
if(_last_zzi<10){
@@ -894,15 +853,6 @@
for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
oc_idct8x8_c(res_buf,dct_buf);
}
-
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- _frag->time[i] = res_buf[i];
- }
-#endif
-
}
/*Fill in the target buffer.*/
dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
@@ -1038,7 +988,7 @@
}
void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+ int _refi,int _pli,int _fragy0,int _fragy_end){
th_img_plane *iplane;
oc_fragment_plane *fplane;
oc_fragment *frag_top;
@@ -1050,7 +1000,6 @@
_bv+=127;
iplane=_state->ref_frame_bufs[_refi]+_pli;
fplane=_state->fplanes+_pli;
-
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
@@ -1079,46 +1028,6 @@
iplane->stride,_bv);
}
}
-
-
-#ifdef _TH_DEBUG_
- {
- int i,j,k,l;
- unsigned char *src;
-
- for(l=0;l<5;l++){
- oc_fragment *f;
- switch(l){
- case 0:
- f = frag;
- break;
- case 1: /* left */
- if(frag == frag0)continue;
- f = frag-1;
- break;
- case 2: /* bottom (top once flipped) */
- if(frag0 == frag_top)continue;
- f = frag - fplane->nhfrags;
- break;
- case 3: /* right */
- if(frag+1 >= frag_end) continue;
- f = frag + 1;
- break;
- case 4: /* top (bottom once flipped) */
- if(frag+fplane->nhfrags >= frag_bot)continue;
- f = frag + fplane->nhfrags;
- break;
- }
-
- src = f->buffer[_refi];
- for(i=0,j=0;j<8;j++){
- for(k=0;k<8;k++,i++)
- f->loop[i] = src[k];
- src+=iplane->stride;
- }
- }
- }
-#endif
frag++;
}
frag0+=fplane->nhfrags;
Modified: trunk/theora/lib/dec/x86/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxfrag.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86/mmxfrag.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -21,7 +21,7 @@
Note: Loops are unrolled for best performance.
The iteration each instruction belongs to is marked in the comments as #i.*/
#include "x86int.h"
-#include <stdlib.h>
+#include <stddef.h>
#if defined(USE_ASM)
Modified: trunk/theora/lib/dec/x86/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxstate.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86/mmxstate.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -19,7 +19,7 @@
Originally written by Rudolf Marek.*/
#include "x86int.h"
#include "../../internal.h"
-#include <stdlib.h>
+#include <stddef.h>
#if defined(USE_ASM)
Modified: trunk/theora/lib/dec/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxidct.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/mmxidct.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -11,7 +11,7 @@
********************************************************************
function:
- last mod: $Id:
+ last mod: $Id:
********************************************************************/
@@ -30,7 +30,7 @@
#include "x86int.h"
/*A table of constants used by the MMX routines.*/
-static const __declspec(align(16)) ogg_uint16_t
+static const __declspec(align(16)) ogg_uint16_t
OC_IDCT_CONSTS[(7+1)*4]={
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
@@ -53,475 +53,475 @@
void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
_asm {
mov edx, [_y]
- mov eax, offset OC_IDCT_CONSTS
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 18H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 38H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 28H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 08H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 20H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 10H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 08H], mm4
- punpckhwd mm0, mm7
- movq [edx + 18H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx]
- punpckldq mm1, mm0
- movq mm5, [edx + 10H]
- movq mm0, mm4
- movq [edx + 38H], mm6
- punpcklwd mm0, mm5
- movq [edx + 28H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx], mm0
- punpckhwd mm5, mm3
- movq [edx + 10H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 30H], mm4
- movq [edx + 20H], mm2
- movq mm2, [edx + 70H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 50H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 60H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 50H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 60H], mm6
- movq mm2, mm0
- movq mm6, [edx + 40H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 50H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 60H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 50H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx + 40H], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 48H], mm4
- punpckhwd mm0, mm7
- movq [edx + 58H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx + 40H]
- punpckldq mm1, mm0
- movq mm5, [edx + 50H]
- movq mm0, mm4
- movq [edx + 78H], mm6
- punpcklwd mm0, mm5
- movq [edx + 68H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx + 40H], mm0
- punpckhwd mm5, mm3
- movq [edx + 50H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 70H], mm4
- movq [edx + 60H], mm2
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 50H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 70H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 60H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 40H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 20H]
- paddw mm7, mm7
- movq [edx + 20H], mm2
- paddw mm7, mm4
- movq [edx + 10H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 40H], mm4
- psraw mm5, 4
- movq [edx + 30H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 60H], mm6
- psraw mm0, 4
- movq [edx + 50H], mm5
- movq [edx + 70H], mm7
- movq [edx], mm0
- movq mm2, [edx + 38H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 18H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 28H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 18H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 28H], mm6
- movq mm2, mm0
- movq mm6, [edx + 08H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 18H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 28H]
- paddw mm7, mm7
- movq [edx + 28H], mm2
- paddw mm7, mm4
- movq [edx + 18H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 48H], mm4
- psraw mm5, 4
- movq [edx + 38H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 68H], mm6
- psraw mm0, 4
- movq [edx + 58H], mm5
- movq [edx + 78H], mm7
- movq [edx + 08H], mm0
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
/* emms */
}
}
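For reference, the row passes above each finish by adding the constant at [eax + 38H] of OC_IDCT_CONSTS and arithmetic-shifting every result right by 4 (the paddw/psraw pairs). Assuming that constant is the usual rounding bias of 8 in each 16-bit lane (it lives outside this hunk), the scalar equivalent of the final descale is simply the following sketch; the function name is illustrative only:

/*Scalar sketch of the final descale step (assumes OC_IDCT_CONSTS+0x38
  holds the value 8 in every word lane).*/
static ogg_int16_t oc_idct_descale(ogg_int32_t _t){
  return (ogg_int16_t)((_t+8)>>4);
}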
@@ -530,477 +530,477 @@
void oc_idct8x8_mmx(ogg_int16_t _y[64]){
_asm {
mov edx, [_y]
- mov eax, offset OC_IDCT_CONSTS
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 18H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 38H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 28H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 08H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 20H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 10H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 08H], mm4
- punpckhwd mm0, mm7
- movq [edx + 18H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx]
- punpckldq mm1, mm0
- movq mm5, [edx + 10H]
- movq mm0, mm4
- movq [edx + 38H], mm6
- punpcklwd mm0, mm5
- movq [edx + 28H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx], mm0
- punpckhwd mm5, mm3
- movq [edx + 10H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 30H], mm4
- movq [edx + 20H], mm2
- movq mm2, [edx + 70H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 50H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 60H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 50H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 60H], mm6
- movq mm2, mm0
- movq mm6, [edx + 40H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 50H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 60H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 50H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx + 40H], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 48H], mm4
- punpckhwd mm0, mm7
- movq [edx + 58H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx + 40H]
- punpckldq mm1, mm0
- movq mm5, [edx + 50H]
- movq mm0, mm4
- movq [edx + 78H], mm6
- punpcklwd mm0, mm5
- movq [edx + 68H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx + 40H], mm0
- punpckhwd mm5, mm3
- movq [edx + 50H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 70H], mm4
- movq [edx + 60H], mm2
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 50H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 70H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 60H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 40H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 20H]
- paddw mm7, mm7
- movq [edx + 20H], mm2
- paddw mm7, mm4
- movq [edx + 10H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 40H], mm4
- psraw mm5, 4
- movq [edx + 30H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 60H], mm6
- psraw mm0, 4
- movq [edx + 50H], mm5
- movq [edx + 70H], mm7
- movq [edx], mm0
- movq mm2, [edx + 38H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 18H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 28H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 18H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 28H], mm6
- movq mm2, mm0
- movq mm6, [edx + 08H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 18H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 28H]
- paddw mm7, mm7
- movq [edx + 28H], mm2
- paddw mm7, mm4
- movq [edx + 18H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 48H], mm4
- psraw mm5, 4
- movq [edx + 38H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 68H], mm6
- psraw mm0, 4
- movq [edx + 58H], mm5
- movq [edx + 78H], mm7
- movq [edx + 08H], mm0
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
/* emms */
}
}
-#endif
\ No newline at end of file
+#endif
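A note on the arithmetic used throughout the transform above: pmulhw keeps the high 16 bits of a signed 16x16 product, i.e. (a*b)>>16, so a cosine constant c in [0.5,1) is typically stored as round(c*65536)-65536 to fit in a signed word, and the input is added back immediately after the multiply (the paddw that follows many of the pmulhw instructions). A scalar sketch with illustrative names; the real constant layout is in OC_IDCT_CONSTS, outside this hunk:

/*High 16 bits of a signed 16x16 product, as pmulhw computes it.*/
static ogg_int16_t oc_pmulhw1(ogg_int16_t _a,ogg_int16_t _b){
  return (ogg_int16_t)(((ogg_int32_t)_a*_b)>>16);
}
/*Multiply by a constant c>=0.5 stored as c*65536-65536; the +_x matches
  the paddw add-back in the assembly above.*/
static ogg_int16_t oc_mul_c(ogg_int16_t _x,ogg_int16_t _c_minus_1){
  return (ogg_int16_t)(oc_pmulhw1(_x,_c_minus_1)+_x);
}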
Modified: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxloopfilter.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/mmxloopfilter.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -11,7 +11,7 @@
********************************************************************
function:
- last mod: $Id:
+ last mod: $Id:
********************************************************************/
@@ -21,7 +21,7 @@
Originally written by Rudolf Marek, based on code from On2's VP3.
Converted to Visual Studio inline assembly by Nils Pipenbrinck.
- Note: I can't test these since my example files never get into the
+ Note: I can't test these since my example files never get into the
loop filters, but the code has been converted semi-automatic from
the GCC sources, so it ought to work.
---------------------------------------------------------------------*/
@@ -33,7 +33,7 @@
-static void loop_filter_v(unsigned char *_pix,int _ystride,
+static void loop_filter_v(unsigned char *_pix,int _ystride,
const ogg_int16_t *_ll){
_asm {
mov eax, [_pix]
@@ -41,134 +41,134 @@
mov ebx, [_ll]
/* _pix -= ystride */
- sub eax, edx
+ sub eax, edx
/* mm0=0 */
- pxor mm0, mm0
+ pxor mm0, mm0
/* _pix -= ystride */
- sub eax, edx
+ sub eax, edx
/* esi=_ystride*3 */
- lea esi, [edx + edx*2]
+ lea esi, [edx + edx*2]
- /* mm7=_pix[0...8]*/
- movq mm7, [eax]
- /* mm4=_pix[0...8+_ystride*3]*/
- movq mm4, [eax + esi]
- /* mm6=_pix[0...8]*/
- movq mm6, mm7
- /* Expand unsigned _pix[0...3] to 16 bits.*/
- punpcklbw mm6, mm0
- movq mm5, mm4
+ /* mm7=_pix[0...8]*/
+ movq mm7, [eax]
+ /* mm4=_pix[0...8+_ystride*3]*/
+ movq mm4, [eax + esi]
+ /* mm6=_pix[0...8]*/
+ movq mm6, mm7
+ /* Expand unsigned _pix[0...3] to 16 bits.*/
+ punpcklbw mm6, mm0
+ movq mm5, mm4
/* Expand unsigned _pix[4...7] to 16 bits.*/
- punpckhbw mm7, mm0
- punpcklbw mm4, mm0
- /* Expand other arrays too.*/
- punpckhbw mm5, mm0
- /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
- psubw mm6, mm4
- psubw mm7, mm5
- /*mm5=mm4=_pix[0...7+_ystride]*/
- movq mm4, [eax + edx]
- /*mm1=mm3=mm2=_pix[0..7]+_ystride*2]*/
- movq mm2, [eax + edx*2]
- movq mm5, mm4
- movq mm3, mm2
- movq mm1, mm2
- /*Expand these arrays.*/
- punpckhbw mm5, mm0
- punpcklbw mm4, mm0
- punpckhbw mm3, mm0
- punpcklbw mm2, mm0
- pcmpeqw mm0, mm0
- /*mm0=3 3 3 3
+ punpckhbw mm7, mm0
+ punpcklbw mm4, mm0
+ /* Expand other arrays too.*/
+ punpckhbw mm5, mm0
+ /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+ psubw mm6, mm4
+ psubw mm7, mm5
+ /*mm5=mm4=_pix[0...7+_ystride]*/
+ movq mm4, [eax + edx]
+ /*mm1=mm3=mm2=_pix[0..7+_ystride*2]*/
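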
+ movq mm2, [eax + edx*2]
+ movq mm5, mm4
+ movq mm3, mm2
+ movq mm1, mm2
+ /*Expand these arrays.*/
+ punpckhbw mm5, mm0
+ punpcklbw mm4, mm0
+ punpckhbw mm3, mm0
+ punpcklbw mm2, mm0
+ pcmpeqw mm0, mm0
+ /*mm0=3 3 3 3
mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
- psubw mm3, mm5
- psrlw mm0, 14
- psubw mm2, mm4
+ psubw mm3, mm5
+ psrlw mm0, 14
+ psubw mm2, mm4
/*Scale by 3.*/
- pmullw mm3, mm0
- pmullw mm2, mm0
+ pmullw mm3, mm0
+ pmullw mm2, mm0
/*mm0=4 4 4 4
f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
- psrlw mm0, 1
- paddw mm3, mm7
- psllw mm0, 2
+ psrlw mm0, 1
+ paddw mm3, mm7
+ psllw mm0, 2
paddw mm2, mm6
/*Add 4.*/
- paddw mm3, mm0
- paddw mm2, mm0
- /*"Divide" by 8.*/
- psraw mm3, 3
- psraw mm2, 3
- /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/
+ paddw mm3, mm0
+ paddw mm2, mm0
+ /*"Divide" by 8.*/
+ psraw mm3, 3
+ psraw mm2, 3
+ /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
/*Free up mm5.*/
- packuswb mm4, mm5
+ packuswb mm4, mm5
/*mm0=L L L L*/
- movq mm0, [ebx]
+ movq mm0, [ebx]
/*if(R_i<-2L||R_i>2L)R_i=0:*/
- movq mm5, mm2
- pxor mm6, mm6
- movq mm7, mm0
- psubw mm6, mm0
- psllw mm7, 1
- psllw mm6, 1
+ movq mm5, mm2
+ pxor mm6, mm6
+ movq mm7, mm0
+ psubw mm6, mm0
+ psllw mm7, 1
+ psllw mm6, 1
/*mm2==R_3 R_2 R_1 R_0*/
/*mm5==R_3 R_2 R_1 R_0*/
/*mm6==-2L -2L -2L -2L*/
/*mm7==2L 2L 2L 2L*/
- pcmpgtw mm7, mm2
- pcmpgtw mm5, mm6
- pand mm2, mm7
- movq mm7, mm0
- pand mm2, mm5
- psllw mm7, 1
- movq mm5, mm3
+ pcmpgtw mm7, mm2
+ pcmpgtw mm5, mm6
+ pand mm2, mm7
+ movq mm7, mm0
+ pand mm2, mm5
+ psllw mm7, 1
+ movq mm5, mm3
/*mm3==R_7 R_6 R_5 R_4*/
/*mm5==R_7 R_6 R_5 R_4*/
/*mm6==-2L -2L -2L -2L*/
/*mm7==2L 2L 2L 2L*/
- pcmpgtw mm7, mm3
- pcmpgtw mm5, mm6
- pand mm3, mm7
- movq mm7, mm0
- pand mm3, mm5
+ pcmpgtw mm7, mm3
+ pcmpgtw mm5, mm6
+ pand mm3, mm7
+ movq mm7, mm0
+ pand mm3, mm5
/*if(R_i<-L)R_i'=R_i+2L;
if(R_i>L)R_i'=R_i-2L;
if(R_i<-L||R_i>L)R_i=-R_i':*/
- psraw mm6, 1
- movq mm5, mm2
- psllw mm7, 1
+ psraw mm6, 1
+ movq mm5, mm2
+ psllw mm7, 1
/*mm2==R_3 R_2 R_1 R_0*/
/*mm5==R_3 R_2 R_1 R_0*/
/*mm6==-L -L -L -L*/
/*mm0==L L L L*/
/*mm5=R_i>L?FF:00*/
- pcmpgtw mm5, mm0
+ pcmpgtw mm5, mm0
/*mm6=-L>R_i?FF:00*/
- pcmpgtw mm6, mm2
+ pcmpgtw mm6, mm2
/*mm7=R_i>L?2L:0*/
- pand mm7, mm5
+ pand mm7, mm5
/*mm2=R_i>L?R_i-2L:R_i*/
- psubw mm2, mm7
- movq mm7, mm0
+ psubw mm2, mm7
+ movq mm7, mm0
/*mm5=-L>R_i||R_i>L*/
- por mm5, mm6
- psllw mm7, 1
+ por mm5, mm6
+ psllw mm7, 1
/*mm7=-L>R_i?2L:0*/
- pand mm7, mm6
- pxor mm6, mm6
+ pand mm7, mm6
+ pxor mm6, mm6
/*mm2=-L>R_i?R_i+2L:R_i*/
- paddw mm2, mm7
- psubw mm6, mm0
+ paddw mm2, mm7
+ psubw mm6, mm0
/*mm5=-L>R_i||R_i>L?-R_i':0*/
- pand mm5, mm2
- movq mm7, mm0
+ pand mm5, mm2
+ movq mm7, mm0
/*mm2=-L>R_i||R_i>L?0:R_i*/
- psubw mm2, mm5
- psllw mm7, 1
+ psubw mm2, mm5
+ psllw mm7, 1
/*mm2=-L>R_i||R_i>L?-R_i':R_i*/
- psubw mm2, mm5
- movq mm5, mm3
+ psubw mm2, mm5
+ movq mm5, mm3
/*mm3==R_7 R_6 R_5 R_4*/
/*mm5==R_7 R_6 R_5 R_4*/
/*mm6==-L -L -L -L*/
@@ -176,44 +176,44 @@
/*mm6=-L>R_i?FF:00*/
pcmpgtw mm6, mm3
/*mm5=R_i>L?FF:00*/
- pcmpgtw mm5, mm0
+ pcmpgtw mm5, mm0
/*mm7=R_i>L?2L:0*/
- pand mm7, mm5
+ pand mm7, mm5
/*mm2=R_i>L?R_i-2L:R_i*/
- psubw mm3, mm7
- psllw mm0, 1
+ psubw mm3, mm7
+ psllw mm0, 1
/*mm5=-L>R_i||R_i>L*/
- por mm5, mm6
+ por mm5, mm6
/*mm0=-L>R_i?2L:0*/
- pand mm0, mm6
+ pand mm0, mm6
/*mm3=-L>R_i?R_i+2L:R_i*/
- paddw mm3, mm0
+ paddw mm3, mm0
/*mm5=-L>R_i||R_i>L?-R_i':0*/
- pand mm5, mm3
+ pand mm5, mm3
/*mm2=-L>R_i||R_i>L?0:R_i*/
- psubw mm3, mm5
+ psubw mm3, mm5
/*mm3=-L>R_i||R_i>L?-R_i':R_i*/
- psubw mm3, mm5
+ psubw mm3, mm5
/*Unfortunately, there's no unsigned byte+signed byte with unsigned
saturation op code, so we have to promote things back 16 bits.*/
- pxor mm0, mm0
- movq mm5, mm4
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- movq mm6, mm1
- punpcklbw mm1, mm0
- punpckhbw mm6, mm0
+ pxor mm0, mm0
+ movq mm5, mm4
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ movq mm6, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm6, mm0
/*_pix[0...8+_ystride]+=R_i*/
- paddw mm4, mm2
- paddw mm5, mm3
+ paddw mm4, mm2
+ paddw mm5, mm3
/*_pix[0...8+_ystride*2]-=R_i*/
- psubw mm1, mm2
- psubw mm6, mm3
- packuswb mm4, mm5
- packuswb mm1, mm6
+ psubw mm1, mm2
+ psubw mm6, mm3
+ packuswb mm4, mm5
+ packuswb mm1, mm6
/*Write it back out.*/
- movq [eax + edx], mm4
- movq [eax + edx*2], mm1
+ movq [eax + edx], mm4
+ movq [eax + edx*2], mm1
}
}
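For reference, a scalar sketch of what the vertical filter above computes, assembled from the comments in the hunk (the R/L notation follows the lflim() description referenced from Section 7.10 of the spec; ys stands for _ystride, L is the limit loaded from _ll, and the helper name is illustrative):

static int oc_lflim1(int _r,int _l){
  if(_r<=-2*_l||_r>=2*_l)return 0; /*outside +/-2L: leave pixels alone*/
  if(_r<=-_l)return -_r-2*_l;      /*-(R+2L)*/
  if(_r>=_l)return 2*_l-_r;        /*-(R-2L)*/
  return _r;
}
/*Per column: R=oc_lflim1((pix[0]-pix[3*ys]+3*(pix[2*ys]-pix[ys])+4)>>3,L);
  then pix[ys]+=R and pix[2*ys]-=R, both saturated to 0..255 (the packuswb).*/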
@@ -221,7 +221,7 @@
Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride,
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
const ogg_int16_t *_ll){
/* todo: merge the comments from the GCC sources */
_asm {
@@ -229,79 +229,79 @@
mov edx, [_ystride]
mov eax, [_ll]
/*esi=_ystride*3*/
- lea esi, [edx + edx*2]
+ lea esi, [edx + edx*2]
- movd mm0, dword ptr [ecx]
- movd mm1, dword ptr [ecx + edx]
- movd mm2, dword ptr [ecx + edx*2]
- movd mm3, dword ptr [ecx + esi]
- punpcklbw mm0, mm1
- punpcklbw mm2, mm3
- movq mm1, mm0
- punpckhwd mm0, mm2
- punpcklwd mm1, mm2
- pxor mm7, mm7
- movq mm5, mm1
- punpcklbw mm1, mm7
- punpckhbw mm5, mm7
- movq mm3, mm0
- punpcklbw mm0, mm7
- punpckhbw mm3, mm7
- psubw mm1, mm3
- movq mm4, mm0
- pcmpeqw mm2, mm2
- psubw mm0, mm5
- psrlw mm2, 14
- pmullw mm0, mm2
- psrlw mm2, 1
- paddw mm0, mm1
- psllw mm2, 2
- paddw mm0, mm2
- psraw mm0, 3
- movq mm6, qword ptr [eax]
- movq mm1, mm0
- pxor mm2, mm2
- movq mm3, mm6
- psubw mm2, mm6
- psllw mm3, 1
- psllw mm2, 1
- pcmpgtw mm3, mm0
- pcmpgtw mm1, mm2
- pand mm0, mm3
- pand mm0, mm1
- psraw mm2, 1
- movq mm1, mm0
- movq mm3, mm6
- pcmpgtw mm2, mm0
- pcmpgtw mm1, mm6
- psllw mm3, 1
- psllw mm6, 1
- pand mm3, mm1
- pand mm6, mm2
- psubw mm0, mm3
- por mm1, mm2
- paddw mm0, mm6
- pand mm1, mm0
- psubw mm0, mm1
- psubw mm0, mm1
- paddw mm5, mm0
- psubw mm4, mm0
- packuswb mm5, mm7
- packuswb mm4, mm7
- punpcklbw mm5, mm4
- movd edi, mm5
- mov word ptr [ecx + 01H], di
- psrlq mm5, 32
- shr edi, 16
- mov word ptr [ecx + edx + 01H], di
- movd edi, mm5
+ movd mm0, dword ptr [ecx]
+ movd mm1, dword ptr [ecx + edx]
+ movd mm2, dword ptr [ecx + edx*2]
+ movd mm3, dword ptr [ecx + esi]
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpckhwd mm0, mm2
+ punpcklwd mm1, mm2
+ pxor mm7, mm7
+ movq mm5, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm5, mm7
+ movq mm3, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm3, mm7
+ psubw mm1, mm3
+ movq mm4, mm0
+ pcmpeqw mm2, mm2
+ psubw mm0, mm5
+ psrlw mm2, 14
+ pmullw mm0, mm2
+ psrlw mm2, 1
+ paddw mm0, mm1
+ psllw mm2, 2
+ paddw mm0, mm2
+ psraw mm0, 3
+ movq mm6, qword ptr [eax]
+ movq mm1, mm0
+ pxor mm2, mm2
+ movq mm3, mm6
+ psubw mm2, mm6
+ psllw mm3, 1
+ psllw mm2, 1
+ pcmpgtw mm3, mm0
+ pcmpgtw mm1, mm2
+ pand mm0, mm3
+ pand mm0, mm1
+ psraw mm2, 1
+ movq mm1, mm0
+ movq mm3, mm6
+ pcmpgtw mm2, mm0
+ pcmpgtw mm1, mm6
+ psllw mm3, 1
+ psllw mm6, 1
+ pand mm3, mm1
+ pand mm6, mm2
+ psubw mm0, mm3
+ por mm1, mm2
+ paddw mm0, mm6
+ pand mm1, mm0
+ psubw mm0, mm1
+ psubw mm0, mm1
+ paddw mm5, mm0
+ psubw mm4, mm0
+ packuswb mm5, mm7
+ packuswb mm4, mm7
+ punpcklbw mm5, mm4
+ movd edi, mm5
+ mov word ptr [ecx + 01H], di
+ psrlq mm5, 32
+ shr edi, 16
+ mov word ptr [ecx + edx + 01H], di
+ movd edi, mm5
mov word ptr [ecx + edx*2 + 01H], di
- shr edi, 16
- mov word ptr [ecx + esi + 01H], di
+ shr edi, 16
+ mov word ptr [ecx + esi + 01H], di
}
}
-static void loop_filter_h(unsigned char *_pix,int _ystride,
+static void loop_filter_h(unsigned char *_pix,int _ystride,
const ogg_int16_t *_ll){
_pix-=2;
loop_filter_h4(_pix,_ystride,_ll);
@@ -374,4 +374,4 @@
_mm_empty();
}
-#endif
\ No newline at end of file
+#endif
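The striping comment before loop_filter_h4() above deserves a small illustration: the punpcklbw/punpckhwd/punpcklwd shuffles amount to a 4x4 byte transpose, gathering the p0..p3 values of four consecutive rows into the word lanes of single registers so the same filter as the vertical case can be applied. A plain-C sketch with illustrative names:

/*_p[row][i]=_pix[row*_ystride+i] for rows 0..3; after the call _c[i]
  holds pixel column i across the four rows, widened to 16 bits.*/
static void oc_transpose4x4(ogg_int16_t _c[4][4],
 const unsigned char _p[4][4]){
  int row;
  int i;
  for(row=0;row<4;row++)for(i=0;i<4;i++)_c[i][row]=_p[row][i];
}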
Modified: trunk/theora/lib/dec/x86_vc/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxstate.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/mmxstate.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -42,33 +42,33 @@
/* Fill a block with value */
static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
- __m64 t = _value;
- _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
- _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
- _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
- _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+ __m64 t = _value;
+ _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+ _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+ _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+ _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
}
/* copy a block of 8 byte elements using different strides */
-static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
- unsigned char * _src, int _src_ystride){
- __m64 a,b,c,d,e,f,g,h;
- a = *(__m64*)(_src + 0 * _src_ystride);
- b = *(__m64*)(_src + 1 * _src_ystride);
- c = *(__m64*)(_src + 2 * _src_ystride);
- d = *(__m64*)(_src + 3 * _src_ystride);
- e = *(__m64*)(_src + 4 * _src_ystride);
- f = *(__m64*)(_src + 5 * _src_ystride);
- g = *(__m64*)(_src + 6 * _src_ystride);
- h = *(__m64*)(_src + 7 * _src_ystride);
- *(__m64*)(_dst + 0 * _dst_ystride) = a;
- *(__m64*)(_dst + 1 * _dst_ystride) = b;
- *(__m64*)(_dst + 2 * _dst_ystride) = c;
- *(__m64*)(_dst + 3 * _dst_ystride) = d;
- *(__m64*)(_dst + 4 * _dst_ystride) = e;
- *(__m64*)(_dst + 5 * _dst_ystride) = f;
- *(__m64*)(_dst + 6 * _dst_ystride) = g;
- *(__m64*)(_dst + 7 * _dst_ystride) = h;
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+ __m64 a,b,c,d,e,f,g,h;
+ a = *(__m64*)(_src + 0 * _src_ystride);
+ b = *(__m64*)(_src + 1 * _src_ystride);
+ c = *(__m64*)(_src + 2 * _src_ystride);
+ d = *(__m64*)(_src + 3 * _src_ystride);
+ e = *(__m64*)(_src + 4 * _src_ystride);
+ f = *(__m64*)(_src + 5 * _src_ystride);
+ g = *(__m64*)(_src + 6 * _src_ystride);
+ h = *(__m64*)(_src + 7 * _src_ystride);
+ *(__m64*)(_dst + 0 * _dst_ystride) = a;
+ *(__m64*)(_dst + 1 * _dst_ystride) = b;
+ *(__m64*)(_dst + 2 * _dst_ystride) = c;
+ *(__m64*)(_dst + 3 * _dst_ystride) = d;
+ *(__m64*)(_dst + 4 * _dst_ystride) = e;
+ *(__m64*)(_dst + 5 * _dst_ystride) = f;
+ *(__m64*)(_dst + 6 * _dst_ystride) = g;
+ *(__m64*)(_dst + 7 * _dst_ystride) = h;
}
void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
@@ -117,7 +117,7 @@
the iDCT.*/
/*First zero the buffer.*/
/*On K7, etc., this could be replaced with movntq and sfence.*/
- loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+ loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
/*This is planned to be rewritten in MMX.*/
@@ -156,12 +156,12 @@
_frag->buffer[ref_framei]+mvoffsets[1],ref_ystride,res_buf);
}
else{
- oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+ oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
_frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,res_buf);
}
}
- _mm_empty();
+ _mm_empty();
}
@@ -180,8 +180,8 @@
fragi_end=_fragis+_nfragis;
for(fragi=_fragis;fragi<fragi_end;fragi++){
oc_fragment *frag = _state->frags+*fragi;
- loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
- frag->buffer[src_framei], src_ystride);
+ loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+ frag->buffer[src_framei], src_ystride);
}
_m_empty();
}
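For comparison, a plain-C version of the strided 8x8 copy that loc_blockcopy_mmx() performs above; the MMX version loads all eight rows into registers before storing any of them, while this sketch (function name illustrative) just copies row by row:

#include <string.h>

static void oc_blockcopy_c(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src,int _src_ystride){
  int i;
  for(i=0;i<8;i++)memcpy(_dst+i*_dst_ystride,_src+i*_src_ystride,8);
}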
Modified: trunk/theora/lib/dec/x86_vc/x86state.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86state.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/x86state.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -23,19 +23,19 @@
void oc_state_vtable_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
- /* fill with defaults */
- oc_state_vtable_init_c(_state);
+ /* fill with defaults */
+ oc_state_vtable_init_c(_state);
- /* patch MMX functions */
- if(_state->cpu_flags&OC_CPU_X86_MMX){
+ /* patch MMX functions */
+ if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
- _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
- _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+ _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=oc_state_loop_filter_frag_rows_mmx;
- }
+ }
}
#endif
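The vtable setup above follows the usual fill-then-patch pattern: every entry is first set to the portable C routine by oc_state_vtable_init_c(), and only the entries with an MMX replacement are overwritten, so the table never holds an unset pointer. In miniature (the _c-suffixed default is assumed; the MMX name is taken from the hunk):

_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;    /*default*/
if(_state->cpu_flags&OC_CPU_X86_MMX){
  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;/*override*/
}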
Modified: trunk/theora/lib/enc/codec_internal.h
===================================================================
--- trunk/theora/lib/enc/codec_internal.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/codec_internal.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -287,10 +287,10 @@
struct PB_INSTANCE {
oggpack_buffer *opb;
theora_info info;
-
+
/* flag to indicate if the headers already have been written */
int HeadersWritten;
-
+
/* how far do we shift the granulepos to seperate out P frame counts? */
int keyframe_granule_shift;
@@ -489,14 +489,14 @@
ogg_int32_t fp_quant_Inter_Y_coeffs[64];
ogg_int32_t fp_quant_Inter_U_coeffs[64];
ogg_int32_t fp_quant_Inter_V_coeffs[64];
-
+
ogg_int32_t fp_quant_Y_round[64];
ogg_int32_t fp_quant_U_round[64];
ogg_int32_t fp_quant_V_round[64];
ogg_int32_t fp_quant_Inter_Y_round[64];
ogg_int32_t fp_quant_Inter_U_round[64];
ogg_int32_t fp_quant_Inter_V_round[64];
-
+
ogg_int32_t fp_ZeroBinSize_Y[64];
ogg_int32_t fp_ZeroBinSize_U[64];
ogg_int32_t fp_ZeroBinSize_V[64];
@@ -518,15 +518,6 @@
DspFunctions dsp; /* Selected functions for this platform */
-#ifdef _TH_DEBUG_
- Q_LIST_ENTRY (*QFragQUAN)[64]; /* Fragment Coefficients
- Array Pointers */
- Q_LIST_ENTRY (*QFragFREQ)[64]; /* Fragment Coefficients
- Array Pointers */
- Q_LIST_ENTRY (*QFragTIME)[64]; /* Fragment Coefficients
- Array Pointers */
-#endif
-
};
/* Encoder (Compressor) instance -- installed in a theora_state */
Modified: trunk/theora/lib/enc/dct_decode.c
===================================================================
--- trunk/theora/lib/enc/dct_decode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dct_decode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -65,15 +65,6 @@
/* Set up pointer into the quantisation buffer. */
pbi->quantized_list = &pbi->QFragData[FragmentNumber][0];
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragFREQ[FragmentNumber][dezigzag_index[i]]=
- pbi->quantized_list[i] * pbi->dequant_coeffs[i];
- }
-#endif
-
/* Invert quantisation and DCT to get pixel data. */
switch(pbi->FragCoefEOB[FragmentNumber]){
case 0:case 1:
@@ -89,14 +80,6 @@
dsp_IDctSlow(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
}
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragTIME[FragmentNumber][i]= pbi->ReconDataBuffer[i];
- }
-#endif
-
/* Convert fragment number to a pixel offset in a reconstruction buffer. */
ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
@@ -148,13 +131,13 @@
/* Select appropriate dequantiser matrix. */
if ( pbi->CodingMode == CODE_INTRA )
- if ( FragmentNumber <
+ if ( FragmentNumber <
(ogg_int32_t)(pbi->YPlaneFragments + pbi->UVPlaneFragments) )
pbi->dequant_coeffs = pbi->dequant_U_coeffs;
else
pbi->dequant_coeffs = pbi->dequant_V_coeffs;
else
- if ( FragmentNumber <
+ if ( FragmentNumber <
(ogg_int32_t)(pbi->YPlaneFragments + pbi->UVPlaneFragments) )
pbi->dequant_coeffs = pbi->dequant_InterU_coeffs;
else
@@ -164,15 +147,6 @@
/* Set up pointer into the quantisation buffer. */
pbi->quantized_list = &pbi->QFragData[FragmentNumber][0];
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragFREQ[FragmentNumber][dezigzag_index[i]]=
- pbi->quantized_list[i] * pbi->dequant_coeffs[i];
- }
-#endif
-
/* Invert quantisation and DCT to get pixel data. */
switch(pbi->FragCoefEOB[FragmentNumber]){
case 0:case 1:
@@ -188,14 +162,6 @@
dsp_IDctSlow(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
}
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragTIME[FragmentNumber][i]= pbi->ReconDataBuffer[i];
- }
-#endif
-
/* Convert fragment number to a pixel offset in a reconstruction buffer. */
ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
@@ -646,8 +612,8 @@
}
static void loop_filter_h(unsigned char * PixelPtr,
- ogg_int32_t LineLength,
- ogg_int16_t *BoundingValuePtr){
+ ogg_int32_t LineLength,
+ ogg_int16_t *BoundingValuePtr){
ogg_int32_t j;
ogg_int32_t FiltVal;
PixelPtr-=2;
@@ -669,8 +635,8 @@
}
static void loop_filter_v(unsigned char * PixelPtr,
- ogg_int32_t LineLength,
- ogg_int16_t *BoundingValuePtr){
+ ogg_int32_t LineLength,
+ ogg_int16_t *BoundingValuePtr){
ogg_int32_t j;
ogg_int32_t FiltVal;
PixelPtr -= 2*LineLength;
@@ -702,7 +668,7 @@
SetupBoundingValueArray_Generic(BoundingValues, FLimit);
for ( j = 0; j < 3 ; j++){
- ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_begin = bp;
ogg_uint32_t *bp_end;
int stride;
int h;
@@ -719,23 +685,23 @@
stride = pbi->UVStride;
break;
}
-
+
while(bp<bp_end){
ogg_uint32_t *bp_left = bp;
ogg_uint32_t *bp_right = bp + h;
while(bp<bp_right){
- if(cp[0]){
- if(bp>bp_left)
- loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,bvp);
- if(bp_left>bp_begin)
- loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,bvp);
- if(bp+1<bp_right && !cp[1])
- loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,bvp);
- if(bp+h<bp_end && !cp[h])
- loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,bvp);
- }
- bp++;
- cp++;
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,bvp);
+ if(bp_left>bp_begin)
+ loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,bvp);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,bvp);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,bvp);
+ }
+ bp++;
+ cp++;
}
}
}
@@ -842,7 +808,7 @@
FragsDown = pbi->VFragments >> 1;
break;
/*case 2: v */
- default:
+ default:
FromFragment = pbi->YPlaneFragments + pbi->UVPlaneFragments;
ToFragment = pbi->YPlaneFragments + (2 * pbi->UVPlaneFragments) ;
FragsAcross = pbi->HFragments >> 1;
@@ -949,94 +915,6 @@
/* Apply a loop filter to edge pixels of updated blocks */
dsp_LoopFilter(pbi->dsp, pbi, pbi->quant_info.loop_filter_limits[pbi->FrameQIndex]);
-#ifdef _TH_DEBUG_
- {
- int x,y,i,j,k,xn,yn,stride;
- int plane;
- int buf;
-
- /* dump fragment DCT components */
- for(plane=0;plane<3;plane++){
- char *plstr;
- int offset;
- switch(plane){
- case 0:
- plstr="Y";
- xn = pbi->HFragments;
- yn = pbi->VFragments;
- offset = 0;
- stride = pbi->YStride;
- break;
- case 1:
- plstr="U";
- xn = pbi->HFragments>>1;
- yn = pbi->VFragments>>1;
- offset = pbi->VFragments * pbi->HFragments;
- stride = pbi->UVStride;
- break;
- case 2:
- plstr="V";
- xn = pbi->HFragments>>1;
- yn = pbi->VFragments>>1;
- offset = pbi->VFragments * pbi->HFragments +
- ((pbi->VFragments * pbi->HFragments) >> 2);
- stride = pbi->UVStride;
- break;
- }
- for(y=0;y<yn;y++){
- for(x=0;x<xn;x++,i++){
-
- for(buf=0;buf<3;buf++){
- Q_LIST_ENTRY (*ptr)[64];
- char *bufn;
-
- switch(buf){
- case 0:
- bufn = "coded";
- ptr = pbi->QFragQUAN;
- break;
- case 1:
- bufn = "coeff";
- ptr = pbi->QFragFREQ;
- break;
- case 2:
- bufn = "idct";
- ptr = pbi->QFragTIME;
- break;
- }
-
- i = offset + y*xn + x;
-
- TH_DEBUG("%s %s [%d][%d] = {",bufn,plstr,x,y);
- if ( !pbi->display_fragments[i] )
- TH_DEBUG(" not coded }\n");
- else{
- int l=0;
- for(j=0;j<8;j++){
- TH_DEBUG("\n ");
- for(k=0;k<8;k++,l++){
- TH_DEBUG("%d ",ptr[i][l]);
- }
- }
- TH_DEBUG(" }\n");
- }
- }
-
- /* and the loop filter output, which is a flat struct */
- TH_DEBUG("recon %s [%d][%d] = {",plstr,x,y);
- for(j=0;j<8;j++){
- int l = pbi->recon_pixel_index_table[i] + j*stride;
- TH_DEBUG("\n ");
- for(k=0;k<8;k++,l++)
- TH_DEBUG("%d ", pbi->LastFrameRecon[l]);
- }
- TH_DEBUG(" }\n\n");
- }
- }
- }
- }
-#endif
-
/* We may need to update the UMV border */
UpdateUMVBorder(pbi, pbi->LastFrameRecon);
@@ -1054,7 +932,7 @@
funcs->LoopFilter = LoopFilter__c;
#if defined(USE_ASM)
// Todo: Port the dct for MSC one day.
-#if !defined (_MSC_VER)
+#if !defined (_MSC_VER)
if (cpu_flags & OC_CPU_X86_MMX) {
dsp_mmx_dct_decode_init(funcs);
}
Modified: trunk/theora/lib/enc/dct_encode.c
===================================================================
--- trunk/theora/lib/enc/dct_encode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dct_encode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -295,7 +295,7 @@
half pixel MC */
unsigned char *ReconPtr1; /* DCT reconstructed image pointers */
unsigned char *ReconPtr2; /* Pointer used in half pixel MC */
-
+
switch(MvDevisor) {
case 2:
MvShift = 1;
@@ -413,7 +413,7 @@
select_quantiser(&cpi->pb, BLOCK_INTER_Y);
} else {
LeftEdge = !((FragIndex-cpi->pb.YPlaneFragments)%(cpi->pb.HFragments>>1));
-
+
if(FragIndex < (ogg_int32_t)cpi->pb.YPlaneFragments + (ogg_int32_t)cpi->pb.UVPlaneFragments) {
/* U plane */
if ( cpi->pb.CodingMode == CODE_INTRA )
Modified: trunk/theora/lib/enc/dsp.c
===================================================================
--- trunk/theora/lib/enc/dsp.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dsp.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -72,7 +72,7 @@
static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
int i;
@@ -100,12 +100,12 @@
ogg_uint32_t SadValue;
ogg_uint32_t SadValue1;
- SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
+ SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
- SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
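The two four-term sums above are one eight-pixel row of a sum of absolute differences, split in half so the additions can pair. Compactly, and assuming DSP_OP_ABS_DIFF(a,b) is just the absolute difference its name suggests (helper name illustrative):

#include <stdlib.h>

static ogg_uint32_t oc_row_sad8(const unsigned char *_src1,
 const unsigned char *_src2){
  ogg_uint32_t sad;
  int          i;
  sad=0;
  for(i=0;i<8;i++)sad+=abs((int)_src1[i]-(int)_src2[i]);
  return sad;
}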
@@ -132,7 +132,7 @@
SadValue[5] += abs(Src1[5] - Src2[5]);
SadValue[6] += abs(Src1[6] - Src2[6]);
SadValue[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
@@ -146,18 +146,18 @@
SadValue2[5] += abs(Src1[5] - Src2[5]);
SadValue2[6] += abs(Src1[6] - Src2[6]);
SadValue2[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
-
+
for ( i = 0; i < 8; i++ ){
if ( SadValue[i] > MaxSad )
MaxSad = SadValue[i];
if ( SadValue2[i] > MaxSad )
MaxSad = SadValue2[i];
}
-
+
return MaxSad;
}
@@ -186,7 +186,7 @@
}
static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
+ unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres)
{
ogg_uint32_t i;
@@ -300,23 +300,23 @@
DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
/* Step to next row of block. */
SrcData += SrcStride;
RefDataPtr += RefStride;
@@ -382,7 +382,6 @@
void dsp_init(DspFunctions *funcs)
{
- /* TH_DEBUG("setting dsp functions to C defaults.\n"); */
funcs->save_fpu = nop;
funcs->restore_fpu = nop;
funcs->sub8x8 = sub8x8__c;
Modified: trunk/theora/lib/enc/dsp.h
===================================================================
--- trunk/theora/lib/enc/dsp.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dsp.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -36,19 +36,19 @@
void (*sub8x8avg2) (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine);
+ ogg_uint32_t ReconPixelsPerLine);
- void (*copy8x8) (unsigned char *src, unsigned char *dest,
+ void (*copy8x8) (unsigned char *src, unsigned char *dest,
ogg_uint32_t stride);
- void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+ void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
ogg_uint32_t LineStep);
- void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr,
+ void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr,
ogg_int16_t *ChangePtr, ogg_uint32_t LineStep);
- void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1,
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
ogg_uint32_t LineStep);
void (*fdct_short) (ogg_int16_t *InputData, ogg_int16_t *OutputData);
@@ -62,7 +62,7 @@
unsigned char *ptr2, ogg_uint32_t stride2);
ogg_uint32_t (*sad8x8_thres) (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
+ unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres);
ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride,
@@ -78,19 +78,19 @@
ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr1,
unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
-
+
void (*LoopFilter) (PB_INSTANCE *pbi, int FLimit);
void (*FilterVert) (unsigned char * PixelPtr,
ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
- void (*IDctSlow) (ogg_int16_t *InputData,
+ void (*IDctSlow) (ogg_int16_t *InputData,
ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
- void (*IDct3) (ogg_int16_t *InputData,
+ void (*IDct3) (ogg_int16_t *InputData,
ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
-
- void (*IDct10) (ogg_int16_t *InputData,
+
+ void (*IDct10) (ogg_int16_t *InputData,
ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
} DspFunctions;
Modified: trunk/theora/lib/enc/encode.c
===================================================================
--- trunk/theora/lib/enc/encode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/encode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -28,10 +28,10 @@
#define HIGHBITDUPPED(X) (((ogg_int16_t) X) >> 15)
static ogg_uint32_t QuadCodeComponent ( CP_INSTANCE *cpi,
- ogg_uint32_t FirstSB,
- ogg_uint32_t SBRows,
- ogg_uint32_t SBCols,
- ogg_uint32_t PixelsPerLine){
+ ogg_uint32_t FirstSB,
+ ogg_uint32_t SBRows,
+ ogg_uint32_t SBCols,
+ ogg_uint32_t PixelsPerLine){
ogg_int32_t FragIndex; /* Fragment number */
ogg_uint32_t MB, B; /* Macro-Block, Block indices */
@@ -49,7 +49,7 @@
for ( SBcol=0; SBcol<SBCols; SBcol++ ) {
/* Check its four Macro-Blocks */
/* 'Macro-Block' is a misnomer in the chroma planes; this is
- really just a Hilbert curve iterator */
+ really just a Hilbert curve iterator */
for ( MB=0; MB<4; MB++ ) {
if ( QuadMapToMBTopLeft(cpi->pb.BlockMap,SB,MB) >= 0 ) {
@@ -359,31 +359,15 @@
/* Add the appropriate mode entropy token. */
ModeIndex = SchemeList[cpi->ModeList[i]];
oggpackB_write( opb, ModeBitPatterns[ModeIndex],
- (ogg_uint32_t)ModeBitLengths[ModeIndex] );
+ (ogg_uint32_t)ModeBitLengths[ModeIndex] );
}
}else{
/* Fall back to MODE_BITS per entry */
for ( i = 0; i < cpi->ModeListCount; i++)
/* Add the appropriate mode entropy token. */
- oggpackB_write( opb, cpi->ModeList[i], MODE_BITS );
+ oggpackB_write( opb, cpi->ModeList[i], MODE_BITS );
}
-
-#ifdef _TH_DEBUG_
- TH_DEBUG("mode encode scheme = %d\n",(int)BestScheme);
- if ( BestScheme == 0 ) {
- TH_DEBUG("mode scheme list = { ");
- for ( j = 0; j < MAX_MODES; j++ )
- TH_DEBUG("%d ",(int)BestModeSchemes[j]);
- TH_DEBUG("}\n");
- }
- TH_DEBUG("mode list = { ");
- for ( i = 0; i < cpi->ModeListCount; i++) {
- if((i&0x1f)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%d ",cpi->ModeList[i]);
- }
- TH_DEBUG("\n}\n");
-#endif
+
}
static void PackMotionVectors (CP_INSTANCE *cpi) {
@@ -422,15 +406,6 @@
(ogg_uint32_t)MvBitsPtr[cpi->MVList[i].y] );
}
-#ifdef _TH_DEBUG_
- TH_DEBUG("motion vectors = {");
- for ( i = 0; i < (ogg_int32_t)cpi->MvListCount; i++ ) {
- if((i&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",cpi->MVList[i].x,cpi->MVList[i].y);
- }
- TH_DEBUG("\n}\n");
-#endif
}
static void PackEOBRun( CP_INSTANCE *cpi) {
@@ -905,17 +880,6 @@
}
}
-#ifdef _TH_DEBUG_
- {
- int j;
- for ( i = 0; i < cpi->pb.CodedBlockIndex; i++ ) {
- FragIndex = cpi->pb.CodedBlockList[i];
- for(j=0;j<64;j++)
- cpi->pb.QFragQUAN[FragIndex][j] = cpi->pb.QFragData[FragIndex][j];
- }
- }
-#endif
-
/* Pack DC tokens and adjust the ones we couldn't predict 2d */
for ( i = 0; i < cpi->pb.CodedBlockIndex; i++ ) {
/* Get the linear index for the current coded fragment. */
@@ -1013,7 +977,7 @@
cpi->pb.FragCodingMethod[cpi->pb.YPlaneFragments +
cpi->pb.UVPlaneFragments + UVFragOffset] =
cpi->MBCodingMode;
- }
+ }
}
/* Next Super-Block */
@@ -1391,7 +1355,7 @@
cpi->MBCodingMode = CODE_INTER_PLUS_MV;
SetMBMotionVectorsAndMode(cpi,YFragIndex,UFragIndex,
- VFragIndex,&InterMVect);
+ VFragIndex,&InterMVect);
/* Update Prior last mv with last mv */
PriorLastInterMVect.x = LastInterMVect.x;
@@ -1407,7 +1371,7 @@
cpi->MBCodingMode = CODE_GOLDEN_MV;
SetMBMotionVectorsAndMode(cpi,YFragIndex,UFragIndex,
- VFragIndex,&GFMVect);
+ VFragIndex,&GFMVect);
/* Note last inter GF MV for future use */
LastGFMVect.x = GFMVect.x;
@@ -1463,7 +1427,7 @@
cpi->MBCodingMode = CODE_INTRA;
SetMBMotionVectorsAndMode(cpi,YFragIndex,UFragIndex,
- VFragIndex,&ZeroVect);
+ VFragIndex,&ZeroVect);
}
@@ -1487,17 +1451,11 @@
void WriteFrameHeader( CP_INSTANCE *cpi) {
ogg_uint32_t i;
oggpack_buffer *opb=cpi->oggbuffer;
-
- TH_DEBUG("\n>>>> beginning frame %ld\n\n",dframe);
-
/* Output the frame type (base/key frame or inter frame) */
oggpackB_write( opb, cpi->pb.FrameType, 1 );
- TH_DEBUG("frame type = video, %s\n",cpi->pb.FrameType?"predicted":"key");
-
/* Write out details of the current value of Q... variable resolution. */
for ( i = 0; i < Q_TABLE_SIZE; i++ ) {
if ( cpi->pb.ThisFrameQualityValue == cpi->pb.QThreshTable[i] ) {
- TH_DEBUG("frame quality = { %d }\n",i);
oggpackB_write( opb, i, 6 );
break;
}
Modified: trunk/theora/lib/enc/encoder_quant.c
===================================================================
--- trunk/theora/lib/enc/encoder_quant.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/encoder_quant.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -20,12 +20,6 @@
#include "codec_internal.h"
#include "quant_lookup.h"
-#ifdef _TH_DEBUG_
-#include <stdio.h>
-extern FILE *debugout;
-extern long dframe;
-#endif
-
#define OC_QUANT_MAX (1024<<2)
static const unsigned DC_QUANT_MIN[2]={4<<2,8<<2};
static const unsigned AC_QUANT_MIN[2]={2<<2,4<<2};
@@ -41,9 +35,9 @@
void WriteQTables(PB_INSTANCE *pbi,oggpack_buffer* _opb) {
-
- th_quant_info *_qinfo = &pbi->quant_info;
-
+
+ th_quant_info *_qinfo = &pbi->quant_info;
+
const th_quant_ranges *qranges;
const th_quant_base *base_mats[2*3*64];
int indices[2][3][64];
@@ -58,7 +52,7 @@
int plj;
int bmi;
int i;
-
+
/*Unlike the scale tables, we can't assume the maximum value will be in
index 0, so search for it here.*/
i=_qinfo->loop_filter_limits[0];
@@ -149,133 +143,57 @@
th_quant_info *qinfo = &pbi->quant_info;
pbi->QThreshTable = pbi->quant_info.ac_scale;
-
+
for(qti=0;qti<2;qti++){
for(pli=0;pli<3;pli++){
int qi; /* quality index */
int qri; /* range iterator */
-
+
for(qi=0,qri=0; qri<=qinfo->qi_ranges[qti][pli].nranges; qri++){
- th_quant_base base;
-
- ogg_uint32_t q;
- int qi_start;
- int qi_end;
- int ci;
- memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
- sizeof(base));
-
- qi_start=qi;
- if(qri==qinfo->qi_ranges[qti][pli].nranges)
- qi_end=qi+1;
- else
- qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
-
- /* Iterate over quality indicies in this range */
- for(;;){
-
- /*Scale DC the coefficient from the proper table.*/
- q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
- q=OC_CLAMPI(DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- pbi->quant_tables[qti][pli][qi][0]=(ogg_uint16_t)q;
-
- /*Now scale AC coefficients from the proper table.*/
- for(ci=1;ci<64;ci++){
- q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
- q=OC_CLAMPI(AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- pbi->quant_tables[qti][pli][qi][ci]=(ogg_uint16_t)q;
- }
-
- if(++qi>=qi_end)break;
-
- /*Interpolate the next base matrix.*/
- for(ci=0;ci<64;ci++){
- base[ci]=(unsigned char)
- ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
- (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
- +qinfo->qi_ranges[qti][pli].sizes[qri])/
- (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
- }
- }
- }
- }
- }
+ th_quant_base base;
-#ifdef _TH_DEBUG_
- int i, j, k, l;
+ ogg_uint32_t q;
+ int qi_start;
+ int qi_end;
+ int ci;
+ memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
+ sizeof(base));
- /* dump the static tables */
- {
- int i, j, k, l, m;
- TH_DEBUG("loop filter limits = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",qinfo->loop_filter_limits[i]);
- }
- TH_DEBUG("\n}\n\n");
+ qi_start=qi;
+ if(qri==qinfo->qi_ranges[qti][pli].nranges)
+ qi_end=qi+1;
+ else
+ qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
- TH_DEBUG("ac scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",qinfo->ac_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
+ /* Iterate over quality indices in this range */
+ for(;;){
- TH_DEBUG("dc scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",qinfo->dc_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
+ /*Scale the DC coefficient from the proper table.*/
+ q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
+ q=OC_CLAMPI(DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ pbi->quant_tables[qti][pli][qi][0]=(ogg_uint16_t)q;
- for(k=0;k<2;k++)
- for(l=0;l<3;l++){
- char *name[2][3]={
- {"intra Y bases","intra U bases", "intra V bases"},
- {"inter Y bases","inter U bases", "inter V bases"}
- };
+ /*Now scale AC coefficients from the proper table.*/
+ for(ci=1;ci<64;ci++){
+ q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
+ q=OC_CLAMPI(AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ pbi->quant_tables[qti][pli][qi][ci]=(ogg_uint16_t)q;
+ }
- th_quant_ranges *r = &qinfo->qi_ranges[k][l];
- TH_DEBUG("%s = {\n",name[k][l]);
- TH_DEBUG(" ranges = %d\n",r->nranges);
- TH_DEBUG(" intervals = { ");
- for(i=0;i<r->nranges;i++)
- TH_DEBUG("%3d ",r->sizes[i]);
- TH_DEBUG("}\n");
- TH_DEBUG("\n matricies = { ");
- for(m=0;m<r->nranges+1;m++){
- TH_DEBUG("\n { ");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<8;i++,j++)
- TH_DEBUG("%3d ",r->base_matrices[m][i]);
- }
- TH_DEBUG("\n }");
- }
- TH_DEBUG("\n }\n");
- }
- }
+ if(++qi>=qi_end)break;
- /* dump the calculated quantizer tables */
- for(i=0;i<2;i++){
- for(j=0;j<3;j++){
- for(k=0;k<64;k++){
- TH_DEBUG("quantizer table [%s][%s][Q%d] = {",
- (i==0?"intra":"inter"),(j==0?"Y":(j==1?"U":"V")),k);
- for(l=0;l<64;l++){
- if((l&7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%4d ",pbi->quant_tables[i][j][k][l]);
- }
- TH_DEBUG("}\n");
+ /*Interpolate the next base matrix.*/
+ for(ci=0;ci<64;ci++){
+ base[ci]=(unsigned char)
+ ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+ (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+ +qinfo->qi_ranges[qti][pli].sizes[qri])/
+ (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
+ }
+ }
}
}
}
-#endif
-
}
static void BuildZigZagIndex(PB_INSTANCE *pbi){
@@ -289,17 +207,17 @@
}
static void init_quantizer ( CP_INSTANCE *cpi,
- unsigned char QIndex ){
+ unsigned char QIndex ){
int i;
double ZBinFactor;
double RoundingFactor;
-
+
double temp_fp_quant_coeffs;
double temp_fp_quant_round;
double temp_fp_ZeroBinSize;
PB_INSTANCE *pbi = &cpi->pb;
-
-
+
+
const ogg_uint16_t * temp_Y_coeffs;
const ogg_uint16_t * temp_U_coeffs;
const ogg_uint16_t * temp_V_coeffs;
@@ -307,22 +225,22 @@
const ogg_uint16_t * temp_Inter_U_coeffs;
const ogg_uint16_t * temp_Inter_V_coeffs;
ogg_uint16_t scale_factor = cpi->pb.quant_info.ac_scale[QIndex];
-
+
/* Notes on setup of quantisers. The initial multiplication by
the scale factor is done in the ogg_int32_t domain to ensure that the
precision in the quantiser is the same as in the inverse
quantiser where all calculations are integer. The "<< 2" is a
normalisation factor for the forward DCT transform. */
-
+
temp_Y_coeffs = pbi->quant_tables[0][0][QIndex];
temp_U_coeffs = pbi->quant_tables[0][1][QIndex];
temp_V_coeffs = pbi->quant_tables[0][2][QIndex];
temp_Inter_Y_coeffs = pbi->quant_tables[1][0][QIndex];
temp_Inter_U_coeffs = pbi->quant_tables[1][1][QIndex];
temp_Inter_V_coeffs = pbi->quant_tables[1][2][QIndex];
-
+
ZBinFactor = 0.9;
-
+
switch(cpi->pb.info.sharpness){
case 0:
ZBinFactor = 0.65;
@@ -393,7 +311,7 @@
pbi->fp_ZeroBinSize_Inter_U[0]= (0.5 + temp_fp_ZeroBinSize);
temp_fp_quant_coeffs= 1.0 / temp_fp_quant_coeffs;
pbi->fp_quant_Inter_U_coeffs[0]= (0.5 + SHIFT16 * temp_fp_quant_coeffs);
-
+
/* Inter V */
temp_fp_quant_coeffs = temp_Inter_V_coeffs[0];
temp_fp_quant_round = temp_fp_quant_coeffs * RoundingFactor;
@@ -402,8 +320,8 @@
pbi->fp_ZeroBinSize_Inter_V[0]= (0.5 + temp_fp_ZeroBinSize);
temp_fp_quant_coeffs= 1.0 / temp_fp_quant_coeffs;
pbi->fp_quant_Inter_V_coeffs[0]= (0.5 + SHIFT16 * temp_fp_quant_coeffs);
-
+
for ( i = 1; i < 64; i++ ){
/* Intra Y */
temp_fp_quant_coeffs = temp_Y_coeffs[i];
@@ -469,7 +387,7 @@
void select_quantiser(PB_INSTANCE *pbi, int type) {
/* select a quantiser according to what plane has to be coded in what
* mode. Could be extended to a more sophisticated scheme. */
-
+
switch(type) {
case BLOCK_Y:
pbi->fquant_coeffs = pbi->fp_quant_Y_coeffs;
@@ -494,12 +412,12 @@
case BLOCK_INTER_U:
pbi->fquant_coeffs = pbi->fp_quant_Inter_U_coeffs;
pbi->fquant_round = pbi->fp_quant_Inter_U_round;
- pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_U;
+ pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_U;
break;
case BLOCK_INTER_V:
pbi->fquant_coeffs = pbi->fp_quant_Inter_V_coeffs;
pbi->fquant_round = pbi->fp_quant_Inter_V_round;
- pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_V;
+ pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_V;
break;
}
}
@@ -523,7 +441,7 @@
/* Note that we add half the divisor to effect rounding on positive numbers */
for( i = 0; i < VFRAGPIXELS; i++) {
-
+
int col;
/* Iterate through columns */
for( col = 0; col < 8; col++) {
@@ -538,7 +456,7 @@
quantized_list[ZigZagPtr[col]] = ( val < -511 ) ? -511 : val;
}
}
-
+
FquantRoundPtr += 8;
FquantCoeffsPtr += 8;
FquantZBinSizePtr += 8;
@@ -548,9 +466,9 @@
}
static void init_dequantizer ( PB_INSTANCE *pbi,
- unsigned char QIndex ){
+ unsigned char QIndex ){
int i, j;
-
+
ogg_uint16_t * InterY_coeffs;
ogg_uint16_t * InterU_coeffs;
ogg_uint16_t * InterV_coeffs;
@@ -564,7 +482,7 @@
InterY_coeffs = pbi->quant_tables[1][0][QIndex];
InterU_coeffs = pbi->quant_tables[1][1][QIndex];
InterV_coeffs = pbi->quant_tables[1][2][QIndex];
-
+
/* invert the dequant index into the quant index;
the decoder (dxer) uses a different order than the encoder (cxer). */
BuildZigZagIndex(pbi);
@@ -606,7 +524,7 @@
else if (NewQIndex < 0) NewQIndex = 0;
pbi->FrameQIndex = NewQIndex;
-
+
qscale = pbi->quant_info.ac_scale[NewQIndex];
pbi->ThisFrameQualityValue = qscale;
@@ -624,7 +542,7 @@
qscale = pbi->quant_info.ac_scale[Q_TABLE_SIZE-1];
else if ( qscale > pbi->quant_info.ac_scale[0] )
qscale = pbi->quant_info.ac_scale[0];
-
+
/* Set the inter/intra decision control variables. */
pbi->FrameQIndex = Q_TABLE_SIZE - 1;
while ((ogg_int32_t) pbi->FrameQIndex >= 0 ) {
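
For reference while reading the re-indented quantizer setup above: each entry of quant_tables[qti][pli][qi] is a base-matrix value, linearly interpolated inside its qi range, then scaled by the DC or AC scale for that quality index and clamped to the legal range. A minimal C sketch of that per-entry arithmetic, reusing the constants defined near the top of this file; it is an illustration, not code from the patch:

  static ogg_uint16_t quant_entry(ogg_uint32_t scale,unsigned char base,
   unsigned qmin){
    /* scale is dc_scale[qi] or ac_scale[qi]; qmin is DC_QUANT_MIN[qti] or
       AC_QUANT_MIN[qti]; base is the (interpolated) base-matrix entry. */
    ogg_uint32_t q=(scale*base/100)<<2;  /* percentage scale, then <<2 normalization */
    return (ogg_uint16_t)OC_CLAMPI(qmin,q,OC_QUANT_MAX);
  }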
Modified: trunk/theora/lib/enc/encoder_toplevel.c
===================================================================
--- trunk/theora/lib/enc/encoder_toplevel.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/encoder_toplevel.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -26,18 +26,13 @@
#include "dsp.h"
#include "codec_internal.h"
-#ifdef _TH_DEBUG_
-FILE *debugout=NULL;
-long dframe=0;
-#endif
-
#define A_TABLE_SIZE 29
#define DF_CANDIDATE_WINDOW 5
/*
- * th_quant_info for VP3
+ * th_quant_info for VP3
*/
-
+
/*The default quantization parameters used by VP3.1.*/
static const int OC_VP31_RANGE_SIZES[1]={63};
static const th_quant_base OC_VP31_BASES_INTRA_Y[2]={
@@ -897,10 +892,6 @@
CP_INSTANCE *cpi;
-#ifdef _TH_DEBUG_
- debugout=fopen("theoraenc-debugout.txt","w");
-#endif
-
memset(th, 0, sizeof(*th));
/*Currently only the 4:2:0 format is supported.*/
if(c->pixelformat!=OC_PF_420)return OC_IMPL;
@@ -1043,7 +1034,7 @@
current clip. */
cpi->ThisIsFirstFrame = 1;
cpi->readyflag = 1;
-
+
cpi->pb.HeadersWritten = 0;
/*We overload this flag to track header output.*/
cpi->doneflag=-3;
@@ -1111,7 +1102,7 @@
if(cpi->LastKeyFrame >= (ogg_uint32_t)
cpi->pb.info.keyframe_frequency_force)
cpi->ThisIsKeyFrame = 1;
-
+
if ( cpi->ThisIsKeyFrame ) {
CompressKeyFrame(cpi);
cpi->ThisIsKeyFrame = 0;
@@ -1131,10 +1122,6 @@
((cpi->CurrentFrame - cpi->LastKeyFrame)<<cpi->pb.keyframe_granule_shift)+
cpi->LastKeyFrame - 1;
-#ifdef _TH_DEBUG_
- dframe++;
-#endif
-
return 0;
}
@@ -1170,7 +1157,7 @@
static void _tp_writelsbint(oggpack_buffer *opb, long value)
{
- oggpackB_write(opb, value&0xFF, 8);
+ oggpackB_write(opb, value&0xFF, 8);
oggpackB_write(opb, value>>8&0xFF, 8);
oggpackB_write(opb, value>>16&0xFF, 8);
oggpackB_write(opb, value>>24&0xFF, 8);
@@ -1197,7 +1184,7 @@
/* Applications use offset_y to mean offset from the top of the image; the
* meaning in the bitstream is the opposite (from the bottom). Transform.
*/
- offset_y = cpi->pb.info.height - cpi->pb.info.frame_height -
+ offset_y = cpi->pb.info.height - cpi->pb.info.frame_height -
cpi->pb.info.offset_y;
oggpackB_write(cpi->oggbuffer,offset_y,8);
@@ -1321,11 +1308,6 @@
_ogg_free(cpi);
}
-#ifdef _TH_DEBUG_
- fclose(debugout);
- debugout=NULL;
-#endif
-
memset(th,0,sizeof(*th));
}
@@ -1377,59 +1359,59 @@
CP_INSTANCE *cpi;
PB_INSTANCE *pbi;
int value;
-
+
if(th == NULL)
return TH_EFAULT;
cpi = th->internal_encode;
pbi = &cpi->pb;
-
+
switch(req) {
case TH_ENCCTL_SET_QUANT_PARAMS:
if( ( buf==NULL&&buf_sz!=0 )
- || ( buf!=NULL&&buf_sz!=sizeof(th_quant_info) )
- || cpi->pb.HeadersWritten ){
+ || ( buf!=NULL&&buf_sz!=sizeof(th_quant_info) )
+ || cpi->pb.HeadersWritten ){
return TH_EINVAL;
}
-
+
memcpy(&pbi->quant_info, buf, sizeof(th_quant_info));
InitQTables(pbi);
-
+
return 0;
case TH_ENCCTL_SET_VP3_COMPATIBLE:
if(cpi->pb.HeadersWritten)
return TH_EINVAL;
-
+
memcpy(&pbi->quant_info, &TH_VP31_QUANT_INFO, sizeof(th_quant_info));
InitQTables(pbi);
-
+
return 0;
case TH_ENCCTL_SET_SPLEVEL:
if(buf == NULL || buf_sz != sizeof(int))
return TH_EINVAL;
-
+
memcpy(&value, buf, sizeof(int));
-
+
switch(value) {
case 0:
cpi->MotionCompensation = 1;
pbi->info.quick_p = 0;
break;
-
+
case 1:
cpi->MotionCompensation = 1;
pbi->info.quick_p = 1;
break;
-
+
case 2:
cpi->MotionCompensation = 0;
pbi->info.quick_p = 1;
break;
-
+
default:
- return TH_EINVAL;
+ return TH_EINVAL;
}
-
+
return 0;
case TH_ENCCTL_GET_SPLEVEL_MAX:
value = 2;
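
A concrete example of the offset_y transform in the header-writing hunk above (hypothetical numbers, not taken from the patch): for a 720x480 picture with a 704x448 visible frame and an application-supplied offset_y of 8 from the top, the value written to the bitstream is the offset measured from the bottom edge:

  offset_y = 480 - 448 - 8;  /* == 24: same crop window, counted from the bottom */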
Modified: trunk/theora/lib/enc/frarray.c
===================================================================
--- trunk/theora/lib/enc/frarray.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/frarray.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -123,11 +123,6 @@
memset( cpi->PartiallyCodedFlags, 0, cpi->pb.SuperBlocks );
memset( cpi->BlockCodedFlags, 0, cpi->pb.UnitFragments);
-#ifdef _TH_DEBUG_
- unsigned char blockraster[cpi->pb.UnitFragments];
- memset(blockraster,0,sizeof(blockraster));
-#endif
-
for( SB = 0; SB < cpi->pb.SuperBlocks; SB++ ) {
/* Check for coded blocks and macro-blocks */
for ( MB=0; MB<4; MB++ ) {
@@ -144,10 +139,6 @@
cpi->pb.SBCodedFlags[SB] = 1; /* SB at least partly coded */
cpi->BlockCodedFlags[BListIndex] = 1; /* Block is coded */
-#ifdef _TH_DEBUG_
- blockraster[DfBlockIndex]=1;
-#endif
-
}else{
cpi->pb.SBFullyFlags[SB] = 0; /* SB not fully coded */
cpi->BlockCodedFlags[BListIndex] = 0; /* Block is not coded */
@@ -170,77 +161,16 @@
}
}
-#ifdef _TH_DEBUG_
- // assuming 4:2:0 right now
- TH_DEBUG("predicted (partially coded frame)\n");
- TH_DEBUG("superblock coded flags = {");
- int x,y;
- i=0;
-
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+31)/32;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+31)/32;x++,i++)
- TH_DEBUG("%x", ((cpi->pb.SBFullyFlags[i]!=0)|
- (cpi->PartiallyCodedFlags[i]!=0)));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+63)/64;x++,i++)
- TH_DEBUG("%x", ((cpi->pb.SBFullyFlags[i]!=0)|
- (cpi->PartiallyCodedFlags[i]!=0)));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+63)/64;x++,i++)
- TH_DEBUG("%x", ((cpi->pb.SBFullyFlags[i]!=0)|
- (cpi->PartiallyCodedFlags[i]!=0)));
- }
- TH_DEBUG("\n}\n");
-
- if(i!=cpi->pb.SuperBlocks)
- TH_DEBUG("WARNING! superblock count, raster %d != flat %d\n",
- i,cpi->pb.SuperBlocks);
-
- TH_DEBUG("block coded flags = {");
-
- i=0;
-
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+7)/8;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+7)/8;x++,i++)
- TH_DEBUG("%x", blockraster[i]);
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+15)/16;x++,i++)
- TH_DEBUG("%x", blockraster[i]);
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+15)/16;x++,i++)
- TH_DEBUG("%x", blockraster[i]);
- }
- TH_DEBUG("\n}\n");
-
- if(i!=cpi->pb.UnitFragments)
- TH_DEBUG("WARNING! block count, raster %d != flat %d\n",
- i,cpi->pb.UnitFragments);
-#endif
-
/* Code list of partially coded Super-Block. */
val = cpi->PartiallyCodedFlags[0];
oggpackB_write( cpi->oggbuffer, (ogg_uint32_t)val, 1);
- i = 0;
+ i = 0;
while ( i < cpi->pb.SuperBlocks ) {
run_count = 0;
- while ( (i<cpi->pb.SuperBlocks) &&
- (cpi->PartiallyCodedFlags[i]==val) &&
- run_count<4129 ) {
+ while ( (i<cpi->pb.SuperBlocks) &&
+ (cpi->PartiallyCodedFlags[i]==val) &&
+ run_count<4129 ) {
i++;
run_count++;
}
@@ -251,13 +181,13 @@
if(run_count >= 4129 && i < cpi->pb.SuperBlocks ){
val = cpi->PartiallyCodedFlags[i];
oggpackB_write( cpi->oggbuffer, (ogg_uint32_t)val, 1);
-
+
}else
val = ( val == 0 ) ? 1 : 0;
}
/* RLC Super-Block fully/not coded. */
- i = 0;
+ i = 0;
/* Skip partially coded blocks */
while( (i < cpi->pb.SuperBlocks) && cpi->PartiallyCodedFlags[i] )
@@ -269,9 +199,9 @@
while ( i < cpi->pb.SuperBlocks ) {
run_count = 0;
- while ( (i < cpi->pb.SuperBlocks) &&
- (cpi->pb.SBFullyFlags[i] == val) &&
- run_count < 4129) {
+ while ( (i < cpi->pb.SuperBlocks) &&
+ (cpi->pb.SBFullyFlags[i] == val) &&
+ run_count < 4129) {
i++;
/* Skip partially coded blocks */
while( (i < cpi->pb.SuperBlocks) && cpi->PartiallyCodedFlags[i] )
@@ -311,6 +241,3 @@
}
}
}
-
-
-
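
Because the hunks above are mostly whitespace-only reindentation, the run-length loop they touch is easier to follow in outline. A sketch of its control flow (encode_run is a hypothetical stand-in for the run-length token writer, which lies outside the hunks shown):

  val=flags[0];
  oggpackB_write(opb,(ogg_uint32_t)val,1);
  i=0;
  while(i<nflags){
    run_count=0;
    while(i<nflags&&flags[i]==val&&run_count<4129){
      i++;
      run_count++;
    }
    encode_run(opb,run_count);  /* hypothetical helper */
    if(run_count>=4129&&i<nflags){
      /* A maximal-length run does not toggle the value; re-send the next bit. */
      val=flags[i];
      oggpackB_write(opb,(ogg_uint32_t)val,1);
    }else val=(val==0)?1:0;
  }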
Modified: trunk/theora/lib/enc/frinit.c
===================================================================
--- trunk/theora/lib/enc/frinit.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/frinit.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -141,14 +141,6 @@
if(pbi->FragCoefEOB) _ogg_free(pbi->FragCoefEOB);
if(pbi->skipped_display_fragments) _ogg_free(pbi->skipped_display_fragments);
if(pbi->QFragData) _ogg_free(pbi->QFragData);
-#ifdef _TH_DEBUG_
- if(pbi->QFragTIME) _ogg_free(pbi->QFragTIME);
- if(pbi->QFragFREQ) _ogg_free(pbi->QFragFREQ);
- if(pbi->QFragQUAN) _ogg_free(pbi->QFragQUAN);
- pbi->QFragTIME = 0;
- pbi->QFragFREQ = 0;
- pbi->QFragQUAN = 0;
-#endif
if(pbi->TokenList) _ogg_free(pbi->TokenList);
if(pbi->FragCodingMethod) _ogg_free(pbi->FragCodingMethod);
if(pbi->FragCoordinates) _ogg_free(pbi->FragCoordinates);
@@ -243,19 +235,6 @@
pbi->QFragData =
_ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragData));
-#ifdef _TH_DEBUG_
-
- pbi->QFragTIME =
- _ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragTIME));
-
- pbi->QFragFREQ =
- _ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragFREQ));
-
- pbi->QFragQUAN =
- _ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragQUAN));
-
-#endif
-
pbi->TokenList =
_ogg_malloc(pbi->UnitFragments * sizeof(*pbi->TokenList));
Modified: trunk/theora/lib/enc/mcomp.c
===================================================================
--- trunk/theora/lib/enc/mcomp.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/mcomp.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -111,7 +111,7 @@
RefDataPtr1, RefPixelsPerLine);
}else{
DiffVal = dsp_inter8x8_err_xy2 (cpi->dsp, NewDataPtr, PixelsPerLine,
- RefDataPtr1,
+ RefDataPtr1,
RefDataPtr2, RefPixelsPerLine);
}
@@ -133,11 +133,11 @@
if ( RefOffset == 0 ) {
/* Simple case as for non 0.5 pixel */
- DiffVal += dsp_sad8x8 (cpi->dsp, SrcData, PixelsPerLine,
+ DiffVal += dsp_sad8x8 (cpi->dsp, SrcData, PixelsPerLine,
RefDataPtr1, RefPixelsPerLine);
} else {
- DiffVal += dsp_sad8x8_xy2_thres (cpi->dsp, SrcData, PixelsPerLine,
- RefDataPtr1,
+ DiffVal += dsp_sad8x8_xy2_thres (cpi->dsp, SrcData, PixelsPerLine,
+ RefDataPtr1,
RefDataPtr2, RefPixelsPerLine, BestSoFar);
}
@@ -729,7 +729,7 @@
dsp_save_fpu (cpi->dsp);
- /* For the moment the 4MV mode is only deemed to be valid
+ /* For the moment the 4MV mode is only deemed to be valid
if all four Y blocks are to be updated */
/* This may be adapted later. */
if ( cpi->pb.display_fragments[FragIndex] &&
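
The RefOffset test above selects between a whole-pel SAD and a half-pel SAD that averages two reference pointers. In plain C the half-pel variant amounts to roughly the following (a sketch only; the MMX version later in this patch truncates the average, the MMXEXT version rounds it up, and both also take an early-out threshold argument):

  static ogg_uint32_t sad8x8_xy2_c(const unsigned char *src,ogg_uint32_t src_stride,
   const unsigned char *ref1,const unsigned char *ref2,ogg_uint32_t ref_stride){
    ogg_uint32_t sad=0;
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        int p=(ref1[j]+ref2[j])>>1;  /* half-pel reference sample */
        int d=src[j]-p;
        sad+=d<0?-d:d;
      }
      src+=src_stride;
      ref1+=ref_stride;
      ref2+=ref_stride;
    }
    return sad;
  }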
Modified: trunk/theora/lib/enc/x86_32/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dct_decode_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/dct_decode_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -27,7 +27,7 @@
0x0004000400040004LL;
static void loop_filter_v(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
_pix-=_ystride*2;
__asm__ __volatile__(
@@ -210,7 +210,7 @@
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
static void loop_filter_h4(unsigned char *_pix,long _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
long edi;
__asm__ __volatile__(
@@ -343,12 +343,12 @@
}
static void loop_filter_h(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
_pix-=2;
loop_filter_h4(_pix,_ystride,_ll);
loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
}
-
+
static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){
int j;
ogg_int16_t __attribute__((aligned(8))) ll[4];
@@ -359,7 +359,7 @@
ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
for ( j = 0; j < 3 ; j++){
- ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_begin = bp;
ogg_uint32_t *bp_end;
int stride;
int h;
@@ -376,23 +376,23 @@
stride = pbi->UVStride;
break;
}
-
+
while(bp<bp_end){
ogg_uint32_t *bp_left = bp;
ogg_uint32_t *bp_right = bp + h;
while(bp<bp_right){
- if(cp[0]){
- if(bp>bp_left)
- loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp_left>bp_begin)
- loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp+1<bp_right && !cp[1])
- loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
- if(bp+h<bp_end && !cp[h])
- loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
- }
- bp++;
- cp++;
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp_left>bp_begin)
+ loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
+ }
+ bp++;
+ cp++;
}
}
}
Modified: trunk/theora/lib/enc/x86_32/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dsp_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/dsp_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -50,12 +50,12 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
SUB_LOOP
SUB_LOOP
SUB_LOOP
@@ -68,7 +68,7 @@
"+r" (ReconPtr),
"+r" (DctInputPtr)
: "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
+ "m" (ReconPixelsPerLine)
: "memory"
);
}
@@ -86,16 +86,16 @@
" movq %%mm2, 8(%1) \n\t" /* write answer out */ \
/* Increment pointers */ \
" add $16, %1 \n\t" \
- " add %2, %0 \n\t"
+ " add %2, %0 \n\t"
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
+ ogg_uint32_t PixelsPerLine)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
SUB_128_LOOP
SUB_128_LOOP
@@ -140,18 +140,18 @@
" add $16, %3 \n\t" \
" add %4, %0 \n\t" \
" add %5, %1 \n\t" \
- " add %5, %2 \n\t"
+ " add %5, %2 \n\t"
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
SUB_AVG2_LOOP
SUB_AVG2_LOOP
SUB_AVG2_LOOP
@@ -165,7 +165,7 @@
"+r" (ReconPtr2),
"+r" (DctInputPtr)
: "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
+ "m" (ReconPixelsPerLine)
: "memory"
);
}
@@ -177,15 +177,15 @@
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
@@ -194,7 +194,7 @@
" movq %%mm0, %%mm2 \n\t"
" movq %%mm1, %%mm3 \n\t"
- " psrlq $32, %%mm2 \n\t" /* fold and add */
+ " psrlq $32, %%mm2 \n\t" /* fold and add */
" psrlq $32, %%mm3 \n\t"
" paddw %%mm2, %%mm0 \n\t"
" paddw %%mm3, %%mm1 \n\t"
@@ -206,13 +206,13 @@
" paddw %%mm3, %%mm1 \n\t"
" psubusw %%mm0, %%mm1 \n\t"
- " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
+ " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
" movd %%mm1, %0 \n\t"
" andl $0xffff, %0 \n\t"
: "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
:
: "memory"
);
@@ -220,80 +220,80 @@
}
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 1b \n\t"
- " mov $4, %%edi \n\t" /* 4 rows */
+ " mov $4, %%edi \n\t" /* 4 rows */
"2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 2b \n\t"
" psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
- " psubusw %%mm4, %%mm5 \n\t"
- " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
- " psubusw %%mm5, %%mm7 \n\t"
- " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
+ " psubusw %%mm4, %%mm5 \n\t"
+ " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
+ " psubusw %%mm5, %%mm7 \n\t"
+ " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
" movq %%mm7, %%mm6 \n\t"
" psrlq $32, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
" movq %%mm7, %%mm6 \n\t"
" psrlq $16, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
" movd %%mm7, %0 \n\t"
" andl $0xffff, %0 \n\t"
: "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
: "r" (stride)
: "memory", "edi"
);
@@ -302,29 +302,29 @@
}
#define SAD_LOOP \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" movq %%mm0, %%mm2 \n\t" \
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
" por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \
" movq %%mm0, %%mm1 \n\t" \
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
- " add %3, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+ unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
SAD_LOOP
SAD_LOOP
SAD_LOOP
@@ -343,8 +343,8 @@
" andl $0xffff, %0 \n\t"
: "=m" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" (stride1),
"r" (stride2)
: "memory"
@@ -354,33 +354,33 @@
}
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
+ " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
" paddb %%mm5, %%mm5 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- " mov $8, %%edi \n\t" /* 8 rows */
+
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " mov $8, %%edi \n\t" /* 8 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm2 \n\t"
- " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
+ " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
" movq %%mm2, %%mm1 \n\t"
" pand %%mm3, %%mm1 \n\t"
" pxor %%mm2, %%mm3 \n\t"
@@ -390,18 +390,18 @@
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %5, %2 \n\t" /* Inc pointer into ref data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -416,9 +416,9 @@
" andl $0xffff, %0 \n\t"
: "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -440,7 +440,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
" punpcklbw %%mm6, %%mm0 \n\t"
@@ -451,11 +451,11 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %3, %2 \n\t" /* Inc pointer into src data */
+ " add %3, %2 \n\t" /* Inc pointer into src data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -477,7 +477,7 @@
: "=r" (XSum),
"=r" (XXSum),
- "+r" (DataPtr)
+ "+r" (DataPtr)
: "r" (Stride)
: "edi", "memory"
);
@@ -487,7 +487,7 @@
}
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
@@ -500,7 +500,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
" movq %%mm1, %%mm3 \n\t"
@@ -518,12 +518,12 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %2 \n\t" /* Inc pointer into src data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -545,8 +545,8 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr)
+ "+r" (SrcData),
+ "+r" (RefDataPtr)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -557,8 +557,8 @@
}
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
@@ -566,17 +566,17 @@
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
+ " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
" paddb %%mm4, %%mm4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
+ " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
" movq %%mm2, %%mm1 \n\t"
" pand %%mm3, %%mm1 \n\t"
" pxor %%mm2, %%mm3 \n\t"
@@ -600,13 +600,13 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -628,9 +628,9 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
+ "+r" (SrcData),
"+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -649,7 +649,6 @@
void dsp_mmx_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
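
For orientation, col_sad8x8__mmx above accumulates per-column absolute-difference sums separately for the top and bottom halves of the 8x8 block and returns the largest of the sixteen partial sums. A plain-C rendering of that behaviour, inferred from the asm comments (sketch only, not code from the patch):

  static ogg_uint32_t col_sad8x8_c(const unsigned char *src1,const unsigned char *src2,
   ogg_uint32_t stride){
    ogg_uint32_t sad[2][8]={{0}};
    ogg_uint32_t max_sad=0;
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        int d=src1[j]-src2[j];
        sad[i>>2][j]+=d<0?-d:d;  /* rows 0-3 and rows 4-7 kept separate */
      }
      src1+=stride;
      src2+=stride;
    }
    for(i=0;i<2;i++)for(j=0;j<8;j++)if(sad[i][j]>max_sad)max_sad=sad[i][j];
    return max_sad;
  }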
Modified: trunk/theora/lib/enc/x86_32/dsp_mmxext.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dsp_mmxext.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/dsp_mmxext.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -23,23 +23,23 @@
#if defined(USE_ASM)
#define SAD_MMXEXT_LOOP \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" psadbw %%mm1, %%mm0 \n\t" \
- " add %3, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+ unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
SAD_MMXEXT_LOOP
SAD_MMXEXT_LOOP
SAD_MMXEXT_LOOP
@@ -48,15 +48,15 @@
SAD_MMXEXT_LOOP
SAD_MMXEXT_LOOP
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" (stride1),
"r" (stride2)
: "memory"
@@ -66,23 +66,23 @@
}
#define SAD_TRES_LOOP \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" psadbw %%mm1, %%mm0 \n\t" \
- " add %3, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
SAD_TRES_LOOP
SAD_TRES_LOOP
@@ -96,8 +96,8 @@
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" (stride1),
"r" (stride2)
: "memory"
@@ -107,28 +107,28 @@
}
#define SAD_XY2_TRES \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" movq (%3), %%mm2 \n\t" \
" pavgb %%mm2, %%mm1 \n\t" \
" psadbw %%mm1, %%mm0 \n\t" \
\
- " add %4, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " add %5, %2 \n\t" /* Inc pointer into ref data */ \
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %5, %2 \n\t" /* Inc pointer into ref data */ \
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
SAD_XY2_TRES
SAD_XY2_TRES
SAD_XY2_TRES
@@ -140,9 +140,9 @@
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "memory"
@@ -150,7 +150,7 @@
return DiffVal;
}
-
+
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
ogg_uint32_t MaxSad;
@@ -170,8 +170,8 @@
" andl $0xffff, %0 \n\t"
: "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
:
: "memory"
);
@@ -180,56 +180,56 @@
}
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 1b \n\t"
- " mov $4, %%edi \n\t" /* 4 rows */
+ " mov $4, %%edi \n\t" /* 4 rows */
"2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 2b \n\t"
@@ -247,8 +247,8 @@
" andl $0xffff, %0 \n\t"
: "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
: "r" (stride)
: "memory", "edi"
);
@@ -257,8 +257,8 @@
}
static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
@@ -272,10 +272,10 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
" pavgb %%mm2, %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
@@ -294,13 +294,13 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -322,9 +322,9 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
+ "+r" (SrcData),
"+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -336,7 +336,6 @@
void dsp_mmxext_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmxext dsp functions.\n");
funcs->row_sad8 = row_sad8__mmxext;
funcs->col_sad8x8 = col_sad8x8__mmxext;
funcs->sad8x8 = sad8x8__mmxext;
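
The MMXEXT variants above lean on psadbw, which sums the absolute differences of eight byte pairs in a single instruction; each SAD_MMXEXT_LOOP iteration is therefore equivalent to roughly this per-row C (sketch):

  for(j=0;j<8;j++){
    int d=ptr1[j]-ptr2[j];
    DiffVal+=d<0?-d:d;
  }
  ptr1+=stride1;
  ptr2+=stride2;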
Modified: trunk/theora/lib/enc/x86_32/fdct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/fdct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/fdct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -59,7 +59,7 @@
" psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
" movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
" paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
\
" psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
/* ------------------------------------------------------------------- */ \
@@ -85,7 +85,7 @@
" pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
\
" psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
" paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
\
" movq %%mm3," #ip0 " \n\t" \
@@ -136,16 +136,16 @@
" movq %%mm1, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
" paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
\
" movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
+ " movq %%mm7, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
" paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
@@ -234,10 +234,10 @@
" paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
\
" paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
+ " movq %%mm3," #ip5 " \n\t"
#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
" movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
" movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
" movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
@@ -251,9 +251,9 @@
" movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
" punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
" movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
" movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
" movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
" punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
" punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
@@ -285,7 +285,7 @@
" movq %%mm2," #op2 " \n\t"
-/* This performs a 2D Forward DCT on an 8x8 block with short
+/* This performs a 2D Forward DCT on an 8x8 block with short
coefficients. We try to do the truncation to match the C
version. */
static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
@@ -299,23 +299,23 @@
* we will transpose the block of data to two 4x8 blocks???
*/
Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
" emms \n\t"
-
+
: "+r" (InputData),
"+r" (OutputData)
: "r" (temp),
@@ -333,7 +333,6 @@
/* install our implementation in the function table */
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
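
The macro sequence in fdct_short__mmx above implements the separable 2-D transform as transpose, 1-D pass, transpose, 1-D pass, with each transpose handled as two 4x8 halves. In outline (a sketch; transpose_8x8 and fdct_1d_8 stand in for the Transpose_mmx and Fdct_mmx macros and are not functions in this file):

  transpose_8x8(InputData,OutputData);
  for(r=0;r<8;r++)fdct_1d_8(OutputData+r*8);  /* first 1-D pass */
  transpose_8x8(OutputData,OutputData);
  for(r=0;r<8;r++)fdct_1d_8(OutputData+r*8);  /* second 1-D pass */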
Modified: trunk/theora/lib/enc/x86_32/idct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/idct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/idct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -401,7 +401,7 @@
" paddsw "r0","r0"\n" \
" movq "r1","I(1)"\n" /* save R1 */ \
" paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
- "#end RowIDCT" \
+ "#end RowIDCT" \
);
// end RowIDCT macro (8 + 38 = 46 cycles)
@@ -465,7 +465,7 @@
" movq "r5","J(5)"\n" /* store NR5 at J5 */ \
" movq "r7","J(7)"\n" /* store NR7 at J7 */ \
" movq "r0","I(0)"\n" /* store NR0 at I0 */ \
- "#end ColumnIDCT\n" \
+ "#end ColumnIDCT\n" \
);
// end ColumnIDCT macro (38 + 19 = 57 cycles)
@@ -559,7 +559,7 @@
" movq "r4","I(3)"\n" \
\
" movq "r2","I(2)"\n" \
- "#end Transpose\n" \
+ "#end Transpose\n" \
);
// end Transpose macro (19 cycles).
@@ -1013,7 +1013,7 @@
" paddsw "r0","r0"\n" \
" movq "r1","I(1)"\n" /* save R1 */ \
" paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
- "#end RowIDCT_10\n" \
+ "#end RowIDCT_10\n" \
);
// end RowIDCT macro (8 + 38 = 46 cycles)
@@ -1060,7 +1060,7 @@
" movq "r7","J(7)"\n" /* store NR7 at J7 */ \
\
" movq "r0","I(0)"\n" /* store NR0 at I0 */ \
- "#end ColumnIDCT_10\n" \
+ "#end ColumnIDCT_10\n" \
);
// end ColumnIDCT macro (38 + 19 = 57 cycles)
/* --------------------------------------------------------------- */
@@ -1389,7 +1389,7 @@
);
ASM(
- "movq (%eax), "r0"\n"
+ "movq (%eax), "r0"\n"
"pmullw (%esi), "r0"\n" /* r0 = 03 02 01 00 */
"movq "M(0)", "r2"\n" /* r2 = __ __ __ FF */
"movq "r0", "r3"\n" /* r3 = 03 02 01 00 */
@@ -1444,7 +1444,6 @@
/* install our implementation in the function table */
void dsp_mmx_idct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx idct functions.\n");
funcs->IDctSlow = IDctSlow__mmx;
funcs->IDct10 = IDct10__mmx;
funcs->IDct3 = IDct3__mmx;
Modified: trunk/theora/lib/enc/x86_32/recon_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/recon_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/recon_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -22,8 +22,8 @@
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
+ unsigned char *dest,
+ unsigned int stride)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -35,14 +35,14 @@
" movq (%1, %2, 2), %%mm2 \n\t"
" movq (%1, %%edi), %%mm3 \n\t"
- " lea (%1, %2, 4), %1 \n\t"
+ " lea (%1, %2, 4), %1 \n\t"
" movq %%mm0, (%0) \n\t"
" movq %%mm1, (%0, %2) \n\t"
" movq %%mm2, (%0, %2, 2) \n\t"
" movq %%mm3, (%0, %%edi) \n\t"
- " lea (%0, %2, 4), %0 \n\t"
+ " lea (%0, %2, 4), %0 \n\t"
" movq (%1), %%mm0 \n\t"
" movq (%1, %2), %%mm1 \n\t"
@@ -61,7 +61,7 @@
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -69,11 +69,11 @@
" movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
" lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
+ "1: \n\t"
" movq (%1), %%mm2 \n\t" /* First four input values */
" packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t"
+ " por %%mm0, %%mm0 \n\t"
" pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
" lea 16(%1), %1 \n\t" /* Step source buffer */
" cmp %%edi, %1 \n\t" /* are we done */
@@ -91,7 +91,7 @@
}
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -127,8 +127,8 @@
}
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -173,7 +173,6 @@
void dsp_mmx_recon_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
funcs->recon_inter8x8 = recon_inter8x8__mmx;
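
As the comments in recon_intra8x8__mmx above note, the packsswb/pxor pair converts the signed residual to unsigned pixels; per block this is equivalent to the following plain C (sketch only):

  for(row=0;row<8;row++){
    for(col=0;col<8;col++){
      int v=ChangePtr[row*8+col];
      if(v<-128)v=-128;
      else if(v>127)v=127;  /* packsswb: signed saturation to 8 bits */
      ReconPtr[col]=(unsigned char)(v+128);  /* pxor with 0x80 == add 128 */
    }
    ReconPtr+=LineStep;
  }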
Modified: trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -32,7 +32,7 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
//Make non-zero to use the C-version
@@ -59,192 +59,192 @@
__asm {
align 16
- pxor mm7, mm7
+ pxor mm7, mm7
mov eax, FiltPtr
mov ebx, ReconPtr
mov edx, DctInputPtr
- /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/
-
+ /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/
+
/* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
-
+
};
-
+
#endif
}
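[Annotation: the unrolled MSVC block above performs the same operation as the disabled C branch of sub8x8__mmx: each of the eight rows of FiltPtr and ReconPtr is widened from 8 to 16 bits and their difference is written to DctInputPtr as the DCT input residual. A minimal C sketch of that operation, using the parameter names from the signature; the _ref suffix and loop form are illustrative only and not part of this patch (ogg_int16_t/ogg_uint32_t come from the headers already included by this file):

  static void sub8x8_ref(unsigned char *FiltPtr,unsigned char *ReconPtr,
   ogg_int16_t *DctInputPtr,ogg_uint32_t PixelsPerLine,
   ogg_uint32_t ReconPixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      /*Widen to 16 bits and subtract the predictor row from the source row.*/
      for(j=0;j<8;j++){
        DctInputPtr[j]=(ogg_int16_t)FiltPtr[j]-(ogg_int16_t)ReconPtr[j];
      }
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr+=ReconPixelsPerLine;
    }
  }
]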
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
+ ogg_uint32_t PixelsPerLine)
{
#if 0
@@ -273,142 +273,142 @@
__asm {
align 16
- pxor mm7, mm7
+ pxor mm7, mm7
mov eax, FiltPtr
mov ebx, DctInputPtr
- movq mm1, V128
+ movq mm1, V128
- /* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 1 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 2 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 3 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 4 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 5 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 6 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 7 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 8 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
};
-
+
#endif
}
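[Annotation: sub8x8_128__mmx handles the intra case, where there is no predictor. Per the comments in the block above, each source sample has the constant 128 subtracted (via the V128 constant) before being stored as a 16-bit DCT input. A hedged C sketch under that reading; the _ref name is illustrative only:

  static void sub8x8_128_ref(unsigned char *FiltPtr,ogg_int16_t *DctInputPtr,
   ogg_uint32_t PixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      /*Intra blocks have no predictor; recenter the samples around zero.*/
      for(j=0;j<8;j++)DctInputPtr[j]=(ogg_int16_t)FiltPtr[j]-128;
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
    }
  }
]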
@@ -418,7 +418,7 @@
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
#if 0
@@ -453,251 +453,251 @@
mov ecx, ReconPtr2
mov edx, DctInputPtr
- /* ITERATION 1 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
+ /* ITERATION 1 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 2 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 2 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 3 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 3 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 4 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 4 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 5 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 5 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 6 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 6 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 7 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 7 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 8 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 8 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
};
-
+
#endif
}
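[Annotation: sub8x8avg2__mmx differs from sub8x8__mmx only in that the predictor is the average of two reconstruction buffers (the half-pel case). Note that the asm averages with paddw followed by psrlw 1, so it truncates rather than rounds. A C sketch under that reading; the _ref name is illustrative only:

  static void sub8x8avg2_ref(unsigned char *FiltPtr,unsigned char *ReconPtr1,
   unsigned char *ReconPtr2,ogg_int16_t *DctInputPtr,
   ogg_uint32_t PixelsPerLine,ogg_uint32_t ReconPixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        /*Truncating average of the two predictors, as in the psrlw-by-1 above.*/
        int pred=(ReconPtr1[j]+ReconPtr2[j])>>1;
        DctInputPtr[j]=(ogg_int16_t)FiltPtr[j]-(ogg_int16_t)pred;
      }
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr1+=ReconPixelsPerLine;
      ReconPtr2+=ReconPixelsPerLine;
    }
  }
]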
@@ -708,15 +708,15 @@
ogg_uint32_t SadValue;
ogg_uint32_t SadValue1;
- SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
- DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
- DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
- DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+ SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
+ DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+ DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+ DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
- SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
- DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
- DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
- DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
+ DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+ DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+ DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
@@ -725,54 +725,54 @@
#else
ogg_uint32_t MaxSad;
-
+
__asm {
align 16
mov ebx, Src1
mov ecx, Src2
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* zero out mm7 for unpack */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ;
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* zero out mm7 for unpack */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ;
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */
- punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */
+ punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */
+ punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */
- movq mm2, mm0 ;
- movq mm3, mm1 ;
- psrlq mm2, 32 ; /* fold and add */
- psrlq mm3, 32 ;
- paddw mm0, mm2 ;
- paddw mm1, mm3 ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
- psrlq mm2, 16 ;
- psrlq mm3, 16 ;
- paddw mm0, mm2 ;
- paddw mm1, mm3 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+ psrlq mm2, 32 ; /* fold and add */
+ psrlq mm3, 32 ;
+ paddw mm0, mm2 ;
+ paddw mm1, mm3 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+ psrlq mm2, 16 ;
+ psrlq mm3, 16 ;
+ paddw mm0, mm2 ;
+ paddw mm1, mm3 ;
- psubusw mm1, mm0 ;
- paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */
- movd eax, mm1 ;
+ psubusw mm1, mm0 ;
+ paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */
+ movd eax, mm1 ;
and eax, 0xffff
mov MaxSad, eax
};
return MaxSad;
-
-
-
-
+
+
+
+
#endif
}
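[Annotation: the psubusw/paddw pair near the end of the block above is the standard MMX substitute for an unsigned 16-bit maximum, since plain MMX has no pmaxuw: saturating subtraction clamps a-b at zero, so adding b back gives max(a,b) in every lane. A scalar illustration of the identity, not part of the patch:

  static unsigned short max_u16(unsigned short a,unsigned short b){
    /*psubusw: unsigned saturating subtract, clamped at zero.*/
    unsigned short d=a>b?(unsigned short)(a-b):0;
    /*paddw: adding b back yields max(a,b) in both cases.*/
    return (unsigned short)(d+b);
  }
]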
@@ -780,7 +780,7 @@
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
#if 0
@@ -798,7 +798,7 @@
SadValue[5] += abs(Src1[5] - Src2[5]);
SadValue[6] += abs(Src1[6] - Src2[6]);
SadValue[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
@@ -812,18 +812,18 @@
SadValue2[5] += abs(Src1[5] - Src2[5]);
SadValue2[6] += abs(Src1[6] - Src2[6]);
SadValue2[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
-
+
for ( i = 0; i < 8; i++ ){
if ( SadValue[i] > MaxSad )
MaxSad = SadValue[i];
if ( SadValue2[i] > MaxSad )
MaxSad = SadValue2[i];
}
-
+
return MaxSad;
#else
ogg_uint32_t MaxSad;
@@ -834,69 +834,69 @@
mov ebx, Src1
mov ecx, Src2
- pxor mm3, mm3 ; /* zero out mm3 for unpack */
- pxor mm4, mm4 ; /* mm4 low sum */
- pxor mm5, mm5 ; /* mm5 high sum */
- pxor mm6, mm6 ; /* mm6 low sum */
- pxor mm7, mm7 ; /* mm7 high sum */
- mov edi, 4 ; /* 4 rows */
- label_1: ;
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ; /* take 8 bytes */
+ pxor mm3, mm3 ; /* zero out mm3 for unpack */
+ pxor mm4, mm4 ; /* mm4 low sum */
+ pxor mm5, mm5 ; /* mm5 high sum */
+ pxor mm6, mm6 ; /* mm6 low sum */
+ pxor mm7, mm7 ; /* mm7 high sum */
+ mov edi, 4 ; /* 4 rows */
+ label_1: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
- paddw mm4, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
- paddw mm5, mm1 ; /* accumulate difference... */
- add ebx, stride ; /* Inc pointer into the new data */
- add ecx, stride ; /* Inc pointer into the new data */
+ punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
+ paddw mm4, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
+ paddw mm5, mm1 ; /* accumulate difference... */
+ add ebx, stride ; /* Inc pointer into the new data */
+ add ecx, stride ; /* Inc pointer into the new data */
- dec edi ;
- jnz label_1 ;
+ dec edi ;
+ jnz label_1 ;
- mov edi, 4 ; /* 4 rows */
- label_2: ;
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ; /* take 8 bytes */
+ mov edi, 4 ; /* 4 rows */
+ label_2: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
- paddw mm6, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
- paddw mm7, mm1 ; /* accumulate difference... */
- add ebx, stride ; /* Inc pointer into the new data */
- add ecx, stride ; /* Inc pointer into the new data */
+ punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
+ paddw mm6, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add ebx, stride ; /* Inc pointer into the new data */
+ add ecx, stride ; /* Inc pointer into the new data */
- dec edi ;
- jnz label_2 ;
+ dec edi ;
+ jnz label_2 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
- psubusw mm5, mm4 ;
- paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
- psubusw mm7, mm5 ;
- paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
- movq mm6, mm7 ;
- psrlq mm6, 32 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
- movq mm6, mm7 ;
- psrlq mm6, 16 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
- movd eax, mm7 ;
- and eax, 0xffff ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
+ psubusw mm5, mm4 ;
+ paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
+ psubusw mm7, mm5 ;
+ paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
+ movq mm6, mm7 ;
+ psrlq mm6, 32 ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
+ movq mm6, mm7 ;
+ psrlq mm6, 16 ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
+ movd eax, mm7 ;
+ and eax, 0xffff ;
mov MaxSad, eax
};
@@ -908,7 +908,7 @@
}
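[Annotation: col_sad8x8__mmx accumulates per-column absolute differences in 16-bit lanes (mm4/mm5 for the first four rows, mm6/mm7 for the last four), takes the lane-wise maximum with the same saturating-subtract trick shown above, and then folds the four surviving lanes to one with psrlq 32 and psrlq 16. A scalar sketch of that final fold, assuming the four lanes have been extracted into an array; illustrative only:

  static unsigned short fold_max4(const unsigned short lane[4]){
    /*Mirrors the psrlq 32 / psrlq 16 reduction tree in the block above.*/
    unsigned short a=lane[0]>lane[2]?lane[0]:lane[2];
    unsigned short b=lane[1]>lane[3]?lane[1]:lane[3];
    return a>b?a:b;
  }
]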
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+ unsigned char *ptr2, ogg_uint32_t stride2)
{
#if 0
@@ -940,177 +940,177 @@
mov ebx, ptr1
mov edx, ptr2
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
-
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* mm7 contains the result */
+
; /* ITERATION 1 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 2 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 3 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 4 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 5 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 6 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 7 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 8 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ------ */
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddw mm7, mm0 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 16 ;
+ paddw mm7, mm0 ;
+ movd eax, mm7 ;
+ and eax, 0xffff ;
mov DiffVal, eax
};
return DiffVal;
-
+
#endif
}
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
#if 0
ogg_uint32_t i;
@@ -1142,9 +1142,9 @@
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
{
#if 0
ogg_uint32_t i;
@@ -1181,58 +1181,58 @@
mov edx, RefDataPtr2
- pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
- paddb mm5, mm5 ;
- ;
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
- mov edi, 8 ; /* 8 rows */
- loop_start: ;
- movq mm0, [ebx] ; /* take 8 bytes */
+ pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
+ paddb mm5, mm5 ;
+ ;
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* mm7 contains the result */
+ mov edi, 8 ; /* 8 rows */
+ loop_start: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
- movq mm2, [ecx] ;
- movq mm3, [edx] ; /* take average of mm2 and mm3 */
- movq mm1, mm2 ;
- pand mm1, mm3 ;
- pxor mm3, mm2 ;
- pand mm3, mm5 ;
- psrlq mm3, 1 ;
- paddb mm1, mm3 ;
+ movq mm2, [ecx] ;
+ movq mm3, [edx] ; /* take average of mm2 and mm3 */
+ movq mm1, mm2 ;
+ pand mm1, mm3 ;
+ pxor mm3, mm2 ;
+ pand mm3, mm5 ;
+ psrlq mm3, 1 ;
+ paddb mm1, mm3 ;
- movq mm2, mm0 ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, SrcStride ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add ecx, RefStride ; /* Inc pointer into ref data */
- add edx, RefStride ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, SrcStride ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add ecx, RefStride ; /* Inc pointer into ref data */
+ add edx, RefStride ; /* Inc pointer into ref data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddw mm7, mm0 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 16 ;
+ paddw mm7, mm0 ;
+ movd eax, mm7 ;
+ and eax, 0xffff ;
mov DiffVal, eax
};
return DiffVal;
-
+
#endif
}
@@ -1277,45 +1277,45 @@
mov ecx, DataPtr
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start:
- movq mm0, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start:
+ movq mm0, [ecx] ; /* take 8 bytes */
+ movq mm2, mm0 ;
- punpcklbw mm0, mm6 ;
- punpckhbw mm2, mm6 ;
+ punpcklbw mm0, mm6 ;
+ punpckhbw mm2, mm6 ;
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
- add ecx, Stride ; /* Inc pointer into src data */
+ add ecx, Stride ; /* Inc pointer into src data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov eax, edi ;
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov eax, edi ;
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd ebx, mm7 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd ebx, mm7 ;
mov XSum, eax
mov XXSum, ebx;
@@ -1324,13 +1324,13 @@
/* Compute population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ) );
-
+
#endif
}
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
#if 0
@@ -1355,23 +1355,23 @@
DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
/* Step to next row of block. */
SrcData += SrcStride;
RefDataPtr += RefStride;
@@ -1390,53 +1390,53 @@
mov ecx, SrcData
mov edx, RefDataPtr
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start: ;
- movq mm0, [ecx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start: ;
+ movq mm0, [ecx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
- punpcklbw mm0, mm6 ;
- punpcklbw mm1, mm6 ;
- punpckhbw mm2, mm6 ;
- punpckhbw mm3, mm6 ;
+ punpcklbw mm0, mm6 ;
+ punpcklbw mm1, mm6 ;
+ punpckhbw mm2, mm6 ;
+ punpckhbw mm3, mm6 ;
- psubsw mm0, mm1 ;
- psubsw mm2, mm3 ;
+ psubsw mm0, mm1 ;
+ psubsw mm2, mm3 ;
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
- add ecx, SrcStride ; /* Inc pointer into src data */
- add edx, RefStride ; /* Inc pointer into ref data */
+ add ecx, SrcStride ; /* Inc pointer into src data */
+ add edx, RefStride ; /* Inc pointer into ref data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov eax, edi ;
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov eax, edi ;
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd ebx, mm7 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd ebx, mm7 ;
mov XSum, eax
mov XXSum, ebx
@@ -1446,13 +1446,13 @@
/* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
-
+
#endif
}
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
#if 0
ogg_uint32_t i;
@@ -1512,65 +1512,65 @@
mov ecx, RefDataPtr1
mov edx, RefDataPtr2
- pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
- paddb mm4, mm4 ;
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start: ;
- movq mm0, [ebx] ; /* take 8 bytes */
+ pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
+ paddb mm4, mm4 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
- movq mm2, [ecx] ;
- movq mm3, [edx] ; /* take average of mm2 and mm3 */
- movq mm1, mm2 ;
- pand mm1, mm3 ;
- pxor mm3, mm2 ;
- pand mm3, mm4 ;
- psrlq mm3, 1 ;
- paddb mm1, mm3 ;
+ movq mm2, [ecx] ;
+ movq mm3, [edx] ; /* take average of mm2 and mm3 */
+ movq mm1, mm2 ;
+ pand mm1, mm3 ;
+ pxor mm3, mm2 ;
+ pand mm3, mm4 ;
+ psrlq mm3, 1 ;
+ paddb mm1, mm3 ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
- punpcklbw mm0, mm6 ;
- punpcklbw mm1, mm6 ;
- punpckhbw mm2, mm6 ;
- punpckhbw mm3, mm6 ;
+ punpcklbw mm0, mm6 ;
+ punpcklbw mm1, mm6 ;
+ punpckhbw mm2, mm6 ;
+ punpckhbw mm3, mm6 ;
- psubsw mm0, mm1 ;
- psubsw mm2, mm3 ;
+ psubsw mm0, mm1 ;
+ psubsw mm2, mm3 ;
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
- add ebx, SrcStride ; /* Inc pointer into src data */
- add ecx, RefStride ; /* Inc pointer into ref data */
- add edx, RefStride ; /* Inc pointer into ref data */
+ add ebx, SrcStride ; /* Inc pointer into src data */
+ add ecx, RefStride ; /* Inc pointer into ref data */
+ add edx, RefStride ; /* Inc pointer into ref data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */
};
return (( (XXSum<<6) - XSum*XSum ));
@@ -1589,7 +1589,6 @@
void dsp_mmx_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
Modified: trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -35,51 +35,51 @@
mov edx, OutputData2
- movq mm0, [eax] ; /* mm0 = a0 a1 a2 a3 */
- movq mm4, [ebx] ; /* mm4 = e4 e5 e6 e7 */
- movq mm1, [16 + eax] ; /* mm1 = b0 b1 b2 b3 */
- movq mm5, [16 + ebx] ; /* mm5 = f4 f5 f6 f7 */
- movq mm2, [32 + eax] ; /* mm2 = c0 c1 c2 c3 */
- movq mm6, [32 + ebx] ; /* mm6 = g4 g5 g6 g7 */
- movq mm3, [48 + eax] ; /* mm3 = d0 d1 d2 d3 */
- movq [16 + ecx], mm1 ; /* save b0 b1 b2 b3 */
- movq mm7, [48 + ebx] ; /* mm7 = h0 h1 h2 h3 */
- ; /* Transpose 2x8 block */
- movq mm1, mm4 ; /* mm1 = e3 e2 e1 e0 */
- punpcklwd mm4, mm5 ; /* mm4 = f1 e1 f0 e0 */
- movq [ecx], mm0 ; /* save a3 a2 a1 a0 */
- punpckhwd mm1, mm5 ; /* mm1 = f3 e3 f2 e2 */
- movq mm0, mm6 ; /* mm0 = g3 g2 g1 g0 */
- punpcklwd mm6, mm7 ; /* mm6 = h1 g1 h0 g0 */
- movq mm5, mm4 ; /* mm5 = f1 e1 f0 e0 */
- punpckldq mm4, mm6 ; /* mm4 = h0 g0 f0 e0 = MM4 */
- punpckhdq mm5, mm6 ; /* mm5 = h1 g1 f1 e1 = MM5 */
- movq mm6, mm1 ; /* mm6 = f3 e3 f2 e2 */
- movq [edx], mm4 ;
- punpckhwd mm0, mm7 ; /* mm0 = h3 g3 h2 g2 */
- movq [16 + edx], mm5 ;
- punpckhdq mm6, mm0 ; /* mm6 = h3 g3 f3 e3 = MM7 */
- movq mm4, [ecx] ; /* mm4 = a3 a2 a1 a0 */
- punpckldq mm1, mm0 ; /* mm1 = h2 g2 f2 e2 = MM6 */
- movq mm5, [16 + ecx] ; /* mm5 = b3 b2 b1 b0 */
- movq mm0, mm4 ; /* mm0 = a3 a2 a1 a0 */
- movq [48 + edx], mm6 ;
- punpcklwd mm0, mm5 ; /* mm0 = b1 a1 b0 a0 */
- movq [32 + edx], mm1 ;
- punpckhwd mm4, mm5 ; /* mm4 = b3 a3 b2 a2 */
- movq mm5, mm2 ; /* mm5 = c3 c2 c1 c0 */
- punpcklwd mm2, mm3 ; /* mm2 = d1 c1 d0 c0 */
- movq mm1, mm0 ; /* mm1 = b1 a1 b0 a0 */
- punpckldq mm0, mm2 ; /* mm0 = d0 c0 b0 a0 = MM0 */
- punpckhdq mm1, mm2 ; /* mm1 = d1 c1 b1 a1 = MM1 */
- movq mm2, mm4 ; /* mm2 = b3 a3 b2 a2 */
- movq [ecx], mm0 ;
- punpckhwd mm5, mm3 ; /* mm5 = d3 c3 d2 c2 */
- movq [16 + ecx], mm1 ;
- punpckhdq mm4, mm5 ; /* mm4 = d3 c3 b3 a3 = MM3 */
- punpckldq mm2, mm5 ; /* mm2 = d2 c2 b2 a2 = MM2 */
- movq [48 + ecx], mm4 ;
- movq [32 + ecx], mm2 ;
+ movq mm0, [eax] ; /* mm0 = a0 a1 a2 a3 */
+ movq mm4, [ebx] ; /* mm4 = e4 e5 e6 e7 */
+ movq mm1, [16 + eax] ; /* mm1 = b0 b1 b2 b3 */
+ movq mm5, [16 + ebx] ; /* mm5 = f4 f5 f6 f7 */
+ movq mm2, [32 + eax] ; /* mm2 = c0 c1 c2 c3 */
+ movq mm6, [32 + ebx] ; /* mm6 = g4 g5 g6 g7 */
+ movq mm3, [48 + eax] ; /* mm3 = d0 d1 d2 d3 */
+ movq [16 + ecx], mm1 ; /* save b0 b1 b2 b3 */
+ movq mm7, [48 + ebx] ; /* mm7 = h0 h1 h2 h3 */
+ ; /* Transpose 2x8 block */
+ movq mm1, mm4 ; /* mm1 = e3 e2 e1 e0 */
+ punpcklwd mm4, mm5 ; /* mm4 = f1 e1 f0 e0 */
+ movq [ecx], mm0 ; /* save a3 a2 a1 a0 */
+ punpckhwd mm1, mm5 ; /* mm1 = f3 e3 f2 e2 */
+ movq mm0, mm6 ; /* mm0 = g3 g2 g1 g0 */
+ punpcklwd mm6, mm7 ; /* mm6 = h1 g1 h0 g0 */
+ movq mm5, mm4 ; /* mm5 = f1 e1 f0 e0 */
+ punpckldq mm4, mm6 ; /* mm4 = h0 g0 f0 e0 = MM4 */
+ punpckhdq mm5, mm6 ; /* mm5 = h1 g1 f1 e1 = MM5 */
+ movq mm6, mm1 ; /* mm6 = f3 e3 f2 e2 */
+ movq [edx], mm4 ;
+ punpckhwd mm0, mm7 ; /* mm0 = h3 g3 h2 g2 */
+ movq [16 + edx], mm5 ;
+ punpckhdq mm6, mm0 ; /* mm6 = h3 g3 f3 e3 = MM7 */
+ movq mm4, [ecx] ; /* mm4 = a3 a2 a1 a0 */
+ punpckldq mm1, mm0 ; /* mm1 = h2 g2 f2 e2 = MM6 */
+ movq mm5, [16 + ecx] ; /* mm5 = b3 b2 b1 b0 */
+ movq mm0, mm4 ; /* mm0 = a3 a2 a1 a0 */
+ movq [48 + edx], mm6 ;
+ punpcklwd mm0, mm5 ; /* mm0 = b1 a1 b0 a0 */
+ movq [32 + edx], mm1 ;
+ punpckhwd mm4, mm5 ; /* mm4 = b3 a3 b2 a2 */
+ movq mm5, mm2 ; /* mm5 = c3 c2 c1 c0 */
+ punpcklwd mm2, mm3 ; /* mm2 = d1 c1 d0 c0 */
+ movq mm1, mm0 ; /* mm1 = b1 a1 b0 a0 */
+ punpckldq mm0, mm2 ; /* mm0 = d0 c0 b0 a0 = MM0 */
+ punpckhdq mm1, mm2 ; /* mm1 = d1 c1 b1 a1 = MM1 */
+ movq mm2, mm4 ; /* mm2 = b3 a3 b2 a2 */
+ movq [ecx], mm0 ;
+ punpckhwd mm5, mm3 ; /* mm5 = d3 c3 d2 c2 */
+ movq [16 + ecx], mm1 ;
+ punpckhdq mm4, mm5 ; /* mm4 = d3 c3 b3 a3 = MM3 */
+ punpckldq mm2, mm5 ; /* mm2 = d2 c2 b2 a2 = MM2 */
+ movq [48 + ecx], mm4 ;
+ movq [32 + ecx], mm2 ;
};
@@ -96,208 +96,208 @@
mov eax, InputData1
mov ebx, InputData2
mov ecx, temp
- movq mm0, [eax] ;
- movq mm1, [16 + eax] ;
- movq mm2, [48 + eax] ;
- movq mm3, [16 + ebx] ;
- movq mm4, mm0 ;
- movq mm5, mm1 ;
- movq mm6, mm2 ;
- movq mm7, mm3 ;
- ;
- paddsw mm0, [48 + ebx] ; /* mm0 = ip0 + ip7 = is07 */
- paddsw mm1, [32 + eax] ; /* mm1 = ip1 + ip2 = is12 */
- paddsw mm2, [ebx] ; /* mm2 = ip3 + ip4 = is34 */
- paddsw mm3, [32 + ebx] ; /* mm3 = ip5 + ip6 = is56 */
- psubsw mm4, [48 + ebx] ; /* mm4 = ip0 - ip7 = id07 */
- psubsw mm5, [32 + eax] ; /* mm5 = ip1 - ip2 = id12 */
- ;
- psubsw mm0, mm2 ; /* mm0 = is07 - is34 */
- ;
- paddsw mm2, mm2 ;
- ;
- psubsw mm6, [ebx] ; /* mm6 = ip3 - ip4 = id34 */
- ;
- paddsw mm2, mm0 ; /* mm2 = is07 + is34 = is0734 */
- psubsw mm1, mm3 ; /* mm1 = is12 - is56 */
- movq [ecx], mm0 ; /* Save is07 - is34 to free mm0; */
- paddsw mm3, mm3 ;
- paddsw mm3, mm1 ; /* mm3 = is12 + 1s56 = is1256 */
- ;
- psubsw mm7, [32 + ebx] ; /* mm7 = ip5 - ip6 = id56 */
- ; /* ------------------------------------------------------------------- */
- psubsw mm5, mm7 ; /* mm5 = id12 - id56 */
- paddsw mm7, mm7 ;
- paddsw mm7, mm5 ; /* mm7 = id12 + id56 */
- ; /* ------------------------------------------------------------------- */
- psubsw mm2, mm3 ; /* mm2 = is0734 - is1256 */
- paddsw mm3, mm3 ;
- ;
- movq mm0, mm2 ; /* make a copy */
- paddsw mm3, mm2 ; /* mm3 = is0734 + is1256 */
- ;
- pmulhw mm0, xC4S4 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw mm0, mm2 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw mm2, 15 ;
- paddw mm0, mm2 ; /* Truncate mm0, now it is op[4] */
- ;
- movq mm2, mm3 ;
- movq [ebx], mm0 ; /* save ip4, now mm0,mm2 are free */
- ;
- movq mm0, mm3 ;
- pmulhw mm3, xC4S4 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
- ;
- psrlw mm2, 15 ;
- paddw mm3, mm0 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) */
- paddw mm3, mm2 ; /* Truncate mm3, now it is op[0] */
- ;
- movq [eax], mm3 ;
- ; /* ------------------------------------------------------------------- */
- movq mm3, [ecx] ; /* mm3 = irot_input_y */
- pmulhw mm3, xC2S6 ; /* mm3 = xC2S6 * irot_input_y - irot_input_y */
- ;
- movq mm2, [ecx] ;
- movq mm0, mm2 ;
- ;
- psrlw mm2, 15 ; /* mm3 = xC2S6 * irot_input_y */
- paddw mm3, mm0 ;
- ;
- paddw mm3, mm2 ; /* Truncated */
- movq mm0, mm5 ;
- ;
- movq mm2, mm5 ;
- pmulhw mm0, xC6S2 ; /* mm0 = xC6S2 * irot_input_x */
- ;
- psrlw mm2, 15 ;
- paddw mm0, mm2 ; /* Truncated */
- ;
- paddsw mm3, mm0 ; /* ip[2] */
- movq [32 + eax], mm3 ; /* Save ip2 */
- ;
- movq mm0, mm5 ;
- movq mm2, mm5 ;
- ;
- pmulhw mm5, xC2S6 ; /* mm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw mm2, 15 ;
- ;
- movq mm3, [ecx] ;
- paddw mm5, mm0 ; /* mm5 = xC2S6 * irot_input_x */
- ;
- paddw mm5, mm2 ; /* Truncated */
- movq mm2, mm3 ;
- ;
- pmulhw mm3, xC6S2 ; /* mm3 = xC6S2 * irot_input_y */
- psrlw mm2, 15 ;
- ;
- paddw mm3, mm2 ; /* Truncated */
- psubsw mm3, mm5 ;
- ;
- movq [32 + ebx], mm3 ;
- ; /* ------------------------------------------------------------------- */
- movq mm0, xC4S4 ;
- movq mm2, mm1 ;
- movq mm3, mm1 ;
- ;
- pmulhw mm1, mm0 ; /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
- psrlw mm2, 15 ;
- ;
- paddw mm1, mm3 ; /* mm0 = xC4S4 * ( is12 - is56 ) */
- paddw mm1, mm2 ; /* Truncate mm1, now it is icommon_product1 */
- ;
- movq mm2, mm7 ;
- movq mm3, mm7 ;
- ;
- pmulhw mm7, mm0 ; /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
- psrlw mm2, 15 ;
- ;
- paddw mm7, mm3 ; /* mm7 = xC4S4 * ( id12 + id56 ) */
- paddw mm7, mm2 ; /* Truncate mm7, now it is icommon_product2 */
- ; /* ------------------------------------------------------------------- */
- pxor mm0, mm0 ; /* Clear mm0 */
- psubsw mm0, mm6 ; /* mm0 = - id34 */
- ;
- psubsw mm0, mm7 ; /* mm0 = - ( id34 + idcommon_product2 ) */
- paddsw mm6, mm6 ;
- paddsw mm6, mm0 ; /* mm6 = id34 - icommon_product2 */
- ;
- psubsw mm4, mm1 ; /* mm4 = id07 - icommon_product1 */
- paddsw mm1, mm1 ;
- paddsw mm1, mm4 ; /* mm1 = id07 + icommon_product1 */
- ; /* ------------------------------------------------------------------- */
- movq mm7, xC1S7 ;
- movq mm2, mm1 ;
- ;
- movq mm3, mm1 ;
- pmulhw mm1, mm7 ; /* mm1 = xC1S7 * irot_input_x - irot_input_x */
- ;
- movq mm7, xC7S1 ;
- psrlw mm2, 15 ;
- ;
- paddw mm1, mm3 ; /* mm1 = xC1S7 * irot_input_x */
- paddw mm1, mm2 ; /* Trucated */
- ;
- pmulhw mm3, mm7 ; /* mm3 = xC7S1 * irot_input_x */
- paddw mm3, mm2 ; /* Truncated */
- ;
- movq mm5, mm0 ;
- movq mm2, mm0 ;
- ;
- movq mm7, xC1S7 ;
- pmulhw mm0, mm7 ; /* mm0 = xC1S7 * irot_input_y - irot_input_y */
- ;
- movq mm7, xC7S1 ;
- psrlw mm2, 15 ;
- ;
- paddw mm0, mm5 ; /* mm0 = xC1S7 * irot_input_y */
- paddw mm0, mm2 ; /* Truncated */
- ;
- pmulhw mm5, mm7 ; /* mm5 = xC7S1 * irot_input_y */
- paddw mm5, mm2 ; /* Truncated */
- ;
- psubsw mm1, mm5 ; /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
- paddsw mm3, mm0 ; /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */
- ;
- movq [16 + eax], mm1 ;
- movq [48 + ebx], mm3 ;
- ; /* ------------------------------------------------------------------- */
- movq mm0, xC3S5 ;
- movq mm1, xC5S3 ;
- ;
- movq mm5, mm6 ;
- movq mm7, mm6 ;
- ;
- movq mm2, mm4 ;
- movq mm3, mm4 ;
- ;
- pmulhw mm4, mm0 ; /* mm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw mm6, mm1 ; /* mm6 = xC5S3 * irot_input_y - irot_input_y */
- ;
- psrlw mm2, 15 ;
- psrlw mm5, 15 ;
- ;
- paddw mm4, mm3 ; /* mm4 = xC3S5 * irot_input_x */
- paddw mm6, mm7 ; /* mm6 = xC5S3 * irot_input_y */
- ;
- paddw mm4, mm2 ; /* Truncated */
- paddw mm6, mm5 ; /* Truncated */
- ;
- psubsw mm4, mm6 ; /* ip3 */
- movq [48 + eax], mm4 ;
- ;
- movq mm4, mm3 ;
- movq mm6, mm7 ;
- ;
- pmulhw mm3, mm1 ; /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw mm7, mm0 ; /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- ;
- paddw mm4, mm2 ;
- paddw mm6, mm5 ;
- ;
- paddw mm3, mm4 ; /* mm3 = xC5S3 * irot_input_x */
- paddw mm7, mm6 ; /* mm7 = xC3S5 * irot_input_y */
- ;
- paddw mm3, mm7 ; /* ip5 */
- movq [16 + ebx], mm3 ;
+ movq mm0, [eax] ;
+ movq mm1, [16 + eax] ;
+ movq mm2, [48 + eax] ;
+ movq mm3, [16 + ebx] ;
+ movq mm4, mm0 ;
+ movq mm5, mm1 ;
+ movq mm6, mm2 ;
+ movq mm7, mm3 ;
+ ;
+ paddsw mm0, [48 + ebx] ; /* mm0 = ip0 + ip7 = is07 */
+ paddsw mm1, [32 + eax] ; /* mm1 = ip1 + ip2 = is12 */
+ paddsw mm2, [ebx] ; /* mm2 = ip3 + ip4 = is34 */
+ paddsw mm3, [32 + ebx] ; /* mm3 = ip5 + ip6 = is56 */
+ psubsw mm4, [48 + ebx] ; /* mm4 = ip0 - ip7 = id07 */
+ psubsw mm5, [32 + eax] ; /* mm5 = ip1 - ip2 = id12 */
+ ;
+ psubsw mm0, mm2 ; /* mm0 = is07 - is34 */
+ ;
+ paddsw mm2, mm2 ;
+ ;
+ psubsw mm6, [ebx] ; /* mm6 = ip3 - ip4 = id34 */
+ ;
+ paddsw mm2, mm0 ; /* mm2 = is07 + is34 = is0734 */
+ psubsw mm1, mm3 ; /* mm1 = is12 - is56 */
+ movq [ecx], mm0 ; /* Save is07 - is34 to free mm0; */
+ paddsw mm3, mm3 ;
+ paddsw mm3, mm1 ; /* mm3 = is12 + 1s56 = is1256 */
+ ;
+ psubsw mm7, [32 + ebx] ; /* mm7 = ip5 - ip6 = id56 */
+ ; /* ------------------------------------------------------------------- */
+ psubsw mm5, mm7 ; /* mm5 = id12 - id56 */
+ paddsw mm7, mm7 ;
+ paddsw mm7, mm5 ; /* mm7 = id12 + id56 */
+ ; /* ------------------------------------------------------------------- */
+ psubsw mm2, mm3 ; /* mm2 = is0734 - is1256 */
+ paddsw mm3, mm3 ;
+ ;
+ movq mm0, mm2 ; /* make a copy */
+ paddsw mm3, mm2 ; /* mm3 = is0734 + is1256 */
+ ;
+ pmulhw mm0, xC4S4 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
+ paddw mm0, mm2 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) */
+ psrlw mm2, 15 ;
+ paddw mm0, mm2 ; /* Truncate mm0, now it is op[4] */
+ ;
+ movq mm2, mm3 ;
+ movq [ebx], mm0 ; /* save ip4, now mm0,mm2 are free */
+ ;
+ movq mm0, mm3 ;
+ pmulhw mm3, xC4S4 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
+ ;
+ psrlw mm2, 15 ;
+ paddw mm3, mm0 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) */
+ paddw mm3, mm2 ; /* Truncate mm3, now it is op[0] */
+ ;
+ movq [eax], mm3 ;
+ ; /* ------------------------------------------------------------------- */
+ movq mm3, [ecx] ; /* mm3 = irot_input_y */
+ pmulhw mm3, xC2S6 ; /* mm3 = xC2S6 * irot_input_y - irot_input_y */
+ ;
+ movq mm2, [ecx] ;
+ movq mm0, mm2 ;
+ ;
+ psrlw mm2, 15 ; /* mm3 = xC2S6 * irot_input_y */
+ paddw mm3, mm0 ;
+ ;
+ paddw mm3, mm2 ; /* Truncated */
+ movq mm0, mm5 ;
+ ;
+ movq mm2, mm5 ;
+ pmulhw mm0, xC6S2 ; /* mm0 = xC6S2 * irot_input_x */
+ ;
+ psrlw mm2, 15 ;
+ paddw mm0, mm2 ; /* Truncated */
+ ;
+ paddsw mm3, mm0 ; /* ip[2] */
+ movq [32 + eax], mm3 ; /* Save ip2 */
+ ;
+ movq mm0, mm5 ;
+ movq mm2, mm5 ;
+ ;
+ pmulhw mm5, xC2S6 ; /* mm5 = xC2S6 * irot_input_x - irot_input_x */
+ psrlw mm2, 15 ;
+ ;
+ movq mm3, [ecx] ;
+ paddw mm5, mm0 ; /* mm5 = xC2S6 * irot_input_x */
+ ;
+ paddw mm5, mm2 ; /* Truncated */
+ movq mm2, mm3 ;
+ ;
+ pmulhw mm3, xC6S2 ; /* mm3 = xC6S2 * irot_input_y */
+ psrlw mm2, 15 ;
+ ;
+ paddw mm3, mm2 ; /* Truncated */
+ psubsw mm3, mm5 ;
+ ;
+ movq [32 + ebx], mm3 ;
+ ; /* ------------------------------------------------------------------- */
+ movq mm0, xC4S4 ;
+ movq mm2, mm1 ;
+ movq mm3, mm1 ;
+ ;
+ pmulhw mm1, mm0 ; /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
+ psrlw mm2, 15 ;
+ ;
+ paddw mm1, mm3 ; /* mm0 = xC4S4 * ( is12 - is56 ) */
+ paddw mm1, mm2 ; /* Truncate mm1, now it is icommon_product1 */
+ ;
+ movq mm2, mm7 ;
+ movq mm3, mm7 ;
+ ;
+ pmulhw mm7, mm0 ; /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
+ psrlw mm2, 15 ;
+ ;
+ paddw mm7, mm3 ; /* mm7 = xC4S4 * ( id12 + id56 ) */
+ paddw mm7, mm2 ; /* Truncate mm7, now it is icommon_product2 */
+ ; /* ------------------------------------------------------------------- */
+ pxor mm0, mm0 ; /* Clear mm0 */
+ psubsw mm0, mm6 ; /* mm0 = - id34 */
+ ;
+ psubsw mm0, mm7 ; /* mm0 = - ( id34 + idcommon_product2 ) */
+ paddsw mm6, mm6 ;
+ paddsw mm6, mm0 ; /* mm6 = id34 - icommon_product2 */
+ ;
+ psubsw mm4, mm1 ; /* mm4 = id07 - icommon_product1 */
+ paddsw mm1, mm1 ;
+ paddsw mm1, mm4 ; /* mm1 = id07 + icommon_product1 */
+ ; /* ------------------------------------------------------------------- */
+ movq mm7, xC1S7 ;
+ movq mm2, mm1 ;
+ ;
+ movq mm3, mm1 ;
+ pmulhw mm1, mm7 ; /* mm1 = xC1S7 * irot_input_x - irot_input_x */
+ ;
+ movq mm7, xC7S1 ;
+ psrlw mm2, 15 ;
+ ;
+ paddw mm1, mm3 ; /* mm1 = xC1S7 * irot_input_x */
+ paddw mm1, mm2 ; /* Trucated */
+ ;
+ pmulhw mm3, mm7 ; /* mm3 = xC7S1 * irot_input_x */
+ paddw mm3, mm2 ; /* Truncated */
+ ;
+ movq mm5, mm0 ;
+ movq mm2, mm0 ;
+ ;
+ movq mm7, xC1S7 ;
+ pmulhw mm0, mm7 ; /* mm0 = xC1S7 * irot_input_y - irot_input_y */
+ ;
+ movq mm7, xC7S1 ;
+ psrlw mm2, 15 ;
+ ;
+ paddw mm0, mm5 ; /* mm0 = xC1S7 * irot_input_y */
+ paddw mm0, mm2 ; /* Truncated */
+ ;
+ pmulhw mm5, mm7 ; /* mm5 = xC7S1 * irot_input_y */
+ paddw mm5, mm2 ; /* Truncated */
+ ;
+ psubsw mm1, mm5 ; /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
+ paddsw mm3, mm0 ; /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */
+ ;
+ movq [16 + eax], mm1 ;
+ movq [48 + ebx], mm3 ;
+ ; /* ------------------------------------------------------------------- */
+ movq mm0, xC3S5 ;
+ movq mm1, xC5S3 ;
+ ;
+ movq mm5, mm6 ;
+ movq mm7, mm6 ;
+ ;
+ movq mm2, mm4 ;
+ movq mm3, mm4 ;
+ ;
+ pmulhw mm4, mm0 ; /* mm4 = xC3S5 * irot_input_x - irot_input_x */
+ pmulhw mm6, mm1 ; /* mm6 = xC5S3 * irot_input_y - irot_input_y */
+ ;
+ psrlw mm2, 15 ;
+ psrlw mm5, 15 ;
+ ;
+ paddw mm4, mm3 ; /* mm4 = xC3S5 * irot_input_x */
+ paddw mm6, mm7 ; /* mm6 = xC5S3 * irot_input_y */
+ ;
+ paddw mm4, mm2 ; /* Truncated */
+ paddw mm6, mm5 ; /* Truncated */
+ ;
+ psubsw mm4, mm6 ; /* ip3 */
+ movq [48 + eax], mm4 ;
+ ;
+ movq mm4, mm3 ;
+ movq mm6, mm7 ;
+ ;
+ pmulhw mm3, mm1 ; /* mm3 = xC5S3 * irot_input_x - irot_input_x */
+ pmulhw mm7, mm0 ; /* mm7 = xC3S5 * irot_input_y - irot_input_y */
+ ;
+ paddw mm4, mm2 ;
+ paddw mm6, mm5 ;
+ ;
+ paddw mm3, mm4 ; /* mm3 = xC5S3 * irot_input_x */
+ paddw mm7, mm6 ; /* mm7 = xC3S5 * irot_input_y */
+ ;
+ paddw mm3, mm7 ; /* ip5 */
+ movq [16 + ebx], mm3 ;
};
@@ -329,6 +329,5 @@
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
Modified: trunk/theora/lib/enc/x86_32_vs/recon_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/recon_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32_vs/recon_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -21,44 +21,44 @@
static const unsigned __int64 V128 = 0x8080808080808080;
static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
+ unsigned char *dest,
+ unsigned int stride)
{
//Is this even the fastest way to do this?
__asm {
- align 16
+ align 16
mov eax, src
mov ebx, dest
mov ecx, stride
- lea edi, [ecx + ecx * 2]
- movq mm0, [eax]
- movq mm1, [eax + ecx]
- movq mm2, [eax + ecx * 2]
- movq mm3, [eax + edi]
- lea eax, [eax + ecx * 4]
- movq [ebx], mm0
- movq [ebx + ecx], mm1
- movq [ebx + ecx * 2], mm2
- movq [ebx + edi], mm3
- lea ebx, [ebx + ecx * 4]
- movq mm0, [eax]
- movq mm1, [eax + ecx]
- movq mm2, [eax + ecx * 2]
- movq mm3, [eax + edi]
- movq [ebx], mm0
- movq [ebx + ecx], mm1
- movq [ebx + ecx * 2], mm2
- movq [ebx + edi], mm3
+ lea edi, [ecx + ecx * 2]
+ movq mm0, [eax]
+ movq mm1, [eax + ecx]
+ movq mm2, [eax + ecx * 2]
+ movq mm3, [eax + edi]
+ lea eax, [eax + ecx * 4]
+ movq [ebx], mm0
+ movq [ebx + ecx], mm1
+ movq [ebx + ecx * 2], mm2
+ movq [ebx + edi], mm3
+ lea ebx, [ebx + ecx * 4]
+ movq mm0, [eax]
+ movq mm1, [eax + ecx]
+ movq mm2, [eax + ecx * 2]
+ movq mm3, [eax + edi]
+ movq [ebx], mm0
+ movq [ebx + ecx], mm1
+ movq [ebx + ecx * 2], mm2
+ movq [ebx + edi], mm3
};
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ ogg_uint32_t LineStep)
{
__asm {
@@ -68,28 +68,28 @@
mov ebx, ChangePtr
mov ecx, LineStep
- movq mm0, V128
+ movq mm0, V128
- lea edi, [128 + ebx]
- loop_start:
- movq mm2, [ebx]
+ lea edi, [128 + ebx]
+ loop_start:
+ movq mm2, [ebx]
- packsswb mm2, [8 + ebx]
- por mm0, mm0
- pxor mm2, mm0
- lea ebx, [16 + ebx]
- cmp ebx, edi
+ packsswb mm2, [8 + ebx]
+ por mm0, mm0
+ pxor mm2, mm0
+ lea ebx, [16 + ebx]
+ cmp ebx, edi
- movq [eax], mm2
+ movq [eax], mm2
- lea eax, [eax + ecx]
- jc loop_start
+ lea eax, [eax + ecx]
+ jc loop_start
};
-
+
}
@@ -97,7 +97,7 @@
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm {
@@ -108,29 +108,29 @@
mov ebx, ChangePtr
mov ecx, LineStep
mov edx, RefPtr
-
- pxor mm0, mm0
- lea edi, [128 + ebx]
+ pxor mm0, mm0
+ lea edi, [128 + ebx]
+
loop_start:
- movq mm2, [edx]
+ movq mm2, [edx]
- movq mm4, [ebx]
- movq mm3, mm2
- movq mm5, [8 + ebx]
- punpcklbw mm2, mm0
- paddsw mm2, mm4
- punpckhbw mm3, mm0
- paddsw mm3, mm5
- add edx, ecx
- packuswb mm2, mm3
- lea ebx, [16 + ebx]
- cmp ebx, edi
+ movq mm4, [ebx]
+ movq mm3, mm2
+ movq mm5, [8 + ebx]
+ punpcklbw mm2, mm0
+ paddsw mm2, mm4
+ punpckhbw mm3, mm0
+ paddsw mm3, mm5
+ add edx, ecx
+ packuswb mm2, mm3
+ lea ebx, [16 + ebx]
+ cmp ebx, edi
- movq [eax], mm2
+ movq [eax], mm2
- lea eax, [eax + ecx]
- jc loop_start
+ lea eax, [eax + ecx]
+ jc loop_start
};
}
@@ -139,8 +139,8 @@
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
{
__asm {
align 16
@@ -149,36 +149,36 @@
mov ebx, ChangePtr
mov ecx, RefPtr1
mov edx, RefPtr2
-
- pxor mm0, mm0
- lea edi, [128 + ebx]
+ pxor mm0, mm0
+ lea edi, [128 + ebx]
+
loop_start:
- movq mm2, [ecx]
- movq mm4, [edx]
+ movq mm2, [ecx]
+ movq mm4, [edx]
- movq mm3, mm2
- punpcklbw mm2, mm0
- movq mm5, mm4
- movq mm6, [ebx]
- punpckhbw mm3, mm0
- movq mm7, [8 + ebx]
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- paddw mm2, mm4
- paddw mm3, mm5
- psrlw mm2, 1
- psrlw mm3, 1
- paddw mm2, mm6
- paddw mm3, mm7
- lea ebx, [16 + ebx]
- packuswb mm2, mm3
- add ecx, LineStep
- add edx, LineStep
- movq [eax], mm2
- add eax, LineStep
- cmp ebx, edi
- jc loop_start
+ movq mm3, mm2
+ punpcklbw mm2, mm0
+ movq mm5, mm4
+ movq mm6, [ebx]
+ punpckhbw mm3, mm0
+ movq mm7, [8 + ebx]
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ paddw mm2, mm4
+ paddw mm3, mm5
+ psrlw mm2, 1
+ psrlw mm3, 1
+ paddw mm2, mm6
+ paddw mm3, mm7
+ lea ebx, [16 + ebx]
+ packuswb mm2, mm3
+ add ecx, LineStep
+ add edx, LineStep
+ movq [eax], mm2
+ add eax, LineStep
+ cmp ebx, edi
+ jc loop_start
};
@@ -189,7 +189,6 @@
void dsp_mmx_recon_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
funcs->recon_inter8x8 = recon_inter8x8__mmx;
Modified: trunk/theora/lib/enc/x86_64/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dct_decode_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/dct_decode_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -27,7 +27,7 @@
0x0004000400040004LL;
static void loop_filter_v(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
_pix-=_ystride*2;
__asm__ __volatile__(
@@ -210,7 +210,7 @@
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
static void loop_filter_h4(unsigned char *_pix,long _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
long edi;
__asm__ __volatile__(
@@ -343,12 +343,12 @@
}
static void loop_filter_h(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
_pix-=2;
loop_filter_h4(_pix,_ystride,_ll);
loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
}
-
+
static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){
int j;
ogg_int16_t __attribute__((aligned(8))) ll[4];
@@ -359,7 +359,7 @@
ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
for ( j = 0; j < 3 ; j++){
- ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_begin = bp;
ogg_uint32_t *bp_end;
int stride;
int h;
@@ -376,23 +376,23 @@
stride = pbi->UVStride;
break;
}
-
+
while(bp<bp_end){
ogg_uint32_t *bp_left = bp;
ogg_uint32_t *bp_right = bp + h;
while(bp<bp_right){
- if(cp[0]){
- if(bp>bp_left)
- loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp_left>bp_begin)
- loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp+1<bp_right && !cp[1])
- loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
- if(bp+h<bp_end && !cp[h])
- loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
- }
- bp++;
- cp++;
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp_left>bp_begin)
+ loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
+ }
+ bp++;
+ cp++;
}
}
}
Modified: trunk/theora/lib/enc/x86_64/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dsp_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/dsp_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -32,12 +32,12 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
".rept 8 \n\t"
" movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
@@ -64,20 +64,20 @@
"+r" (ReconPtr),
"+r" (DctInputPtr)
: "r" ((ogg_uint64_t)PixelsPerLine),
- "r" ((ogg_uint64_t)ReconPixelsPerLine)
+ "r" ((ogg_uint64_t)ReconPixelsPerLine)
: "memory"
);
}
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
+ ogg_uint32_t PixelsPerLine)
{
ogg_uint64_t ppl = PixelsPerLine;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
".rept 8 \n\t"
@@ -107,12 +107,12 @@
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
".rept 8 \n\t"
" movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
@@ -149,7 +149,7 @@
"+r" (ReconPtr2),
"+r" (DctInputPtr)
: "r" ((ogg_uint64_t)PixelsPerLine),
- "r" ((ogg_uint64_t)ReconPixelsPerLine)
+ "r" ((ogg_uint64_t)ReconPixelsPerLine)
: "memory"
);
}
@@ -167,7 +167,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%rdi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
" punpcklbw %%mm6, %%mm0 \n\t"
@@ -178,11 +178,11 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %3, %2 \n\t" /* Inc pointer into src data */
+ " add %3, %2 \n\t" /* Inc pointer into src data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
@@ -204,7 +204,7 @@
: "=r" (XSum),
"=r" (XXSum),
- "+r" (DataPtr)
+ "+r" (DataPtr)
: "r" ((ogg_uint64_t)Stride)
: "rdi", "memory"
);
@@ -214,7 +214,7 @@
}
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
ogg_uint64_t XSum;
ogg_uint64_t XXSum;
@@ -227,7 +227,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%rdi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
" movq %%mm1, %%mm3 \n\t"
@@ -245,12 +245,12 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %2 \n\t" /* Inc pointer into src data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
@@ -272,8 +272,8 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr)
+ "+r" (SrcData),
+ "+r" (RefDataPtr)
: "r" ((ogg_uint64_t)SrcStride),
"r" ((ogg_uint64_t)RefStride)
: "rdi", "memory"
@@ -292,7 +292,6 @@
void dsp_mmx_init(DspFunctions *funcs)
{
- TH_DEBUG("setting accelerated x86_64 mmx dsp functions.\n");
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
Modified: trunk/theora/lib/enc/x86_64/dsp_mmxext.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dsp_mmxext.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/dsp_mmxext.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -31,26 +31,26 @@
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
".rept 7 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
".endr \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" ((ogg_uint64_t)stride1),
"r" ((ogg_uint64_t)stride2)
: "memory"
@@ -60,29 +60,29 @@
}
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
".endr \n\t"
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" ((ogg_uint64_t)stride1),
"r" ((ogg_uint64_t)stride2)
: "memory"
@@ -100,25 +100,25 @@
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" movq (%3), %%mm2 \n\t"
" pavgb %%mm2, %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %5, %2 \n\t" /* Inc pointer into ref data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
".endr \n\t"
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
: "r" ((ogg_uint64_t)SrcStride),
"r" ((ogg_uint64_t)RefStride)
: "memory"
@@ -126,7 +126,7 @@
return DiffVal;
}
-
+
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
ogg_uint32_t MaxSad;
@@ -146,8 +146,8 @@
" andl $0xffff, %0 \n\t"
: "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
:
: "memory"
);
@@ -156,56 +156,56 @@
}
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%rdi \n\t" /* 4 rows */
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%rdi \n\t" /* 4 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
- " mov $4, %%rdi \n\t" /* 4 rows */
+ " mov $4, %%rdi \n\t" /* 4 rows */
"2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%rdi \n\t"
" jnz 2b \n\t"
@@ -223,8 +223,8 @@
" andl $0xffff, %0 \n\t"
: "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
: "r" ((ogg_uint64_t)stride)
: "memory", "rdi"
);
@@ -248,10 +248,10 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%rdi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
" pavgb %%mm2, %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
@@ -270,13 +270,13 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
@@ -298,9 +298,9 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
+ "+r" (SrcData),
"+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (RefDataPtr2)
: "r" ((ogg_uint64_t)SrcStride),
"r" ((ogg_uint64_t)RefStride)
: "rdi", "memory"
@@ -312,7 +312,6 @@
void dsp_mmxext_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accerated x86_64 mmxext dsp functions.\n");
funcs->row_sad8 = row_sad8__mmxext;
funcs->col_sad8x8 = col_sad8x8__mmxext;
funcs->sad8x8 = sad8x8__mmxext;
Modified: trunk/theora/lib/enc/x86_64/fdct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/fdct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/fdct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -62,7 +62,7 @@
" psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
" movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
" paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
\
" psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
/* ------------------------------------------------------------------- */ \
@@ -88,7 +88,7 @@
" pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
\
" psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
" paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
\
" movq %%mm3," #ip0 " \n\t" \
@@ -139,16 +139,16 @@
" movq %%mm1, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
" paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
\
" movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
+ " movq %%mm7, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
" paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
@@ -237,10 +237,10 @@
" paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
\
" paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
+ " movq %%mm3," #ip5 " \n\t"
#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
" movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
" movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
" movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
@@ -254,9 +254,9 @@
" movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
" punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
" movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
" movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
" movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
" punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
" punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
@@ -302,23 +302,23 @@
* we will transpose the block of data to two 4x8 blocks???
*/
Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
" emms \n\t"
-
+
: "+r" (InputData),
"+r" (OutputData)
: "r" (temp),
@@ -336,7 +336,6 @@
/* install our implementation in the function table */
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_64 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
Modified: trunk/theora/lib/enc/x86_64/recon_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/recon_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/recon_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -37,14 +37,14 @@
" movq (%1, %2, 2), %%mm2 \n\t"
" movq (%1, %%rdi), %%mm3 \n\t"
- " lea (%1, %2, 4), %1 \n\t"
+ " lea (%1, %2, 4), %1 \n\t"
" movq %%mm0, (%0) \n\t"
" movq %%mm1, (%0, %2) \n\t"
" movq %%mm2, (%0, %2, 2) \n\t"
" movq %%mm3, (%0, %%rdi) \n\t"
- " lea (%0, %2, 4), %0 \n\t"
+ " lea (%0, %2, 4), %0 \n\t"
" movq (%1), %%mm0 \n\t"
" movq (%1, %2), %%mm1 \n\t"
@@ -71,11 +71,11 @@
" movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
" lea 128(%1), %%rdi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
+ "1: \n\t"
" movq (%1), %%mm2 \n\t" /* First four input values */
" packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t"
+ " por %%mm0, %%mm0 \n\t"
" pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
" lea 16(%1), %1 \n\t" /* Step source buffer */
" cmp %%rdi, %1 \n\t" /* are we done */
@@ -175,7 +175,6 @@
void dsp_mmx_recon_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_64 mmx recon functions.\n");
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
funcs->recon_inter8x8 = recon_inter8x8__mmx;
Modified: trunk/theora/lib/internal.h
===================================================================
--- trunk/theora/lib/internal.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/internal.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -27,16 +27,6 @@
# include "dec/huffman.h"
# include "dec/quant.h"
-/* debug macros */
-#ifdef _TH_DEBUG_
-#include <stdio.h>
-extern long dframe;
-extern FILE *debugout;
-#define TH_DEBUG(...) fprintf(debugout, __VA_ARGS__)
-#else
-#define TH_DEBUG(...)
-#endif
-
/*Thank you Microsoft, I know the order of operations.*/
# if defined(_MSC_VER)
# pragma warning(disable:4554) /* order of operations */
@@ -238,14 +228,6 @@
oc_border_info *border;
/*The motion vector used for this fragment.*/
oc_mv mv;
-
-#ifdef _TH_DEBUG_
- int quant[64];
- int freq[64];
- int time[64];
- int recon[64];
- int loop[64];
-#endif
}oc_fragment;
@@ -296,77 +278,77 @@
/*Common state information between the encoder and decoder.*/
struct oc_theora_state{
/*The stream information.*/
- th_info info;
+ th_info info;
/*Table for shared accelerated functions.*/
- oc_base_opt_vtable opt_vtable;
+ oc_base_opt_vtable opt_vtable;
/*CPU flags to detect the presence of extended instruction sets.*/
- ogg_uint32_t cpu_flags;
+ ogg_uint32_t cpu_flags;
/*The fragment plane descriptions.*/
- oc_fragment_plane fplanes[3];
+ oc_fragment_plane fplanes[3];
/*The total number of fragments in a single frame.*/
- int nfrags;
+ int nfrags;
/*The list of fragments, indexed in image order.*/
- oc_fragment *frags;
+ oc_fragment *frags;
/*The total number of super blocks in a single frame.*/
- int nsbs;
+ int nsbs;
/*The list of super blocks, indexed in image order.*/
- oc_sb *sbs;
+ oc_sb *sbs;
/*The number of macro blocks in the X direction.*/
- int nhmbs;
+ int nhmbs;
/*The number of macro blocks in the Y direction.*/
- int nvmbs;
+ int nvmbs;
/*The total number of macro blocks.*/
- int nmbs;
+ int nmbs;
/*The list of macro blocks, indexed in super block order.
That is, the macro block corresponding to the macro block mbi in (luma
plane) super block sbi is (sbi<<2|mbi).*/
- oc_mb *mbs;
+ oc_mb *mbs;
/*The list of coded fragments, in coded order.*/
- int *coded_fragis;
+ int *coded_fragis;
/*The number of coded fragments in each plane.*/
- int ncoded_fragis[3];
+ int ncoded_fragis[3];
/*The list of uncoded fragments.
This just past the end of the list, which is in reverse order, and
uses the same block of allocated storage as the coded_fragis list.*/
- int *uncoded_fragis;
+ int *uncoded_fragis;
/*The number of uncoded fragments in each plane.*/
- int nuncoded_fragis[3];
+ int nuncoded_fragis[3];
/*The list of coded macro blocks in the Y plane, in coded order.*/
- int *coded_mbis;
+ int *coded_mbis;
/*The number of coded macro blocks in the Y plane.*/
- int ncoded_mbis;
+ int ncoded_mbis;
/*A copy of the image data used to fill the input pointers in each fragment.
If the data pointers or strides change, these input pointers must be
re-populated.*/
- th_ycbcr_buffer input;
+ th_ycbcr_buffer input;
/*The number of unique border patterns.*/
- int nborders;
+ int nborders;
/*The storage for the border info for all border fragments.
This data is pointed to from the appropriate fragments.*/
- oc_border_info borders[16];
+ oc_border_info borders[16];
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
- int ref_frame_idx[3];
+ int ref_frame_idx[3];
/*The actual buffers used for the previously decoded frames.*/
- th_ycbcr_buffer ref_frame_bufs[3];
+ th_ycbcr_buffer ref_frame_bufs[3];
/*The storage for the reference frame buffers.*/
- unsigned char *ref_frame_data;
+ unsigned char *ref_frame_data;
/*The frame number of the last keyframe.*/
- ogg_int64_t keyframe_num;
+ ogg_int64_t keyframe_num;
/*The frame number of the current frame.*/
- ogg_int64_t curframe_num;
+ ogg_int64_t curframe_num;
/*The granpos of the current frame.*/
- ogg_int64_t granpos;
+ ogg_int64_t granpos;
/*The type of the current frame.*/
- int frame_type;
+ int frame_type;
/*The quality indices of the current frame.*/
- int qis[3];
+ int qis[3];
/*The number of quality indices used in the current frame.*/
- int nqis;
+ int nqis;
/*The dequantization tables.*/
- oc_quant_table *dequant_tables[2][3];
- oc_quant_tables dequant_table_data[2][3];
+ oc_quant_table *dequant_tables[2][3];
+ oc_quant_tables dequant_table_data[2][3];
/*Loop filter strength parameters.*/
- unsigned char loop_filter_limits[64];
+ unsigned char loop_filter_limits[64];
};