[xiph-commits] r15153 - in trunk/theora: examples include/theora lib lib/dec lib/dec/x86 lib/dec/x86_vc lib/enc lib/enc/x86_32 lib/enc/x86_32_vs lib/enc/x86_64
tterribe at svn.xiph.org
Mon Aug 4 11:37:56 PDT 2008
Author: tterribe
Date: 2008-08-04 11:37:55 -0700 (Mon, 04 Aug 2008)
New Revision: 15153
Modified:
trunk/theora/examples/player_example.c
trunk/theora/include/theora/codec.h
trunk/theora/lib/cpu.c
trunk/theora/lib/dec/bitwise.c
trunk/theora/lib/dec/bitwise.h
trunk/theora/lib/dec/decapiwrapper.c
trunk/theora/lib/dec/decint.h
trunk/theora/lib/dec/decode.c
trunk/theora/lib/dec/dequant.c
trunk/theora/lib/dec/idct.c
trunk/theora/lib/dec/quant.c
trunk/theora/lib/dec/quant.h
trunk/theora/lib/dec/state.c
trunk/theora/lib/dec/x86/mmxfrag.c
trunk/theora/lib/dec/x86/mmxstate.c
trunk/theora/lib/dec/x86_vc/mmxidct.c
trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
trunk/theora/lib/dec/x86_vc/mmxstate.c
trunk/theora/lib/dec/x86_vc/x86state.c
trunk/theora/lib/enc/codec_internal.h
trunk/theora/lib/enc/dct_decode.c
trunk/theora/lib/enc/dct_encode.c
trunk/theora/lib/enc/dsp.c
trunk/theora/lib/enc/dsp.h
trunk/theora/lib/enc/encode.c
trunk/theora/lib/enc/encoder_quant.c
trunk/theora/lib/enc/encoder_toplevel.c
trunk/theora/lib/enc/frarray.c
trunk/theora/lib/enc/frinit.c
trunk/theora/lib/enc/mcomp.c
trunk/theora/lib/enc/x86_32/dct_decode_mmx.c
trunk/theora/lib/enc/x86_32/dsp_mmx.c
trunk/theora/lib/enc/x86_32/dsp_mmxext.c
trunk/theora/lib/enc/x86_32/fdct_mmx.c
trunk/theora/lib/enc/x86_32/idct_mmx.c
trunk/theora/lib/enc/x86_32/recon_mmx.c
trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c
trunk/theora/lib/enc/x86_32_vs/recon_mmx.c
trunk/theora/lib/enc/x86_64/dct_decode_mmx.c
trunk/theora/lib/enc/x86_64/dsp_mmx.c
trunk/theora/lib/enc/x86_64/dsp_mmxext.c
trunk/theora/lib/enc/x86_64/fdct_mmx.c
trunk/theora/lib/enc/x86_64/recon_mmx.c
trunk/theora/lib/internal.h
Log:
Remove all TH_DEBUG statements.
They required variadic macros, which are not standard in C90.
They also cluttered up the code, and were unlikely to be maintained properly
anyway.
Also, remove all the tabs and trailing whitespace, etc., that xiphmont gunked
up my code with.
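
For context (not part of the patch): variadic macros such as the removed TH_DEBUG were only added to the language in C99, so a strictly C90 build has to route this kind of logging through a real variadic function instead. A minimal sketch of the trade-off the log message describes, where the exact TH_DEBUG definition is assumed rather than shown in this diff, and debugout is the file handle from the removed decapiwrapper.c code:

#include <stdarg.h>
#include <stdio.h>

/*Assumed shape of the removed macro; this line itself requires C99.*/
#define TH_DEBUG(...) fprintf(debugout,__VA_ARGS__)

/*The closest C90-clean substitute: a real variadic function, which is
  exactly the extra plumbing the debug statements were not worth.*/
static FILE *debugout;
static void th_debug(const char *_fmt,...){
  va_list ap;
  va_start(ap,_fmt);
  vfprintf(debugout,_fmt,ap);
  va_end(ap);
}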
Modified: trunk/theora/examples/player_example.c
===================================================================
--- trunk/theora/examples/player_example.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/examples/player_example.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -567,6 +567,10 @@
/* and now we have it all. initialize decoders */
if(theora_p){
+ ti.offset_x=0;
+ ti.offset_y=0;
+ ti.frame_width=ti.width;
+ ti.frame_height=ti.height;
theora_decode_init(&td,&ti);
printf("Ogg logical stream %lx is Theora %dx%d %.02f fps",
to.serialno,ti.width,ti.height,
Modified: trunk/theora/include/theora/codec.h
===================================================================
--- trunk/theora/include/theora/codec.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/include/theora/codec.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -112,13 +112,18 @@
* specification</a>, Section 4.4, for details on the precise sample
* locations.*/
typedef enum{
- /**Chroma decimation by 2 in both the X and Y directions (4:2:0).*/
+ /**Chroma decimation by 2 in both the X and Y directions (4:2:0).
+ The Cb and Cr chroma planes are half the width and half the height of the
+ luma plane.*/
TH_PF_420,
/**Currently reserved.*/
TH_PF_RSVD,
- /**Chroma decimation by 2 in the X direction (4:2:2).*/
+ /**Chroma decimation by 2 in the X direction (4:2:2).
+ The Cb and Cr chroma planes are half the width of the luma plane, but full
+ height.*/
TH_PF_422,
- /**No chroma decimation (4:4:4).*/
+ /**No chroma decimation (4:4:4).
+ The Cb and Cr chroma planes are full width and full height.*/
TH_PF_444,
/**The total number of currently defined pixel formats.*/
TH_PF_NFORMATS
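
The expanded comments pin down the chroma plane geometry exactly. A minimal sketch (not part of the patch; chroma_plane_size is an illustrative helper name) of the plane sizes they imply, using the TH_PF_* values documented above; Theora frame dimensions are multiples of 16, so the shifts are exact:

/*Chroma plane dimensions implied by the documentation above.*/
static void chroma_plane_size(int _pixel_fmt,
 int _frame_width,int _frame_height,int *_cw,int *_ch){
  *_cw=_frame_width>>(_pixel_fmt!=TH_PF_444);  /*Halved for 4:2:0 and 4:2:2.*/
  *_ch=_frame_height>>(_pixel_fmt==TH_PF_420); /*Halved for 4:2:0 only.*/
}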
Modified: trunk/theora/lib/cpu.c
===================================================================
--- trunk/theora/lib/cpu.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/cpu.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -220,24 +220,6 @@
/*Implement me.*/
flags=0;
}
-# if defined(DEBUG)
- if(flags){
- TH_DEBUG("vectorized instruction sets supported:");
- if(flags&OC_CPU_X86_MMX)TH_DEBUG(" mmx");
- if(flags&OC_CPU_X86_MMXEXT)TH_DEBUG(" mmxext");
- if(flags&OC_CPU_X86_SSE)TH_DEBUG(" sse");
- if(flags&OC_CPU_X86_SSE2)TH_DEBUG(" sse2");
- if(flags&OC_CPU_X86_3DNOW)TH_DEBUG(" 3dnow");
- if(flags&OC_CPU_X86_3DNOWEXT)TH_DEBUG(" 3dnowext");
- if(flags&OC_CPU_X86_PNI)TH_DEBUG(" pni");
- if(flags&OC_CPU_X86_SSSE3)TH_DEBUG(" ssse3");
- if(flags&OC_CPU_X86_SSE4_1)TH_DEBUG(" sse4_1");
- if(flags&OC_CPU_X86_SSE4_2)TH_DEBUG(" sse4_2");
- if(flags&OC_CPU_X86_SSE4A)TH_DEBUG(" sse4a");
- if(flags&OC_CPU_X86_SSE5)TH_DEBUG(" sse5");
- TH_DEBUG("\n");
- }
-# endif
return flags;
}
#endif
Modified: trunk/theora/lib/dec/bitwise.c
===================================================================
--- trunk/theora/lib/dec/bitwise.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/bitwise.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -15,112 +15,107 @@
********************************************************************/
-/* We're 'MSb' endian; if we write a word but read individual bits,
- then we'll read the msb first */
+/*We're 'MSb' endian; if we write a word but read individual bits,
+ then we'll read the MSb first.*/
#include <string.h>
#include <stdlib.h>
#include "bitwise.h"
-void theorapackB_reset(oggpack_buffer *b){
- b->ptr=b->buffer;
- b->buffer[0]=0;
- b->endbit=b->endbyte=0;
+void theorapackB_readinit(oggpack_buffer *_b,unsigned char *_buf,int _bytes){
+ memset(_b,0,sizeof(*_b));
+ _b->buffer=_b->ptr=_buf;
+ _b->storage=_bytes;
}
-void theorapackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes){
- memset(b,0,sizeof(*b));
- b->buffer=b->ptr=buf;
- b->storage=bytes;
-}
-
-int theorapackB_look1(oggpack_buffer *b,long *_ret){
- if(b->endbyte>=b->storage){
+int theorapackB_look1(oggpack_buffer *_b,long *_ret){
+ if(_b->endbyte>=_b->storage){
*_ret=0L;
return -1;
}
- *_ret=((b->ptr[0]>>(7-b->endbit))&1);
+ *_ret=(_b->ptr[0]>>7-_b->endbit)&1;
return 0;
}
-void theorapackB_adv1(oggpack_buffer *b){
- if(++(b->endbit)>7){
- b->endbit=0;
- b->ptr++;
- b->endbyte++;
+void theorapackB_adv1(oggpack_buffer *_b){
+ if(++(_b->endbit)>7){
+ _b->endbit=0;
+ _b->ptr++;
+ _b->endbyte++;
}
}
-/* bits <= 32 */
-int theorapackB_read(oggpack_buffer *b,int bits,long *_ret){
+/*Here we assume that 0<=_bits&&_bits<=32.*/
+int theorapackB_read(oggpack_buffer *_b,int _bits,long *_ret){
long ret;
long m;
+ long d;
int fail;
- m=32-bits;
- bits+=b->endbit;
- if(b->endbyte+4>=b->storage){
- /* not the main path */
- if(b->endbyte*8+bits>b->storage*8){
+ m=32-_bits;
+ _bits+=_b->endbit;
+ d=_b->storage-_b->endbyte;
+ if(d<=4){
+ /*Not the main path.*/
+ if(d*8<_bits){
*_ret=0L;
fail=-1;
goto overflow;
}
- /* special case to avoid reading b->ptr[0], which might be past the end of
- the buffer; also skips some useless accounting */
- else if(!bits){
+ /*Special case to avoid reading _b->ptr[0], which might be past the end of
+ the buffer; also skips some useless accounting.*/
+ else if(!_bits){
*_ret=0L;
return 0;
}
}
- ret=b->ptr[0]<<(24+b->endbit);
- if(bits>8){
- ret|=b->ptr[1]<<(16+b->endbit);
- if(bits>16){
- ret|=b->ptr[2]<<(8+b->endbit);
- if(bits>24){
- ret|=b->ptr[3]<<(b->endbit);
- if(bits>32 && b->endbit)
- ret|=b->ptr[4]>>(8-b->endbit);
+ ret=_b->ptr[0]<<24+_b->endbit;
+ if(_bits>8){
+ ret|=_b->ptr[1]<<16+_b->endbit;
+ if(_bits>16){
+ ret|=_b->ptr[2]<<8+_b->endbit;
+ if(_bits>24){
+ ret|=_b->ptr[3]<<_b->endbit;
+ if(_bits>32)ret|=_b->ptr[4]>>8-_b->endbit;
}
}
}
- *_ret=((ret&0xffffffffUL)>>(m>>1))>>((m+1)>>1);
+ *_ret=((ret&0xFFFFFFFFUL)>>(m>>1))>>(m+1>>1);
fail=0;
overflow:
- b->ptr+=bits/8;
- b->endbyte+=bits/8;
- b->endbit=bits&7;
+ _b->ptr+=_bits>>3;
+ _b->endbyte+=_bits>>3;
+ _b->endbit=_bits&7;
return fail;
}
-int theorapackB_read1(oggpack_buffer *b,long *_ret){
+int theorapackB_read1(oggpack_buffer *_b,long *_ret){
int fail;
- if(b->endbyte>=b->storage){
- /* not the main path */
+ if(_b->endbyte>=_b->storage){
+ /*Not the main path.*/
*_ret=0L;
fail=-1;
- goto overflow;
}
- *_ret=(b->ptr[0]>>(7-b->endbit))&1;
- fail=0;
-overflow:
- b->endbit++;
- if(b->endbit>7){
- b->endbit=0;
- b->ptr++;
- b->endbyte++;
+ else{
+ *_ret=(_b->ptr[0]>>7-_b->endbit)&1;
+ fail=0;
}
+ _b->endbit++;
+ if(_b->endbit>7){
+ _b->endbit=0;
+ _b->ptr++;
+ _b->endbyte++;
+ }
return fail;
}
-long theorapackB_bytes(oggpack_buffer *b){
- return(b->endbyte+(b->endbit+7)/8);
+long theorapackB_bytes(oggpack_buffer *_b){
+ return _b->endbyte+(_b->endbit+7>>3);
}
-long theorapackB_bits(oggpack_buffer *b){
- return(b->endbyte*8+b->endbit);
+long theorapackB_bits(oggpack_buffer *_b){
+ return _b->endbyte*8+_b->endbit;
}
-unsigned char *theorapackB_get_buffer(oggpack_buffer *b){
- return(b->buffer);
+unsigned char *theorapackB_get_buffer(oggpack_buffer *_b){
+ return _b->buffer;
}
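
The rewritten read path packs up to five bytes into a 32-bit accumulator MSb first and then peels the top _bits off with a pair of shifts; the shift is split into (m>>1) and (m+1>>1) so a shift count of 32 never occurs when _bits is 0. A minimal sketch (not part of the patch; msb_peek is an illustrative name) of that fast path, assuming the five byte reads stay inside the buffer:

/*MSb-first extraction of _bits bits starting _endbit bits into _ptr[0],
  without the end-of-buffer bookkeeping theorapackB_read() performs.*/
static long msb_peek(const unsigned char *_ptr,int _endbit,int _bits){
  unsigned long ret;
  int           m;
  m=32-_bits;
  _bits+=_endbit;
  ret=(unsigned long)_ptr[0]<<24+_endbit;
  if(_bits>8)ret|=(unsigned long)_ptr[1]<<16+_endbit;
  if(_bits>16)ret|=(unsigned long)_ptr[2]<<8+_endbit;
  if(_bits>24)ret|=(unsigned long)_ptr[3]<<_endbit;
  if(_bits>32)ret|=(unsigned long)_ptr[4]>>8-_endbit;
  return (long)(((ret&0xFFFFFFFFUL)>>(m>>1))>>(m+1>>1));
}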
Modified: trunk/theora/lib/dec/bitwise.h
===================================================================
--- trunk/theora/lib/dec/bitwise.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/bitwise.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -18,59 +18,61 @@
# define _bitwise_H (1)
# include <ogg/ogg.h>
-void theorapackB_reset(oggpack_buffer *b);
-void theorapackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes);
-/* Read in bits without advancing the bitptr; bits <= 32 */
-static int theorapackB_look(oggpack_buffer *b,int bits,long *_ret);
-int theorapackB_look1(oggpack_buffer *b,long *_ret);
-static void theorapackB_adv(oggpack_buffer *b,int bits);
-void theorapackB_adv1(oggpack_buffer *b);
-/* bits <= 32 */
-int theorapackB_read(oggpack_buffer *b,int bits,long *_ret);
-int theorapackB_read1(oggpack_buffer *b,long *_ret);
-long theorapackB_bytes(oggpack_buffer *b);
-long theorapackB_bits(oggpack_buffer *b);
-unsigned char *theorapackB_get_buffer(oggpack_buffer *b);
+void theorapackB_readinit(oggpack_buffer *_b,unsigned char *_buf,int _bytes);
+/*Read in bits without advancing the bitptr.
+ Here we assume 0<=_bits&&_bits<=32.*/
+static int theorapackB_look(oggpack_buffer *_b,int _bits,long *_ret);
+int theorapackB_look1(oggpack_buffer *_b,long *_ret);
+static void theorapackB_adv(oggpack_buffer *_b,int _bits);
+void theorapackB_adv1(oggpack_buffer *_b);
+/*Here we assume 0<=_bits&&_bits<=32.*/
+int theorapackB_read(oggpack_buffer *_b,int _bits,long *_ret);
+int theorapackB_read1(oggpack_buffer *_b,long *_ret);
+long theorapackB_bytes(oggpack_buffer *_b);
+long theorapackB_bits(oggpack_buffer *_b);
+unsigned char *theorapackB_get_buffer(oggpack_buffer *_b);
/*These two functions are only used in one place, and declaring them static so
they can be inlined saves considerable function call overhead.*/
-/* Read in bits without advancing the bitptr; bits <= 32 */
-static int theorapackB_look(oggpack_buffer *b,int bits,long *_ret){
+/*Read in bits without advancing the bitptr.
+ Here we assume 0<=_bits&&_bits<=32.*/
+static int theorapackB_look(oggpack_buffer *_b,int _bits,long *_ret){
long ret;
long m;
- m=32-bits;
- bits+=b->endbit;
- if(b->endbyte+4>=b->storage){
- /* not the main path */
- if(b->endbyte>=b->storage){
+ long d;
+ m=32-_bits;
+ _bits+=_b->endbit;
+ d=_b->storage-_b->endbyte;
+ if(d<=4){
+ /*Not the main path.*/
+ if(d<=0){
*_ret=0L;
- return -1;
+ return -(_bits>d*8);
}
/*If we have some bits left, but not enough, return the ones we have.*/
- if((b->storage-b->endbyte)*8<bits)bits=(b->storage-b->endbyte)*8;
+ if(d*8<_bits)_bits=d*8;
}
- ret=b->ptr[0]<<(24+b->endbit);
- if(bits>8){
- ret|=b->ptr[1]<<(16+b->endbit);
- if(bits>16){
- ret|=b->ptr[2]<<(8+b->endbit);
- if(bits>24){
- ret|=b->ptr[3]<<(b->endbit);
- if(bits>32&&b->endbit)
- ret|=b->ptr[4]>>(8-b->endbit);
+ ret=_b->ptr[0]<<24+_b->endbit;
+ if(_bits>8){
+ ret|=_b->ptr[1]<<16+_b->endbit;
+ if(_bits>16){
+ ret|=_b->ptr[2]<<8+_b->endbit;
+ if(_bits>24){
+ ret|=_b->ptr[3]<<_b->endbit;
+ if(_bits>32)ret|=_b->ptr[4]>>8-_b->endbit;
}
}
}
- *_ret=((ret&0xffffffff)>>(m>>1))>>((m+1)>>1);
+ *_ret=((ret&0xFFFFFFFF)>>(m>>1))>>(m+1>>1);
return 0;
}
-static void theorapackB_adv(oggpack_buffer *b,int bits){
- bits+=b->endbit;
- b->ptr+=bits/8;
- b->endbyte+=bits/8;
- b->endbit=bits&7;
+static void theorapackB_adv(oggpack_buffer *_b,int _bits){
+ _bits+=_b->endbit;
+ _b->ptr+=_bits>>3;
+ _b->endbyte+=_bits>>3;
+ _b->endbit=_bits&7;
}
#endif
Modified: trunk/theora/lib/dec/decapiwrapper.c
===================================================================
--- trunk/theora/lib/dec/decapiwrapper.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/decapiwrapper.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -29,10 +29,6 @@
static void theora_decode_clear(theora_state *_td){
if(_td->i!=NULL)theora_info_clear(_td->i);
-#ifdef _TH_DEBUG_
- fclose(debugout);
- debugout=NULL;
-#endif
memset(_td,0,sizeof(*_td));
}
@@ -92,7 +88,6 @@
th_api_info *apiinfo;
th_api_wrapper *api;
th_info info;
-
api=(th_api_wrapper *)_ci->codec_setup;
/*Allocate our own combined API wrapper/theora_info struct.
We put them both in one malloc'd block so that when the API wrapper is
@@ -130,11 +125,6 @@
th_api_wrapper *api;
th_info info;
int ret;
-
-#ifdef _TH_DEBUG_
- debugout = fopen("theoradec-debugout.txt","w");
-#endif
-
api=(th_api_wrapper *)_ci->codec_setup;
/*Allocate an API wrapper struct on demand, since it will not also include a
theora_info struct like the ones that are used in a theora_state struct.*/
@@ -167,16 +157,9 @@
th_api_wrapper *api;
ogg_int64_t gp;
int ret;
-
- if(!_td || !_td->i || !_td->i->codec_setup)return OC_FAULT;
+ if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
api=(th_api_wrapper *)_td->i->codec_setup;
- if(!api || !api->decode)return OC_FAULT;
ret=th_decode_packetin(api->decode,_op,&gp);
-
-#ifdef _TH_DEBUG_
- dframe++;
-#endif
-
if(ret<0)return OC_BADPACKET;
_td->granulepos=gp;
return 0;
@@ -186,10 +169,9 @@
th_api_wrapper *api;
th_ycbcr_buffer buf;
int ret;
-
- if(!_td || !_td->i || !_td->i->codec_setup)return OC_FAULT;
+ if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
api=(th_api_wrapper *)_td->i->codec_setup;
- if(!api || !api->decode)return OC_FAULT;
+ if(!api->decode)return OC_FAULT;
ret=th_decode_ycbcr_out(api->decode,buf);
if(ret>=0){
_yuv->y_width=buf[0].width;
@@ -202,6 +184,5 @@
_yuv->u=buf[1].data;
_yuv->v=buf[2].data;
}
-
return ret;
}
Modified: trunk/theora/lib/dec/decint.h
===================================================================
--- trunk/theora/lib/dec/decint.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/decint.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -47,45 +47,45 @@
struct th_dec_ctx{
/*Shared encoder/decoder state.*/
- oc_theora_state state;
+ oc_theora_state state;
/*Whether or not packets are ready to be emitted.
This takes on negative values while there are remaining header packets to
be emitted, reaches 0 when the codec is ready for input, and goes to 1
when a frame has been processed and a data packet is ready.*/
- int packet_state;
+ int packet_state;
/*Buffer in which to assemble packets.*/
- oggpack_buffer opb;
+ oggpack_buffer opb;
/*Huffman decode trees.*/
- oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
+ oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
/*The index of one past the last token in each plane for each coefficient.
The final entries are the total number of tokens for each coefficient.*/
- int ti0[3][64];
+ int ti0[3][64];
/*The index of one past the last extra bits entry in each plane for each
coefficient.
The final entries are the total number of extra bits entries for each
coefficient.*/
- int ebi0[3][64];
+ int ebi0[3][64];
/*The number of outstanding EOB runs at the start of each coefficient in each
plane.*/
- int eob_runs[3][64];
+ int eob_runs[3][64];
/*The DCT token lists.*/
- unsigned char **dct_tokens;
+ unsigned char **dct_tokens;
/*The extra bits associated with DCT tokens.*/
- ogg_uint16_t **extra_bits;
+ ogg_uint16_t **extra_bits;
/*The out-of-loop post-processing level.*/
- int pp_level;
+ int pp_level;
/*The DC scale used for out-of-loop deblocking.*/
- int pp_dc_scale[64];
+ int pp_dc_scale[64];
/*The sharpen modifier used for out-of-loop deringing.*/
- int pp_sharp_mod[64];
+ int pp_sharp_mod[64];
/*The DC quantization index of each block.*/
- unsigned char *dc_qis;
+ unsigned char *dc_qis;
/*The variance of each block.*/
- int *variances;
+ int *variances;
/*The storage for the post-processed frame buffer.*/
- unsigned char *pp_frame_data;
+ unsigned char *pp_frame_data;
/*Whether or not the post-processsed frame buffer has space for chroma.*/
- int pp_frame_has_chroma;
+ int pp_frame_has_chroma;
/*The buffer used for the post-processed frame.*/
th_ycbcr_buffer pp_frame_buf;
/*The striped decode callback function.*/
Modified: trunk/theora/lib/dec/decode.c
===================================================================
--- trunk/theora/lib/dec/decode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/decode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -12,7 +12,7 @@
function:
last mod: $Id$
-
+
********************************************************************/
#include <stdlib.h>
@@ -170,7 +170,7 @@
_dec->state.dequant_table_data[qti][pli];
}
oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
- &_setup->qinfo);
+ &_setup->qinfo);
for(qi=0;qi<64;qi++){
int qsum;
qsum=0;
@@ -210,38 +210,28 @@
static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
long val;
-
- TH_DEBUG("\n>>>> beginning frame %ld\n\n",dframe);
-
/*Check to make sure this is a data packet.*/
theorapackB_read1(&_dec->opb,&val);
- TH_DEBUG("frame type = %s, ",val==0?"video":"unknown");
if(val!=0)return TH_EBADPACKET;
/*Read in the frame type (I or P).*/
theorapackB_read1(&_dec->opb,&val);
_dec->state.frame_type=(int)val;
- TH_DEBUG("%s\n",val?"predicted":"key");
/*Read in the current qi.*/
theorapackB_read(&_dec->opb,6,&val);
_dec->state.qis[0]=(int)val;
- TH_DEBUG("frame quality = { %ld ",val);
theorapackB_read1(&_dec->opb,&val);
if(!val)_dec->state.nqis=1;
else{
theorapackB_read(&_dec->opb,6,&val);
_dec->state.qis[1]=(int)val;
- TH_DEBUG("%ld ",val);
theorapackB_read1(&_dec->opb,&val);
if(!val)_dec->state.nqis=2;
else{
theorapackB_read(&_dec->opb,6,&val);
- TH_DEBUG("%ld ",val);
_dec->state.qis[2]=(int)val;
_dec->state.nqis=3;
}
}
- TH_DEBUG("}\n");
-
if(_dec->state.frame_type==OC_INTRA_FRAME){
/*Keyframes have 3 unused configuration bits, holdovers from VP3 days.
Most of the other unused bits in the VP3 headers were eliminated.
@@ -305,7 +295,6 @@
int run_count;
theorapackB_read1(&_dec->opb,&val);
flag=(int)val;
-
sb=_dec->state.sbs;
sb_end=sb+_dec->state.nsbs;
run_count=npartial=0;
@@ -319,7 +308,6 @@
npartial+=flag;
sb++;
}
-
while(--run_count>0&&sb<sb_end);
if(full_run&&sb<sb_end){
theorapackB_read1(&_dec->opb,&val);
@@ -349,7 +337,6 @@
for(;sb->coded_partially;sb++);
theorapackB_read1(&_dec->opb,&val);
flag=(int)val;
-
while(sb<sb_end){
int full_run;
run_count=oc_sb_run_unpack(&_dec->opb);
@@ -428,71 +415,6 @@
}
/*TODO: run_count should be 0 here.
If it's not, we should issue a warning of some kind.*/
-
-
-#ifdef _TH_DEBUG_
- // assuming 4:2:0 right now; THIS IS WRONG but only an issue if dumping debug info
- TH_DEBUG("predicted (partially coded frame)\n");
- TH_DEBUG("superblock coded flags = {");
- int x,y,i;
- int w = _dec->state.info.frame_width;
- int h = _dec->state.info.frame_height;
-
- i=0;
- for(y=0;y< (h+31)/32;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+31)/32;x++,i++)
- TH_DEBUG("%x", (_dec->state.sbs[i].coded_partially!=0)|
- (_dec->state.sbs[i].coded_fully));
- }
-
- TH_DEBUG("\n ");
- for(y=0;y< (h+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+63)/64;x++,i++)
- TH_DEBUG("%x", (_dec->state.sbs[i].coded_partially!=0)|
- (_dec->state.sbs[i].coded_fully));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (h+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+63)/64;x++,i++)
- TH_DEBUG("%x", (_dec->state.sbs[i].coded_partially!=0)|
- (_dec->state.sbs[i].coded_fully));
- }
- TH_DEBUG("\n}\n");
-
- if(i!=_dec->state.nsbs)
- TH_DEBUG("WARNING! superblock count, raster %d != flat %d\n",
- i,_dec->state.nsbs);
-
- TH_DEBUG("block coded flags = {");
-
- i=0;
- for(y=0;y< (h+7)/8;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+7)/8;x++,i++)
- TH_DEBUG("%x", (_dec->state.frags[i].coded!=0));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (h+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+15)/16;x++,i++)
- TH_DEBUG("%x", (_dec->state.frags[i].coded!=0));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (h+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (w+15)/16;x++,i++)
- TH_DEBUG("%x", (_dec->state.frags[i].coded!=0));
- }
- TH_DEBUG("\n}\n");
-
- if(i!=_dec->state.nfrags)
- TH_DEBUG("WARNING! block count, raster %d != flat %d\n",
- i,_dec->state.nfrags);
-#endif
-
}
@@ -526,57 +448,37 @@
int mode_scheme;
theorapackB_read(&_dec->opb,3,&val);
mode_scheme=(int)val;
- TH_DEBUG("mode encode scheme = %d\n",(int)val);
-
if(mode_scheme==0){
int mi;
/*Just in case, initialize the modes to something.
If the bitstream doesn't contain each index exactly once, it's likely
corrupt and the rest of the packet is garbage anyway, but this way we
won't crash, and we'll decode SOMETHING.*/
- TH_DEBUG("mode scheme list = { ");
/*LOOP VECTORIZES.*/
for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
for(mi=0;mi<OC_NMODES;mi++){
theorapackB_read(&_dec->opb,3,&val);
scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
- TH_DEBUG("%d ",(int)val);
}
- TH_DEBUG("}\n");
alphabet=scheme0_alphabet;
- }else
- alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
- if(mode_scheme==7)
- mode_unpack=oc_clc_mode_unpack;
- else
- mode_unpack=oc_vlc_mode_unpack;
+ }
+ else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
+ if(mode_scheme==7)mode_unpack=oc_clc_mode_unpack;
+ else mode_unpack=oc_vlc_mode_unpack;
mb=_dec->state.mbs;
mb_end=mb+_dec->state.nmbs;
-
- TH_DEBUG("mode list = { ");
- for(j=0;mb<mb_end;mb++){
+ for(;mb<mb_end;mb++){
if(mb->mode!=OC_MODE_INVALID){
int bi;
for(bi=0;bi<4;bi++){
- int fragi;
- fragi=mb->map[0][bi];
- if(fragi>=0&&_dec->state.frags[fragi].coded)break;
+ int fragi;
+ fragi=mb->map[0][bi];
+ if(fragi>=0&&_dec->state.frags[fragi].coded)break;
}
- if(bi<4){
- mb->mode=alphabet[(*mode_unpack)(&_dec->opb)];
-
-#ifdef _TH_DEBUG_
- if((j&0x1f)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%d ",mb->mode);
- j++;
-#endif
-
- }else
- mb->mode=OC_MODE_INTER_NOMV;
+ if(bi<4)mb->mode=alphabet[(*mode_unpack)(&_dec->opb)];
+ else mb->mode=OC_MODE_INTER_NOMV;
}
}
- TH_DEBUG("\n}\n");
}
@@ -629,23 +531,16 @@
const int *map_idxs;
long val;
int map_nidxs;
-#ifdef _TH_DEBUG_
- int j=0;
-#endif
oc_mv last_mv[2];
oc_mv cbmvs[4];
set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
theorapackB_read1(&_dec->opb,&val);
- TH_DEBUG("motion vector table = %d\n",(int)val);
mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
memset(last_mv,0,sizeof(last_mv));
mb=_dec->state.mbs;
mb_end=mb+_dec->state.nmbs;
-
- TH_DEBUG("motion vectors = {");
-
for(;mb<mb_end;mb++)if(mb->mode!=OC_MODE_INVALID){
oc_fragment *frag;
oc_mv mbmv;
@@ -667,98 +562,62 @@
if(ncoded<=0)continue;
mb_mode=mb->mode;
switch(mb_mode){
- case OC_MODE_INTER_MV_FOUR:
- {
- oc_mv lbmvs[4];
- int bi;
- /*Mark the tail of the list, so we don't accidentally go past it.*/
- coded[ncoded]=-1;
- for(bi=codedi=0;bi<4;bi++){
- if(coded[codedi]==bi){
- codedi++;
- frag=_dec->state.frags+mb->map[0][bi];
- frag->mbmode=mb_mode;
- frag->mv[0]=lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- frag->mv[1]=lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-
-#ifdef _TH_DEBUG_
- if((j&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",frag->mv[0],frag->mv[1]);
- j++;
-#endif
-
- }
- else lbmvs[bi][0]=lbmvs[bi][1]=0;
- }
- if(codedi>0){
- last_mv[1][0]=last_mv[0][0];
- last_mv[1][1]=last_mv[0][1];
- last_mv[0][0]=lbmvs[coded[codedi-1]][0];
- last_mv[0][1]=lbmvs[coded[codedi-1]][1];
- }
- if(codedi<ncoded){
- (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
- for(;codedi<ncoded;codedi++){
- mapi=coded[codedi];
- bi=mapi&3;
- frag=_dec->state.frags+mb->map[mapi>>2][bi];
- frag->mbmode=mb_mode;
- frag->mv[0]=cbmvs[bi][0];
- frag->mv[1]=cbmvs[bi][1];
- }
- }
- }
- break;
- case OC_MODE_INTER_MV:
- {
- last_mv[1][0]=last_mv[0][0];
- last_mv[1][1]=last_mv[0][1];
- mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
- mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-
-#ifdef _TH_DEBUG_
- if((j&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",mbmv[0],mbmv[1]);
- j++;
-#endif
-
- }
- break;
- case OC_MODE_INTER_MV_LAST:
- {
+ case OC_MODE_INTER_MV_FOUR:{
+ oc_mv lbmvs[4];
+ int bi;
+ /*Mark the tail of the list, so we don't accidentally go past it.*/
+ coded[ncoded]=-1;
+ for(bi=codedi=0;bi<4;bi++){
+ if(coded[codedi]==bi){
+ codedi++;
+ frag=_dec->state.frags+mb->map[0][bi];
+ frag->mbmode=mb_mode;
+ frag->mv[0]=lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ frag->mv[1]=lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ }
+ else lbmvs[bi][0]=lbmvs[bi][1]=0;
+ }
+ if(codedi>0){
+ last_mv[1][0]=last_mv[0][0];
+ last_mv[1][1]=last_mv[0][1];
+ last_mv[0][0]=lbmvs[coded[codedi-1]][0];
+ last_mv[0][1]=lbmvs[coded[codedi-1]][1];
+ }
+ if(codedi<ncoded){
+ (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+ for(;codedi<ncoded;codedi++){
+ mapi=coded[codedi];
+ bi=mapi&3;
+ frag=_dec->state.frags+mb->map[mapi>>2][bi];
+ frag->mbmode=mb_mode;
+ frag->mv[0]=cbmvs[bi][0];
+ frag->mv[1]=cbmvs[bi][1];
+ }
+ }
+ }break;
+ case OC_MODE_INTER_MV:{
+ last_mv[1][0]=last_mv[0][0];
+ last_mv[1][1]=last_mv[0][1];
+ mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+ }break;
+ case OC_MODE_INTER_MV_LAST:{
mbmv[0]=last_mv[0][0];
mbmv[1]=last_mv[0][1];
- }
- break;
- case OC_MODE_INTER_MV_LAST2:
- {
+ }break;
+ case OC_MODE_INTER_MV_LAST2:{
mbmv[0]=last_mv[1][0];
mbmv[1]=last_mv[1][1];
last_mv[1][0]=last_mv[0][0];
last_mv[1][1]=last_mv[0][1];
last_mv[0][0]=mbmv[0];
last_mv[0][1]=mbmv[1];
- }
- break;
- case OC_MODE_GOLDEN_MV:
- {
+ }break;
+ case OC_MODE_GOLDEN_MV:{
mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-
-#ifdef _TH_DEBUG_
- if((j&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",mbmv[0],mbmv[1]);
- j++;
-#endif
-
- }
- break;
- default:
- mbmv[0]=mbmv[1]=0;
- break;
+ }break;
+ default:mbmv[0]=mbmv[1]=0;break;
}
/*4MV mode fills in the fragments itself.
For all other modes we can use this common code.*/
@@ -773,9 +632,6 @@
}
}
}
-
- TH_DEBUG("\n}\n");
-
}
static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
@@ -1362,7 +1218,7 @@
if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
_dec->variances=(int *)_ogg_realloc(_dec->variances,
_dec->state.fplanes[0].nfrags*sizeof(_dec->variances[0]));
- _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
+ _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
_dec->pp_frame_data,frame_sz*sizeof(_dec->pp_frame_data[0]));
_dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
_dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
@@ -1382,7 +1238,7 @@
c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
c_sz=c_w*c_h;
frame_sz+=c_sz<<1;
- _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
+ _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
_dec->pp_frame_data,frame_sz*sizeof(_dec->pp_frame_data[0]));
_dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
_dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
@@ -1503,9 +1359,6 @@
for(fragy=fragy0;fragy<fragy_end;fragy++){
for(fragx=0;fragx<fplane->nhfrags;fragx++,frag++){
if(!frag->coded)continue;
-#ifdef _TH_DEBUG_
- frag->quant[0] = frag->dc; /* stash un-predicted dc for debug output */
-#endif
pred_last[OC_FRAME_FOR_MODE[frag->mbmode]]=frag->dc+=
oc_frag_pred_dc(frag,fplane,fragx,fragy,pred_last);
ncoded_fragis++;
@@ -1597,40 +1450,6 @@
_pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
oc_state_frag_copy(&_dec->state,_pipe->uncoded_fragis[_pli],
_pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
-
-#ifdef _TH_DEBUG_
- {
- int i,j,k;
- int framei=_dec->state.ref_frame_idx[OC_FRAME_SELF];
- int ystride=_dec->state.ref_frame_bufs[framei][_pli].stride;
- int *fragi_end = _pipe->coded_fragis[_pli];
- int *fragi = fragi_end-_pipe->ncoded_fragis[_pli];
-
- for(;fragi<fragi_end;fragi++){
- oc_fragment *frag=_dec->state.frags+*fragi;
- unsigned char *src=frag->buffer[framei];
- for(i=0,j=0;j<8;j++){
- for(k=0;k<8;k++,i++)
- frag->recon[i] = src[k];
- src+=ystride;
- }
- }
-
- fragi = _pipe->uncoded_fragis[_pli];
- fragi_end = fragi+_pipe->nuncoded_fragis[_pli];
-
- for(;fragi<fragi_end;fragi++){
- oc_fragment *frag=_dec->state.frags+*fragi;
- unsigned char *src=frag->buffer[framei];
- for(i=0,j=0;j<8;j++){
- for(k=0;k<8;k++,i++)
- frag->recon[i] = src[k];
- src+=ystride;
- }
- }
- }
-#endif
-
}
/*Filter a horizontal block edge.*/
@@ -2039,7 +1858,6 @@
/*A completely empty packet indicates a dropped frame and is treated exactly
like an inter frame with no coded blocks.
Only proceed if we have a non-empty packet.*/
-
if(_op->bytes!=0){
oc_dec_pipeline_state pipe;
th_ycbcr_buffer stripe_buf;
@@ -2093,7 +1911,6 @@
}
oc_dec_block_qis_unpack(_dec);
oc_dec_residual_tokens_unpack(_dec);
-
/*Update granule position.
This must be done before the striped decode callbacks so that the
application knows what to do with the frame data.*/
@@ -2203,91 +2020,6 @@
}
notstart=1;
}
-
-#ifdef _TH_DEBUG_
- {
- int x,y,i,j,k,xn,yn;
- int plane;
- int buf;
-
- /* dump fragment DCT components */
- for(plane=0;plane<3;plane++){
- char *plstr;
- int offset;
- switch(plane){
- case 0:
- plstr="Y";
- xn = _dec->state.info.frame_width>>3;
- yn = _dec->state.info.frame_height>>3;
- offset = 0;
- break;
- case 1:
- plstr="U";
- xn = _dec->state.info.frame_width>>4;
- yn = _dec->state.info.frame_height>>4;
- offset = xn*yn*4;
- break;
- case 2:
- plstr="V";
- xn = _dec->state.info.frame_width>>4;
- yn = _dec->state.info.frame_height>>4;
- offset = xn*yn*5;
- break;
- }
- for(y=0;y<yn;y++){
- for(x=0;x<xn;x++,i++){
-
- for(buf=0;buf<4;buf++){
- int *ptr;
- char *bufn;
- int codecheck=0;
-
- i = offset + y*xn + x;
-
- switch(buf){
- case 0:
- codecheck=1;
- bufn = "coded";
- ptr = _dec->state.frags[i].quant;
- break;
- case 1:
- codecheck=1;
- bufn = "coeff";
- ptr = _dec->state.frags[i].freq;
- break;
- case 2:
- codecheck=1;
- bufn = "idct";
- ptr = _dec->state.frags[i].time;
- break;
- case 3:
- bufn = "recon";
- ptr = _dec->state.frags[i].loop;
- break;
- }
-
-
- TH_DEBUG("%s %s [%d][%d] = {",bufn,plstr,x,y);
- if(codecheck && !_dec->state.frags[i].coded)
- TH_DEBUG(" not coded }\n");
- else{
- int l=0;
- for(j=0;j<8;j++){
- TH_DEBUG("\n ");
- for(k=0;k<8;k++,l++){
- TH_DEBUG("%d ",ptr[l]);
- }
- }
- TH_DEBUG(" }\n");
- }
- }
- TH_DEBUG("\n");
- }
- }
- }
- }
-#endif
-
/*Finish filling in the reference frame borders.*/
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
/*Update the reference frame indices.*/
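
For reference (not part of the patch): oc_dec_frame_header_unpack() above reads between one and three quantizer indices, each a 6-bit qi followed by a single continuation bit that says whether another follows. A minimal sketch of just that layout (read_qi_list is an illustrative name), using the theorapackB calls from bitwise.h:

/*Reads the 1-3 entry qi list from a frame header; returns the count.*/
static int read_qi_list(oggpack_buffer *_opb,int _qis[3]){
  long val;
  int  nqis;
  nqis=0;
  for(;;){
    theorapackB_read(_opb,6,&val);
    _qis[nqis++]=(int)val;
    if(nqis>=3)break;
    theorapackB_read1(_opb,&val);
    if(!val)break;
  }
  return nqis;
}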
Modified: trunk/theora/lib/dec/dequant.c
===================================================================
--- trunk/theora/lib/dec/dequant.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/dequant.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -127,65 +127,6 @@
}
while(qri-->0);
}
-
-#ifdef _TH_DEBUG_
- /* dump the tables */
- {
- int i, j, k, l, m;
- TH_DEBUG("loop filter limits = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",_qinfo->loop_filter_limits[i]);
- }
- TH_DEBUG("\n}\n\n");
-
- TH_DEBUG("ac scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",_qinfo->ac_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
-
- TH_DEBUG("dc scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",_qinfo->dc_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
-
- for(k=0;k<2;k++)
- for(l=0;l<3;l++){
- char *name[2][3]={
- {"intra Y bases","intra U bases", "intra V bases"},
- {"inter Y bases","inter U bases", "inter V bases"}
- };
-
- th_quant_ranges *r = &_qinfo->qi_ranges[k][l];
- TH_DEBUG("%s = {\n",name[k][l]);
- TH_DEBUG(" ranges = %d\n",r->nranges);
- TH_DEBUG(" intervals = { ");
- for(i=0;i<r->nranges;i++)
- TH_DEBUG("%3d ",r->sizes[i]);
- TH_DEBUG("}\n");
- TH_DEBUG("\n matricies = { ");
- for(m=0;m<r->nranges+1;m++){
- TH_DEBUG("\n { ");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<8;i++,j++)
- TH_DEBUG("%3d ",r->base_matrices[m][i]);
- }
- TH_DEBUG("\n }");
- }
- TH_DEBUG("\n }\n");
- }
- }
-
-#endif
-
_ogg_free(base_mats);
return 0;
}
@@ -227,4 +168,3 @@
_ogg_free((void *)_qinfo->qi_ranges[qti][pli].base_matrices);
}
}
-
Modified: trunk/theora/lib/dec/idct.c
===================================================================
--- trunk/theora/lib/dec/idct.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/idct.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -169,7 +169,6 @@
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
}
-
/*Performs an inverse 8 point Type-II DCT transform.
The output is scaled by a factor of 2 relative to the orthonormal version of
the transform.
@@ -204,7 +203,6 @@
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
}
-
/*Performs an inverse 8 point Type-II DCT transform.
The output is scaled by a factor of 2 relative to the orthonormal version of
the transform.
Modified: trunk/theora/lib/dec/quant.c
===================================================================
--- trunk/theora/lib/dec/quant.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/quant.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -39,114 +39,84 @@
qi values change between frames (this is what VP3 did).*/
void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
int _pp_dc_scale[64],const th_quant_info *_qinfo){
- int qti; /* coding mode: intra or inter */
- int pli; /* Y U V */
+ /*coding mode: intra or inter.*/
+ int qti;
+ /*Y', C_b, C_r*/
+ int pli;
for(qti=0;qti<2;qti++){
for(pli=0;pli<3;pli++){
oc_quant_tables stage;
-
- int qi; /* quality index */
- int qri; /* range iterator */
-
+ /*Quality index.*/
+ int qi;
+ /*Range iterator.*/
+ int qri;
for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges; qri++){
- th_quant_base base;
-
- ogg_uint32_t q;
- int qi_start;
- int qi_end;
- int ci;
- memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
- sizeof(base));
-
- qi_start=qi;
- if(qri==_qinfo->qi_ranges[qti][pli].nranges)
- qi_end=qi+1;
- else
- qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
-
- /* Iterate over quality indicies in this range */
- for(;;){
-
- /*In the original VP3.2 code, the rounding offset and the size of the
- dead zone around 0 were controlled by a "sharpness" parameter.
- The size of our dead zone is now controlled by the per-coefficient
- quality thresholds returned by our HVS module.
- We round down from a more accurate value when the quality of the
- reconstruction does not fall below our threshold and it saves bits.
- Hence, all of that VP3.2 code is gone from here, and the remaining
- floating point code has been implemented as equivalent integer code
- with exact precision.*/
-
- /* for postprocess, not dequant */
- if(_pp_dc_scale!=NULL)
- _pp_dc_scale[qi]=(int)((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/160);
-
- /*Scale DC the coefficient from the proper table.*/
- q=((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/100)<<2;
- q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- stage[qi][0]=(ogg_uint16_t)q;
-
- /*Now scale AC coefficients from the proper table.*/
- for(ci=1;ci<64;ci++){
- q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
- q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- stage[qi][ci]=(ogg_uint16_t)q;
- }
-
- if(++qi>=qi_end)break;
-
- /*Interpolate the next base matrix.*/
- for(ci=0;ci<64;ci++){
- base[ci]=(unsigned char)
- ((2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
- (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
- +_qinfo->qi_ranges[qti][pli].sizes[qri])/
- (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
- }
- }
+ th_quant_base base;
+ ogg_uint32_t q;
+ int qi_start;
+ int qi_end;
+ int ci;
+ memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
+ sizeof(base));
+ qi_start=qi;
+ if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
+ else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
+ /*Iterate over quality indicies in this range.*/
+ for(;;){
+ ogg_uint32_t qfac;
+ /*In the original VP3.2 code, the rounding offset and the size of the
+ dead zone around 0 were controlled by a "sharpness" parameter.
+ The size of our dead zone is now controlled by the per-coefficient
+ quality thresholds returned by our HVS module.
+ We round down from a more accurate value when the quality of the
+ reconstruction does not fall below our threshold and it saves bits.
+ Hence, all of that VP3.2 code is gone from here, and the remaining
+ floating point code has been implemented as equivalent integer code
+ with exact precision.*/
+ qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
+ /*For postprocessing, not dequantization.*/
+ if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
+ /*Scale DC the coefficient from the proper table.*/
+ q=(qfac/100)<<2;
+ q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ stage[qi][0]=(ogg_uint16_t)q;
+ /*Now scale AC coefficients from the proper table.*/
+ for(ci=1;ci<64;ci++){
+ q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
+ q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ stage[qi][ci]=(ogg_uint16_t)q;
+ }
+ if(++qi>=qi_end)break;
+ /*Interpolate the next base matrix.*/
+ for(ci=0;ci<64;ci++){
+ base[ci]=(unsigned char)(
+ (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+ (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+ +_qinfo->qi_ranges[qti][pli].sizes[qri])/
+ (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
+ }
+ }
}
-
- /* Staging matricies complete; commit to memory only if this
- isn't a duplicate of a preceeding plane. This simple check
- helps us improve cache coherency later.*/
+ /*Staging matrices complete; commit to memory only if this isn't a
+ duplicate of a preceeding plane.
+ This simple check helps us improve cache coherency later.*/
{
- int dupe = 0;
- int i,j;
- for(i=0;i<=qti;i++){
- for(j=0;j<(i<qti?3:pli);j++){
- if(!memcmp(stage,_dequant[i][j],sizeof(stage))){
- dupe = 1;
- break;
- }
- }
- if(dupe)break;
- }
- if(dupe){
- _dequant[qti][pli]=_dequant[i][j];
- }else{
- memcpy(_dequant[qti][pli],stage,sizeof(stage));
- }
+ int dupe;
+ int qtj;
+ int plj;
+ dupe=0;
+ for(qtj=0;qtj<=qti;qtj++){
+ for(plj=0;plj<(qtj<qti?3:pli);plj++){
+ if(!memcmp(stage,_dequant[qtj][plj],sizeof(stage))){
+ dupe=1;
+ break;
+ }
+ }
+ if(dupe)break;
+ }
+ if(dupe)_dequant[qti][pli]=_dequant[qtj][plj];
+ else memcpy(_dequant[qti][pli],stage,sizeof(stage));
}
}
}
-
-#ifdef _TH_DEBUG_
- int i, j, k, l;
- /* dump the calculated quantizer tables */
- for(i=0;i<2;i++){
- for(j=0;j<3;j++){
- for(k=0;k<64;k++){
- TH_DEBUG("quantizer table [%s][%s][Q%d] = {",
- (i==0?"intra":"inter"),(j==0?"Y":(j==1?"U":"V")),k);
- for(l=0;l<64;l++){
- if((l&7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%4d ",_dequant[i][j][k][l]);
- }
- TH_DEBUG("}\n");
- }
- }
- }
-#endif
-
}
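
Between the base matrices that bracket each qi range, the loop above fills in the intermediate matrices with a rounded linear interpolation. A minimal sketch (not part of the patch; interp_base and its parameters are illustrative names) of the per-coefficient formula, where _size is the range length qi_end-qi_start, and _lo/_hi are the bracketing base-matrix entries:

/*Rounded linear blend of the two base matrices bounding the range.*/
static unsigned char interp_base(int _qi,int _qi_start,int _qi_end,
 int _size,unsigned char _lo,unsigned char _hi){
  return (unsigned char)(
   (2*((_qi_end-_qi)*_lo+(_qi-_qi_start)*_hi)+_size)/(2*_size));
}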
Modified: trunk/theora/lib/dec/quant.h
===================================================================
--- trunk/theora/lib/dec/quant.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/quant.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -40,7 +40,6 @@
void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
- int _pp_dc_scale[64],
- const th_quant_info *_qinfo);
+ int _pp_dc_scale[64],const th_quant_info *_qinfo);
#endif
Modified: trunk/theora/lib/dec/state.c
===================================================================
--- trunk/theora/lib/dec/state.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/state.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -831,37 +831,11 @@
ogg_int16_t p;
/*Why is the iquant product rounded in this case and no others?
Who knows.*/
-
p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
/*LOOP VECTORIZES.*/
for(ci=0;ci<64;ci++)res_buf[ci]=p;
-
-#ifdef _TH_DEBUG_
- {
- int i;
- _frag->freq[0] = _frag->dc*_dc_iquant;
- _frag->time[0] = p;
- for(i=1;i<64;i++){
- _frag->quant[i] = 0;
- _frag->freq[i] = 0;
- _frag->time[i] = p;
- }
- }
-#endif
-
}
else{
-
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=1;i<_ncoefs;i++)
- _frag->quant[i] = _dct_coeffs[i];
- for(;i<64;i++)
- _frag->quant[i] = 0;
- }
-#endif
-
/*First, dequantize the coefficients.*/
dct_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
for(zzi=1;zzi<_ncoefs;zzi++){
@@ -869,21 +843,6 @@
ci=OC_FZIG_ZAG[zzi];
dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
}
-
-#ifdef _TH_DEBUG_
- for(;zzi<64;zzi++){
- int ci;
- ci=OC_FZIG_ZAG[zzi];
- dct_buf[ci]=0;
- }
-
- {
- int i;
- for(i=0;i<64;i++)
- _frag->freq[i] = dct_buf[i];
- }
-#endif
-
/*Then, fill in the remainder of the coefficients with 0's, and perform
the iDCT.*/
if(_last_zzi<10){
@@ -894,15 +853,6 @@
for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
oc_idct8x8_c(res_buf,dct_buf);
}
-
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- _frag->time[i] = res_buf[i];
- }
-#endif
-
}
/*Fill in the target buffer.*/
dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
@@ -1038,7 +988,7 @@
}
void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+ int _refi,int _pli,int _fragy0,int _fragy_end){
th_img_plane *iplane;
oc_fragment_plane *fplane;
oc_fragment *frag_top;
@@ -1050,7 +1000,6 @@
_bv+=127;
iplane=_state->ref_frame_bufs[_refi]+_pli;
fplane=_state->fplanes+_pli;
-
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
@@ -1079,46 +1028,6 @@
iplane->stride,_bv);
}
}
-
-
-#ifdef _TH_DEBUG_
- {
- int i,j,k,l;
- unsigned char *src;
-
- for(l=0;l<5;l++){
- oc_fragment *f;
- switch(l){
- case 0:
- f = frag;
- break;
- case 1: /* left */
- if(frag == frag0)continue;
- f = frag-1;
- break;
- case 2: /* bottom (top once flipped) */
- if(frag0 == frag_top)continue;
- f = frag - fplane->nhfrags;
- break;
- case 3: /* right */
- if(frag+1 >= frag_end) continue;
- f = frag + 1;
- break;
- case 4: /* top (bottom once flipped) */
- if(frag+fplane->nhfrags >= frag_bot)continue;
- f = frag + fplane->nhfrags;
- break;
- }
-
- src = f->buffer[_refi];
- for(i=0,j=0;j<8;j++){
- for(k=0;k<8;k++,i++)
- f->loop[i] = src[k];
- src+=iplane->stride;
- }
- }
- }
-#endif
frag++;
}
frag0+=fplane->nhfrags;
Modified: trunk/theora/lib/dec/x86/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxfrag.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86/mmxfrag.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -21,7 +21,7 @@
Note: Loops are unrolled for best performance.
The iteration each instruction belongs to is marked in the comments as #i.*/
#include "x86int.h"
-#include <stdlib.h>
+#include <stddef.h>
#if defined(USE_ASM)
Modified: trunk/theora/lib/dec/x86/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxstate.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86/mmxstate.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -19,7 +19,7 @@
Originally written by Rudolf Marek.*/
#include "x86int.h"
#include "../../internal.h"
-#include <stdlib.h>
+#include <stddef.h>
#if defined(USE_ASM)
Modified: trunk/theora/lib/dec/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxidct.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/mmxidct.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -11,7 +11,7 @@
********************************************************************
function:
- last mod: $Id:
+ last mod: $Id:
********************************************************************/
@@ -30,7 +30,7 @@
#include "x86int.h"
/*A table of constants used by the MMX routines.*/
-static const __declspec(align(16)) ogg_uint16_t
+static const __declspec(align(16)) ogg_uint16_t
OC_IDCT_CONSTS[(7+1)*4]={
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
@@ -53,475 +53,475 @@
void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
_asm {
mov edx, [_y]
- mov eax, offset OC_IDCT_CONSTS
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 18H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 38H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 28H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 08H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 20H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 10H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 08H], mm4
- punpckhwd mm0, mm7
- movq [edx + 18H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx]
- punpckldq mm1, mm0
- movq mm5, [edx + 10H]
- movq mm0, mm4
- movq [edx + 38H], mm6
- punpcklwd mm0, mm5
- movq [edx + 28H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx], mm0
- punpckhwd mm5, mm3
- movq [edx + 10H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 30H], mm4
- movq [edx + 20H], mm2
- movq mm2, [edx + 70H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 50H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 60H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 50H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 60H], mm6
- movq mm2, mm0
- movq mm6, [edx + 40H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 50H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 60H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 50H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx + 40H], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 48H], mm4
- punpckhwd mm0, mm7
- movq [edx + 58H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx + 40H]
- punpckldq mm1, mm0
- movq mm5, [edx + 50H]
- movq mm0, mm4
- movq [edx + 78H], mm6
- punpcklwd mm0, mm5
- movq [edx + 68H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx + 40H], mm0
- punpckhwd mm5, mm3
- movq [edx + 50H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 70H], mm4
- movq [edx + 60H], mm2
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 50H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 70H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 60H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 40H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 20H]
- paddw mm7, mm7
- movq [edx + 20H], mm2
- paddw mm7, mm4
- movq [edx + 10H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 40H], mm4
- psraw mm5, 4
- movq [edx + 30H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 60H], mm6
- psraw mm0, 4
- movq [edx + 50H], mm5
- movq [edx + 70H], mm7
- movq [edx], mm0
- movq mm2, [edx + 38H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 18H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 28H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 18H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 28H], mm6
- movq mm2, mm0
- movq mm6, [edx + 08H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 18H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 28H]
- paddw mm7, mm7
- movq [edx + 28H], mm2
- paddw mm7, mm4
- movq [edx + 18H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 48H], mm4
- psraw mm5, 4
- movq [edx + 38H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 68H], mm6
- psraw mm0, 4
- movq [edx + 58H], mm5
- movq [edx + 78H], mm7
- movq [edx + 08H], mm0
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
/* emms */
}
}
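For reference, the row passes above each finish by adding the constant at [eax + 38H] of OC_IDCT_CONSTS and arithmetic-shifting every result right by 4 (the paddw/psraw pairs). Assuming that constant is the usual rounding bias of 8 in each 16-bit lane (it lives outside this hunk), the scalar equivalent of the final descale is simply the following sketch; the function name is illustrative only:

/*Scalar sketch of the final descale step (assumes OC_IDCT_CONSTS+0x38
  holds the value 8 in every word lane).*/
static ogg_int16_t oc_idct_descale(ogg_int32_t _t){
  return (ogg_int16_t)((_t+8)>>4);
}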
@@ -530,477 +530,477 @@
void oc_idct8x8_mmx(ogg_int16_t _y[64]){
_asm {
mov edx, [_y]
- mov eax, offset OC_IDCT_CONSTS
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 18H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 38H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 28H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 08H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 20H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 10H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 08H], mm4
- punpckhwd mm0, mm7
- movq [edx + 18H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx]
- punpckldq mm1, mm0
- movq mm5, [edx + 10H]
- movq mm0, mm4
- movq [edx + 38H], mm6
- punpcklwd mm0, mm5
- movq [edx + 28H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx], mm0
- punpckhwd mm5, mm3
- movq [edx + 10H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 30H], mm4
- movq [edx + 20H], mm2
- movq mm2, [edx + 70H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 50H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 60H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 50H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 60H], mm6
- movq mm2, mm0
- movq mm6, [edx + 40H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 50H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- movq mm3, [edx + 60H]
- psubw mm4, mm7
- paddw mm1, mm1
- paddw mm7, mm7
- paddw mm1, mm2
- paddw mm7, mm4
- psubw mm4, mm3
- paddw mm3, mm3
- psubw mm6, mm5
- paddw mm5, mm5
- paddw mm3, mm4
- paddw mm5, mm6
- psubw mm7, mm0
- paddw mm0, mm0
- movq [edx + 50H], mm1
- paddw mm0, mm7
- movq mm1, mm4
- punpcklwd mm4, mm5
- movq [edx + 40H], mm0
- punpckhwd mm1, mm5
- movq mm0, mm6
- punpcklwd mm6, mm7
- movq mm5, mm4
- punpckldq mm4, mm6
- punpckhdq mm5, mm6
- movq mm6, mm1
- movq [edx + 48H], mm4
- punpckhwd mm0, mm7
- movq [edx + 58H], mm5
- punpckhdq mm6, mm0
- movq mm4, [edx + 40H]
- punpckldq mm1, mm0
- movq mm5, [edx + 50H]
- movq mm0, mm4
- movq [edx + 78H], mm6
- punpcklwd mm0, mm5
- movq [edx + 68H], mm1
- punpckhwd mm4, mm5
- movq mm5, mm2
- punpcklwd mm2, mm3
- movq mm1, mm0
- punpckldq mm0, mm2
- punpckhdq mm1, mm2
- movq mm2, mm4
- movq [edx + 40H], mm0
- punpckhwd mm5, mm3
- movq [edx + 50H], mm1
- punpckhdq mm4, mm5
- punpckldq mm2, mm5
- movq [edx + 70H], mm4
- movq [edx + 60H], mm2
- movq mm2, [edx + 30H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 50H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 10H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 70H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 20H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 60H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 10H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 20H], mm6
- movq mm2, mm0
- movq mm6, [edx]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 40H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 10H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 20H]
- paddw mm7, mm7
- movq [edx + 20H], mm2
- paddw mm7, mm4
- movq [edx + 10H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 40H], mm4
- psraw mm5, 4
- movq [edx + 30H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 60H], mm6
- psraw mm0, 4
- movq [edx + 50H], mm5
- movq [edx + 70H], mm7
- movq [edx], mm0
- movq mm2, [edx + 38H]
- movq mm6, [eax + 10H]
- movq mm4, mm2
- movq mm7, [edx + 58H]
- pmulhw mm4, mm6
- movq mm1, [eax + 20H]
- pmulhw mm6, mm7
- movq mm5, mm1
- pmulhw mm1, mm2
- movq mm3, [edx + 18H]
- pmulhw mm5, mm7
- movq mm0, [eax]
- paddw mm4, mm2
- paddw mm6, mm7
- paddw mm2, mm1
- movq mm1, [edx + 78H]
- paddw mm7, mm5
- movq mm5, mm0
- pmulhw mm0, mm3
- paddw mm4, mm7
- pmulhw mm5, mm1
- movq mm7, [eax + 30H]
- psubw mm6, mm2
- paddw mm0, mm3
- pmulhw mm3, mm7
- movq mm2, [edx + 28H]
- pmulhw mm7, mm1
- paddw mm5, mm1
- movq mm1, mm2
- pmulhw mm2, [eax + 08H]
- psubw mm3, mm5
- movq mm5, [edx + 68H]
- paddw mm0, mm7
- movq mm7, mm5
- psubw mm0, mm4
- pmulhw mm5, [eax + 08H]
- paddw mm2, mm1
- pmulhw mm1, [eax + 28H]
- paddw mm4, mm4
- paddw mm4, mm0
- psubw mm3, mm6
- paddw mm5, mm7
- paddw mm6, mm6
- pmulhw mm7, [eax + 28H]
- paddw mm6, mm3
- movq [edx + 18H], mm4
- psubw mm1, mm5
- movq mm4, [eax + 18H]
- movq mm5, mm3
- pmulhw mm3, mm4
- paddw mm7, mm2
- movq [edx + 28H], mm6
- movq mm2, mm0
- movq mm6, [edx + 08H]
- pmulhw mm0, mm4
- paddw mm5, mm3
- movq mm3, [edx + 48H]
- psubw mm5, mm1
- paddw mm2, mm0
- psubw mm6, mm3
- movq mm0, mm6
- pmulhw mm6, mm4
- paddw mm3, mm3
- paddw mm1, mm1
- paddw mm3, mm0
- paddw mm1, mm5
- pmulhw mm4, mm3
- paddw mm6, mm0
- psubw mm6, mm2
- paddw mm2, mm2
- movq mm0, [edx + 18H]
- paddw mm2, mm6
- paddw mm4, mm3
- psubw mm2, mm1
- paddw mm2, [eax + 38H]
- paddw mm1, mm1
- paddw mm1, mm2
- psraw mm2, 4
- psubw mm4, mm7
- psraw mm1, 4
- movq mm3, [edx + 28H]
- paddw mm7, mm7
- movq [edx + 28H], mm2
- paddw mm7, mm4
- movq [edx + 18H], mm1
- psubw mm4, mm3
- paddw mm4, [eax + 38H]
- paddw mm3, mm3
- paddw mm3, mm4
- psraw mm4, 4
- psubw mm6, mm5
- psraw mm3, 4
- paddw mm6, [eax + 38H]
- paddw mm5, mm5
- paddw mm5, mm6
- psraw mm6, 4
- movq [edx + 48H], mm4
- psraw mm5, 4
- movq [edx + 38H], mm3
- psubw mm7, mm0
- paddw mm7, [eax + 38H]
- paddw mm0, mm0
- paddw mm0, mm7
- psraw mm7, 4
- movq [edx + 68H], mm6
- psraw mm0, 4
- movq [edx + 58H], mm5
- movq [edx + 78H], mm7
- movq [edx + 08H], mm0
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
/* emms */
}
}
-#endif
\ No newline at end of file
+#endif
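A note on the arithmetic used throughout the transform above: pmulhw keeps the high 16 bits of a signed 16x16 product, i.e. (a*b)>>16, so a cosine constant c in [0.5,1) is typically stored as round(c*65536)-65536 to fit in a signed word, and the input is added back immediately after the multiply (the paddw that follows many of the pmulhw instructions). A scalar sketch with illustrative names; the real constant layout is in OC_IDCT_CONSTS, outside this hunk:

/*High 16 bits of a signed 16x16 product, as pmulhw computes it.*/
static ogg_int16_t oc_pmulhw1(ogg_int16_t _a,ogg_int16_t _b){
  return (ogg_int16_t)(((ogg_int32_t)_a*_b)>>16);
}
/*Multiply by a constant c>=0.5 stored as c*65536-65536; the +_x matches
  the paddw add-back in the assembly above.*/
static ogg_int16_t oc_mul_c(ogg_int16_t _x,ogg_int16_t _c_minus_1){
  return (ogg_int16_t)(oc_pmulhw1(_x,_c_minus_1)+_x);
}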
Modified: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxloopfilter.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/mmxloopfilter.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -11,7 +11,7 @@
********************************************************************
function:
- last mod: $Id:
+ last mod: $Id:
********************************************************************/
@@ -21,7 +21,7 @@
Originally written by Rudolf Marek, based on code from On2's VP3.
Converted to Visual Studio inline assembly by Nils Pipenbrinck.
- Note: I can't test these since my example files never get into the
+ Note: I can't test these since my example files never get into the
loop filters, but the code has been converted semi-automatic from
the GCC sources, so it ought to work.
---------------------------------------------------------------------*/
@@ -33,7 +33,7 @@
-static void loop_filter_v(unsigned char *_pix,int _ystride,
+static void loop_filter_v(unsigned char *_pix,int _ystride,
const ogg_int16_t *_ll){
_asm {
mov eax, [_pix]
@@ -41,134 +41,134 @@
mov ebx, [_ll]
/* _pix -= ystride */
- sub eax, edx
+ sub eax, edx
/* mm0=0 */
- pxor mm0, mm0
+ pxor mm0, mm0
/* _pix -= ystride */
- sub eax, edx
+ sub eax, edx
/* esi=_ystride*3 */
- lea esi, [edx + edx*2]
+ lea esi, [edx + edx*2]
- /* mm7=_pix[0...8]*/
- movq mm7, [eax]
- /* mm4=_pix[0...8+_ystride*3]*/
- movq mm4, [eax + esi]
- /* mm6=_pix[0...8]*/
- movq mm6, mm7
- /* Expand unsigned _pix[0...3] to 16 bits.*/
- punpcklbw mm6, mm0
- movq mm5, mm4
+ /* mm7=_pix[0...8]*/
+ movq mm7, [eax]
+ /* mm4=_pix[0...8+_ystride*3]*/
+ movq mm4, [eax + esi]
+ /* mm6=_pix[0...8]*/
+ movq mm6, mm7
+ /* Expand unsigned _pix[0...3] to 16 bits.*/
+ punpcklbw mm6, mm0
+ movq mm5, mm4
/* Expand unsigned _pix[4...7] to 16 bits.*/
- punpckhbw mm7, mm0
- punpcklbw mm4, mm0
- /* Expand other arrays too.*/
- punpckhbw mm5, mm0
- /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
- psubw mm6, mm4
- psubw mm7, mm5
- /*mm5=mm4=_pix[0...7+_ystride]*/
- movq mm4, [eax + edx]
- /*mm1=mm3=mm2=_pix[0..7]+_ystride*2]*/
- movq mm2, [eax + edx*2]
- movq mm5, mm4
- movq mm3, mm2
- movq mm1, mm2
- /*Expand these arrays.*/
- punpckhbw mm5, mm0
- punpcklbw mm4, mm0
- punpckhbw mm3, mm0
- punpcklbw mm2, mm0
- pcmpeqw mm0, mm0
- /*mm0=3 3 3 3
+ punpckhbw mm7, mm0
+ punpcklbw mm4, mm0
+ /* Expand other arrays too.*/
+ punpckhbw mm5, mm0
+ /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+ psubw mm6, mm4
+ psubw mm7, mm5
+ /*mm5=mm4=_pix[0...7+_ystride]*/
+ movq mm4, [eax + edx]
+ /*mm1=mm3=mm2=_pix[0..7+_ystride*2]*/
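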
+ movq mm2, [eax + edx*2]
+ movq mm5, mm4
+ movq mm3, mm2
+ movq mm1, mm2
+ /*Expand these arrays.*/
+ punpckhbw mm5, mm0
+ punpcklbw mm4, mm0
+ punpckhbw mm3, mm0
+ punpcklbw mm2, mm0
+ pcmpeqw mm0, mm0
+ /*mm0=3 3 3 3
mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
- psubw mm3, mm5
- psrlw mm0, 14
- psubw mm2, mm4
+ psubw mm3, mm5
+ psrlw mm0, 14
+ psubw mm2, mm4
/*Scale by 3.*/
- pmullw mm3, mm0
- pmullw mm2, mm0
+ pmullw mm3, mm0
+ pmullw mm2, mm0
/*mm0=4 4 4 4
f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
- psrlw mm0, 1
- paddw mm3, mm7
- psllw mm0, 2
+ psrlw mm0, 1
+ paddw mm3, mm7
+ psllw mm0, 2
paddw mm2, mm6
/*Add 4.*/
- paddw mm3, mm0
- paddw mm2, mm0
- /*"Divide" by 8.*/
- psraw mm3, 3
- psraw mm2, 3
- /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/
+ paddw mm3, mm0
+ paddw mm2, mm0
+ /*"Divide" by 8.*/
+ psraw mm3, 3
+ psraw mm2, 3
+ /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
/*Free up mm5.*/
- packuswb mm4, mm5
+ packuswb mm4, mm5
/*mm0=L L L L*/
- movq mm0, [ebx]
+ movq mm0, [ebx]
/*if(R_i<-2L||R_i>2L)R_i=0:*/
- movq mm5, mm2
- pxor mm6, mm6
- movq mm7, mm0
- psubw mm6, mm0
- psllw mm7, 1
- psllw mm6, 1
+ movq mm5, mm2
+ pxor mm6, mm6
+ movq mm7, mm0
+ psubw mm6, mm0
+ psllw mm7, 1
+ psllw mm6, 1
/*mm2==R_3 R_2 R_1 R_0*/
/*mm5==R_3 R_2 R_1 R_0*/
/*mm6==-2L -2L -2L -2L*/
/*mm7==2L 2L 2L 2L*/
- pcmpgtw mm7, mm2
- pcmpgtw mm5, mm6
- pand mm2, mm7
- movq mm7, mm0
- pand mm2, mm5
- psllw mm7, 1
- movq mm5, mm3
+ pcmpgtw mm7, mm2
+ pcmpgtw mm5, mm6
+ pand mm2, mm7
+ movq mm7, mm0
+ pand mm2, mm5
+ psllw mm7, 1
+ movq mm5, mm3
/*mm3==R_7 R_6 R_5 R_4*/
/*mm5==R_7 R_6 R_5 R_4*/
/*mm6==-2L -2L -2L -2L*/
/*mm7==2L 2L 2L 2L*/
- pcmpgtw mm7, mm3
- pcmpgtw mm5, mm6
- pand mm3, mm7
- movq mm7, mm0
- pand mm3, mm5
+ pcmpgtw mm7, mm3
+ pcmpgtw mm5, mm6
+ pand mm3, mm7
+ movq mm7, mm0
+ pand mm3, mm5
/*if(R_i<-L)R_i'=R_i+2L;
if(R_i>L)R_i'=R_i-2L;
if(R_i<-L||R_i>L)R_i=-R_i':*/
- psraw mm6, 1
- movq mm5, mm2
- psllw mm7, 1
+ psraw mm6, 1
+ movq mm5, mm2
+ psllw mm7, 1
/*mm2==R_3 R_2 R_1 R_0*/
/*mm5==R_3 R_2 R_1 R_0*/
/*mm6==-L -L -L -L*/
/*mm0==L L L L*/
/*mm5=R_i>L?FF:00*/
- pcmpgtw mm5, mm0
+ pcmpgtw mm5, mm0
/*mm6=-L>R_i?FF:00*/
- pcmpgtw mm6, mm2
+ pcmpgtw mm6, mm2
/*mm7=R_i>L?2L:0*/
- pand mm7, mm5
+ pand mm7, mm5
/*mm2=R_i>L?R_i-2L:R_i*/
- psubw mm2, mm7
- movq mm7, mm0
+ psubw mm2, mm7
+ movq mm7, mm0
/*mm5=-L>R_i||R_i>L*/
- por mm5, mm6
- psllw mm7, 1
+ por mm5, mm6
+ psllw mm7, 1
/*mm7=-L>R_i?2L:0*/
- pand mm7, mm6
- pxor mm6, mm6
+ pand mm7, mm6
+ pxor mm6, mm6
/*mm2=-L>R_i?R_i+2L:R_i*/
- paddw mm2, mm7
- psubw mm6, mm0
+ paddw mm2, mm7
+ psubw mm6, mm0
/*mm5=-L>R_i||R_i>L?-R_i':0*/
- pand mm5, mm2
- movq mm7, mm0
+ pand mm5, mm2
+ movq mm7, mm0
/*mm2=-L>R_i||R_i>L?0:R_i*/
- psubw mm2, mm5
- psllw mm7, 1
+ psubw mm2, mm5
+ psllw mm7, 1
/*mm2=-L>R_i||R_i>L?-R_i':R_i*/
- psubw mm2, mm5
- movq mm5, mm3
+ psubw mm2, mm5
+ movq mm5, mm3
/*mm3==R_7 R_6 R_5 R_4*/
/*mm5==R_7 R_6 R_5 R_4*/
/*mm6==-L -L -L -L*/
@@ -176,44 +176,44 @@
/*mm6=-L>R_i?FF:00*/
pcmpgtw mm6, mm3
/*mm5=R_i>L?FF:00*/
- pcmpgtw mm5, mm0
+ pcmpgtw mm5, mm0
/*mm7=R_i>L?2L:0*/
- pand mm7, mm5
+ pand mm7, mm5
/*mm2=R_i>L?R_i-2L:R_i*/
- psubw mm3, mm7
- psllw mm0, 1
+ psubw mm3, mm7
+ psllw mm0, 1
/*mm5=-L>R_i||R_i>L*/
- por mm5, mm6
+ por mm5, mm6
/*mm0=-L>R_i?2L:0*/
- pand mm0, mm6
+ pand mm0, mm6
/*mm3=-L>R_i?R_i+2L:R_i*/
- paddw mm3, mm0
+ paddw mm3, mm0
/*mm5=-L>R_i||R_i>L?-R_i':0*/
- pand mm5, mm3
+ pand mm5, mm3
/*mm2=-L>R_i||R_i>L?0:R_i*/
- psubw mm3, mm5
+ psubw mm3, mm5
/*mm3=-L>R_i||R_i>L?-R_i':R_i*/
- psubw mm3, mm5
+ psubw mm3, mm5
/*Unfortunately, there's no unsigned byte+signed byte with unsigned
saturation op code, so we have to promote things back 16 bits.*/
- pxor mm0, mm0
- movq mm5, mm4
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- movq mm6, mm1
- punpcklbw mm1, mm0
- punpckhbw mm6, mm0
+ pxor mm0, mm0
+ movq mm5, mm4
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ movq mm6, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm6, mm0
/*_pix[0...8+_ystride]+=R_i*/
- paddw mm4, mm2
- paddw mm5, mm3
+ paddw mm4, mm2
+ paddw mm5, mm3
/*_pix[0...8+_ystride*2]-=R_i*/
- psubw mm1, mm2
- psubw mm6, mm3
- packuswb mm4, mm5
- packuswb mm1, mm6
+ psubw mm1, mm2
+ psubw mm6, mm3
+ packuswb mm4, mm5
+ packuswb mm1, mm6
/*Write it back out.*/
- movq [eax + edx], mm4
- movq [eax + edx*2], mm1
+ movq [eax + edx], mm4
+ movq [eax + edx*2], mm1
}
}
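For reference, a scalar sketch of what the vertical filter above computes, assembled from the comments in the hunk (the R/L notation follows the lflim() description referenced from Section 7.10 of the spec; ys stands for _ystride, L is the limit loaded from _ll, and the helper name is illustrative):

static int oc_lflim1(int _r,int _l){
  if(_r<=-2*_l||_r>=2*_l)return 0; /*outside +/-2L: leave pixels alone*/
  if(_r<=-_l)return -_r-2*_l;      /*-(R+2L)*/
  if(_r>=_l)return 2*_l-_r;        /*-(R-2L)*/
  return _r;
}
/*Per column: R=oc_lflim1((pix[0]-pix[3*ys]+3*(pix[2*ys]-pix[ys])+4)>>3,L);
  then pix[ys]+=R and pix[2*ys]-=R, both saturated to 0..255 (the packuswb).*/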
@@ -221,7 +221,7 @@
Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride,
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
const ogg_int16_t *_ll){
/* todo: merge the comments from the GCC sources */
_asm {
@@ -229,79 +229,79 @@
mov edx, [_ystride]
mov eax, [_ll]
/*esi=_ystride*3*/
- lea esi, [edx + edx*2]
+ lea esi, [edx + edx*2]
- movd mm0, dword ptr [ecx]
- movd mm1, dword ptr [ecx + edx]
- movd mm2, dword ptr [ecx + edx*2]
- movd mm3, dword ptr [ecx + esi]
- punpcklbw mm0, mm1
- punpcklbw mm2, mm3
- movq mm1, mm0
- punpckhwd mm0, mm2
- punpcklwd mm1, mm2
- pxor mm7, mm7
- movq mm5, mm1
- punpcklbw mm1, mm7
- punpckhbw mm5, mm7
- movq mm3, mm0
- punpcklbw mm0, mm7
- punpckhbw mm3, mm7
- psubw mm1, mm3
- movq mm4, mm0
- pcmpeqw mm2, mm2
- psubw mm0, mm5
- psrlw mm2, 14
- pmullw mm0, mm2
- psrlw mm2, 1
- paddw mm0, mm1
- psllw mm2, 2
- paddw mm0, mm2
- psraw mm0, 3
- movq mm6, qword ptr [eax]
- movq mm1, mm0
- pxor mm2, mm2
- movq mm3, mm6
- psubw mm2, mm6
- psllw mm3, 1
- psllw mm2, 1
- pcmpgtw mm3, mm0
- pcmpgtw mm1, mm2
- pand mm0, mm3
- pand mm0, mm1
- psraw mm2, 1
- movq mm1, mm0
- movq mm3, mm6
- pcmpgtw mm2, mm0
- pcmpgtw mm1, mm6
- psllw mm3, 1
- psllw mm6, 1
- pand mm3, mm1
- pand mm6, mm2
- psubw mm0, mm3
- por mm1, mm2
- paddw mm0, mm6
- pand mm1, mm0
- psubw mm0, mm1
- psubw mm0, mm1
- paddw mm5, mm0
- psubw mm4, mm0
- packuswb mm5, mm7
- packuswb mm4, mm7
- punpcklbw mm5, mm4
- movd edi, mm5
- mov word ptr [ecx + 01H], di
- psrlq mm5, 32
- shr edi, 16
- mov word ptr [ecx + edx + 01H], di
- movd edi, mm5
+ movd mm0, dword ptr [ecx]
+ movd mm1, dword ptr [ecx + edx]
+ movd mm2, dword ptr [ecx + edx*2]
+ movd mm3, dword ptr [ecx + esi]
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpckhwd mm0, mm2
+ punpcklwd mm1, mm2
+ pxor mm7, mm7
+ movq mm5, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm5, mm7
+ movq mm3, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm3, mm7
+ psubw mm1, mm3
+ movq mm4, mm0
+ pcmpeqw mm2, mm2
+ psubw mm0, mm5
+ psrlw mm2, 14
+ pmullw mm0, mm2
+ psrlw mm2, 1
+ paddw mm0, mm1
+ psllw mm2, 2
+ paddw mm0, mm2
+ psraw mm0, 3
+ movq mm6, qword ptr [eax]
+ movq mm1, mm0
+ pxor mm2, mm2
+ movq mm3, mm6
+ psubw mm2, mm6
+ psllw mm3, 1
+ psllw mm2, 1
+ pcmpgtw mm3, mm0
+ pcmpgtw mm1, mm2
+ pand mm0, mm3
+ pand mm0, mm1
+ psraw mm2, 1
+ movq mm1, mm0
+ movq mm3, mm6
+ pcmpgtw mm2, mm0
+ pcmpgtw mm1, mm6
+ psllw mm3, 1
+ psllw mm6, 1
+ pand mm3, mm1
+ pand mm6, mm2
+ psubw mm0, mm3
+ por mm1, mm2
+ paddw mm0, mm6
+ pand mm1, mm0
+ psubw mm0, mm1
+ psubw mm0, mm1
+ paddw mm5, mm0
+ psubw mm4, mm0
+ packuswb mm5, mm7
+ packuswb mm4, mm7
+ punpcklbw mm5, mm4
+ movd edi, mm5
+ mov word ptr [ecx + 01H], di
+ psrlq mm5, 32
+ shr edi, 16
+ mov word ptr [ecx + edx + 01H], di
+ movd edi, mm5
mov word ptr [ecx + edx*2 + 01H], di
- shr edi, 16
- mov word ptr [ecx + esi + 01H], di
+ shr edi, 16
+ mov word ptr [ecx + esi + 01H], di
}
}
-static void loop_filter_h(unsigned char *_pix,int _ystride,
+static void loop_filter_h(unsigned char *_pix,int _ystride,
const ogg_int16_t *_ll){
_pix-=2;
loop_filter_h4(_pix,_ystride,_ll);
@@ -374,4 +374,4 @@
_mm_empty();
}
-#endif
\ No newline at end of file
+#endif
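The striping comment before loop_filter_h4() above deserves a small illustration: the punpcklbw/punpckhwd/punpcklwd shuffles amount to a 4x4 byte transpose, gathering the p0..p3 values of four consecutive rows into the word lanes of single registers so the same filter as the vertical case can be applied. A plain-C sketch with illustrative names:

/*_p[row][i]=_pix[row*_ystride+i] for rows 0..3; after the call _c[i]
  holds pixel column i across the four rows, widened to 16 bits.*/
static void oc_transpose4x4(ogg_int16_t _c[4][4],
 const unsigned char _p[4][4]){
  int row;
  int i;
  for(row=0;row<4;row++)for(i=0;i<4;i++)_c[i][row]=_p[row][i];
}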
Modified: trunk/theora/lib/dec/x86_vc/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxstate.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/mmxstate.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -42,33 +42,33 @@
/* Fill a block with value */
static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
- __m64 t = _value;
- _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
- _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
- _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
- _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+ __m64 t = _value;
+ _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+ _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+ _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+ _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
}
/* copy a block of 8 byte elements using different strides */
-static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
- unsigned char * _src, int _src_ystride){
- __m64 a,b,c,d,e,f,g,h;
- a = *(__m64*)(_src + 0 * _src_ystride);
- b = *(__m64*)(_src + 1 * _src_ystride);
- c = *(__m64*)(_src + 2 * _src_ystride);
- d = *(__m64*)(_src + 3 * _src_ystride);
- e = *(__m64*)(_src + 4 * _src_ystride);
- f = *(__m64*)(_src + 5 * _src_ystride);
- g = *(__m64*)(_src + 6 * _src_ystride);
- h = *(__m64*)(_src + 7 * _src_ystride);
- *(__m64*)(_dst + 0 * _dst_ystride) = a;
- *(__m64*)(_dst + 1 * _dst_ystride) = b;
- *(__m64*)(_dst + 2 * _dst_ystride) = c;
- *(__m64*)(_dst + 3 * _dst_ystride) = d;
- *(__m64*)(_dst + 4 * _dst_ystride) = e;
- *(__m64*)(_dst + 5 * _dst_ystride) = f;
- *(__m64*)(_dst + 6 * _dst_ystride) = g;
- *(__m64*)(_dst + 7 * _dst_ystride) = h;
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+ __m64 a,b,c,d,e,f,g,h;
+ a = *(__m64*)(_src + 0 * _src_ystride);
+ b = *(__m64*)(_src + 1 * _src_ystride);
+ c = *(__m64*)(_src + 2 * _src_ystride);
+ d = *(__m64*)(_src + 3 * _src_ystride);
+ e = *(__m64*)(_src + 4 * _src_ystride);
+ f = *(__m64*)(_src + 5 * _src_ystride);
+ g = *(__m64*)(_src + 6 * _src_ystride);
+ h = *(__m64*)(_src + 7 * _src_ystride);
+ *(__m64*)(_dst + 0 * _dst_ystride) = a;
+ *(__m64*)(_dst + 1 * _dst_ystride) = b;
+ *(__m64*)(_dst + 2 * _dst_ystride) = c;
+ *(__m64*)(_dst + 3 * _dst_ystride) = d;
+ *(__m64*)(_dst + 4 * _dst_ystride) = e;
+ *(__m64*)(_dst + 5 * _dst_ystride) = f;
+ *(__m64*)(_dst + 6 * _dst_ystride) = g;
+ *(__m64*)(_dst + 7 * _dst_ystride) = h;
}
void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
@@ -117,7 +117,7 @@
the iDCT.*/
/*First zero the buffer.*/
/*On K7, etc., this could be replaced with movntq and sfence.*/
- loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+ loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
/*This is planned to be rewritten in MMX.*/
@@ -156,12 +156,12 @@
_frag->buffer[ref_framei]+mvoffsets[1],ref_ystride,res_buf);
}
else{
- oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+ oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
_frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,res_buf);
}
}
- _mm_empty();
+ _mm_empty();
}
@@ -180,8 +180,8 @@
fragi_end=_fragis+_nfragis;
for(fragi=_fragis;fragi<fragi_end;fragi++){
oc_fragment *frag = _state->frags+*fragi;
- loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
- frag->buffer[src_framei], src_ystride);
+ loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+ frag->buffer[src_framei], src_ystride);
}
_m_empty();
}
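For comparison, a plain-C version of the strided 8x8 copy that loc_blockcopy_mmx() performs above; the MMX version loads all eight rows into registers before storing any of them, while this sketch (function name illustrative) just copies row by row:

#include <string.h>

static void oc_blockcopy_c(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src,int _src_ystride){
  int i;
  for(i=0;i<8;i++)memcpy(_dst+i*_dst_ystride,_src+i*_src_ystride,8);
}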
Modified: trunk/theora/lib/dec/x86_vc/x86state.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86state.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/dec/x86_vc/x86state.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -23,19 +23,19 @@
void oc_state_vtable_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
- /* fill with defaults */
- oc_state_vtable_init_c(_state);
+ /* fill with defaults */
+ oc_state_vtable_init_c(_state);
- /* patch MMX functions */
- if(_state->cpu_flags&OC_CPU_X86_MMX){
+ /* patch MMX functions */
+ if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
- _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
- _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+ _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=oc_state_loop_filter_frag_rows_mmx;
- }
+ }
}
#endif
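The vtable setup above follows the usual fill-then-patch pattern: every entry is first set to the portable C routine by oc_state_vtable_init_c(), and only the entries with an MMX replacement are overwritten, so the table never holds an unset pointer. In miniature (the _c-suffixed default is assumed; the MMX name is taken from the hunk):

_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;    /*default*/
if(_state->cpu_flags&OC_CPU_X86_MMX){
  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;/*override*/
}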
Modified: trunk/theora/lib/enc/codec_internal.h
===================================================================
--- trunk/theora/lib/enc/codec_internal.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/codec_internal.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -287,10 +287,10 @@
struct PB_INSTANCE {
oggpack_buffer *opb;
theora_info info;
-
+
/* flag to indicate if the headers already have been written */
int HeadersWritten;
-
+
/* how far do we shift the granulepos to seperate out P frame counts? */
int keyframe_granule_shift;
@@ -489,14 +489,14 @@
ogg_int32_t fp_quant_Inter_Y_coeffs[64];
ogg_int32_t fp_quant_Inter_U_coeffs[64];
ogg_int32_t fp_quant_Inter_V_coeffs[64];
-
+
ogg_int32_t fp_quant_Y_round[64];
ogg_int32_t fp_quant_U_round[64];
ogg_int32_t fp_quant_V_round[64];
ogg_int32_t fp_quant_Inter_Y_round[64];
ogg_int32_t fp_quant_Inter_U_round[64];
ogg_int32_t fp_quant_Inter_V_round[64];
-
+
ogg_int32_t fp_ZeroBinSize_Y[64];
ogg_int32_t fp_ZeroBinSize_U[64];
ogg_int32_t fp_ZeroBinSize_V[64];
@@ -518,15 +518,6 @@
DspFunctions dsp; /* Selected functions for this platform */
-#ifdef _TH_DEBUG_
- Q_LIST_ENTRY (*QFragQUAN)[64]; /* Fragment Coefficients
- Array Pointers */
- Q_LIST_ENTRY (*QFragFREQ)[64]; /* Fragment Coefficients
- Array Pointers */
- Q_LIST_ENTRY (*QFragTIME)[64]; /* Fragment Coefficients
- Array Pointers */
-#endif
-
};
/* Encoder (Compressor) instance -- installed in a theora_state */
Modified: trunk/theora/lib/enc/dct_decode.c
===================================================================
--- trunk/theora/lib/enc/dct_decode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dct_decode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -65,15 +65,6 @@
/* Set up pointer into the quantisation buffer. */
pbi->quantized_list = &pbi->QFragData[FragmentNumber][0];
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragFREQ[FragmentNumber][dezigzag_index[i]]=
- pbi->quantized_list[i] * pbi->dequant_coeffs[i];
- }
-#endif
-
/* Invert quantisation and DCT to get pixel data. */
switch(pbi->FragCoefEOB[FragmentNumber]){
case 0:case 1:
@@ -89,14 +80,6 @@
dsp_IDctSlow(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
}
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragTIME[FragmentNumber][i]= pbi->ReconDataBuffer[i];
- }
-#endif
-
/* Convert fragment number to a pixel offset in a reconstruction buffer. */
ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
@@ -148,13 +131,13 @@
/* Select appropriate dequantiser matrix. */
if ( pbi->CodingMode == CODE_INTRA )
- if ( FragmentNumber <
+ if ( FragmentNumber <
(ogg_int32_t)(pbi->YPlaneFragments + pbi->UVPlaneFragments) )
pbi->dequant_coeffs = pbi->dequant_U_coeffs;
else
pbi->dequant_coeffs = pbi->dequant_V_coeffs;
else
- if ( FragmentNumber <
+ if ( FragmentNumber <
(ogg_int32_t)(pbi->YPlaneFragments + pbi->UVPlaneFragments) )
pbi->dequant_coeffs = pbi->dequant_InterU_coeffs;
else
@@ -164,15 +147,6 @@
/* Set up pointer into the quantisation buffer. */
pbi->quantized_list = &pbi->QFragData[FragmentNumber][0];
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragFREQ[FragmentNumber][dezigzag_index[i]]=
- pbi->quantized_list[i] * pbi->dequant_coeffs[i];
- }
-#endif
-
/* Invert quantisation and DCT to get pixel data. */
switch(pbi->FragCoefEOB[FragmentNumber]){
case 0:case 1:
@@ -188,14 +162,6 @@
dsp_IDctSlow(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
}
-#ifdef _TH_DEBUG_
- {
- int i;
- for(i=0;i<64;i++)
- pbi->QFragTIME[FragmentNumber][i]= pbi->ReconDataBuffer[i];
- }
-#endif
-
/* Convert fragment number to a pixel offset in a reconstruction buffer. */
ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
@@ -646,8 +612,8 @@
}
static void loop_filter_h(unsigned char * PixelPtr,
- ogg_int32_t LineLength,
- ogg_int16_t *BoundingValuePtr){
+ ogg_int32_t LineLength,
+ ogg_int16_t *BoundingValuePtr){
ogg_int32_t j;
ogg_int32_t FiltVal;
PixelPtr-=2;
@@ -669,8 +635,8 @@
}
static void loop_filter_v(unsigned char * PixelPtr,
- ogg_int32_t LineLength,
- ogg_int16_t *BoundingValuePtr){
+ ogg_int32_t LineLength,
+ ogg_int16_t *BoundingValuePtr){
ogg_int32_t j;
ogg_int32_t FiltVal;
PixelPtr -= 2*LineLength;
@@ -702,7 +668,7 @@
SetupBoundingValueArray_Generic(BoundingValues, FLimit);
for ( j = 0; j < 3 ; j++){
- ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_begin = bp;
ogg_uint32_t *bp_end;
int stride;
int h;
@@ -719,23 +685,23 @@
stride = pbi->UVStride;
break;
}
-
+
while(bp<bp_end){
ogg_uint32_t *bp_left = bp;
ogg_uint32_t *bp_right = bp + h;
while(bp<bp_right){
- if(cp[0]){
- if(bp>bp_left)
- loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,bvp);
- if(bp_left>bp_begin)
- loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,bvp);
- if(bp+1<bp_right && !cp[1])
- loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,bvp);
- if(bp+h<bp_end && !cp[h])
- loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,bvp);
- }
- bp++;
- cp++;
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,bvp);
+ if(bp_left>bp_begin)
+ loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,bvp);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,bvp);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,bvp);
+ }
+ bp++;
+ cp++;
}
}
}
@@ -842,7 +808,7 @@
FragsDown = pbi->VFragments >> 1;
break;
/*case 2: v */
- default:
+ default:
FromFragment = pbi->YPlaneFragments + pbi->UVPlaneFragments;
ToFragment = pbi->YPlaneFragments + (2 * pbi->UVPlaneFragments) ;
FragsAcross = pbi->HFragments >> 1;
@@ -949,94 +915,6 @@
/* Apply a loop filter to edge pixels of updated blocks */
dsp_LoopFilter(pbi->dsp, pbi, pbi->quant_info.loop_filter_limits[pbi->FrameQIndex]);
-#ifdef _TH_DEBUG_
- {
- int x,y,i,j,k,xn,yn,stride;
- int plane;
- int buf;
-
- /* dump fragment DCT components */
- for(plane=0;plane<3;plane++){
- char *plstr;
- int offset;
- switch(plane){
- case 0:
- plstr="Y";
- xn = pbi->HFragments;
- yn = pbi->VFragments;
- offset = 0;
- stride = pbi->YStride;
- break;
- case 1:
- plstr="U";
- xn = pbi->HFragments>>1;
- yn = pbi->VFragments>>1;
- offset = pbi->VFragments * pbi->HFragments;
- stride = pbi->UVStride;
- break;
- case 2:
- plstr="V";
- xn = pbi->HFragments>>1;
- yn = pbi->VFragments>>1;
- offset = pbi->VFragments * pbi->HFragments +
- ((pbi->VFragments * pbi->HFragments) >> 2);
- stride = pbi->UVStride;
- break;
- }
- for(y=0;y<yn;y++){
- for(x=0;x<xn;x++,i++){
-
- for(buf=0;buf<3;buf++){
- Q_LIST_ENTRY (*ptr)[64];
- char *bufn;
-
- switch(buf){
- case 0:
- bufn = "coded";
- ptr = pbi->QFragQUAN;
- break;
- case 1:
- bufn = "coeff";
- ptr = pbi->QFragFREQ;
- break;
- case 2:
- bufn = "idct";
- ptr = pbi->QFragTIME;
- break;
- }
-
- i = offset + y*xn + x;
-
- TH_DEBUG("%s %s [%d][%d] = {",bufn,plstr,x,y);
- if ( !pbi->display_fragments[i] )
- TH_DEBUG(" not coded }\n");
- else{
- int l=0;
- for(j=0;j<8;j++){
- TH_DEBUG("\n ");
- for(k=0;k<8;k++,l++){
- TH_DEBUG("%d ",ptr[i][l]);
- }
- }
- TH_DEBUG(" }\n");
- }
- }
-
- /* and the loop filter output, which is a flat struct */
- TH_DEBUG("recon %s [%d][%d] = {",plstr,x,y);
- for(j=0;j<8;j++){
- int l = pbi->recon_pixel_index_table[i] + j*stride;
- TH_DEBUG("\n ");
- for(k=0;k<8;k++,l++)
- TH_DEBUG("%d ", pbi->LastFrameRecon[l]);
- }
- TH_DEBUG(" }\n\n");
- }
- }
- }
- }
-#endif
-
/* We may need to update the UMV border */
UpdateUMVBorder(pbi, pbi->LastFrameRecon);
@@ -1054,7 +932,7 @@
funcs->LoopFilter = LoopFilter__c;
#if defined(USE_ASM)
// Todo: Port the dct for MSC one day.
-#if !defined (_MSC_VER)
+#if !defined (_MSC_VER)
if (cpu_flags & OC_CPU_X86_MMX) {
dsp_mmx_dct_decode_init(funcs);
}
Modified: trunk/theora/lib/enc/dct_encode.c
===================================================================
--- trunk/theora/lib/enc/dct_encode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dct_encode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -295,7 +295,7 @@
half pixel MC */
unsigned char *ReconPtr1; /* DCT reconstructed image pointers */
unsigned char *ReconPtr2; /* Pointer used in half pixel MC */
-
+
switch(MvDevisor) {
case 2:
MvShift = 1;
@@ -413,7 +413,7 @@
select_quantiser(&cpi->pb, BLOCK_INTER_Y);
} else {
LeftEdge = !((FragIndex-cpi->pb.YPlaneFragments)%(cpi->pb.HFragments>>1));
-
+
if(FragIndex < (ogg_int32_t)cpi->pb.YPlaneFragments + (ogg_int32_t)cpi->pb.UVPlaneFragments) {
/* U plane */
if ( cpi->pb.CodingMode == CODE_INTRA )
Modified: trunk/theora/lib/enc/dsp.c
===================================================================
--- trunk/theora/lib/enc/dsp.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dsp.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -72,7 +72,7 @@
static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
int i;
@@ -100,12 +100,12 @@
ogg_uint32_t SadValue;
ogg_uint32_t SadValue1;
- SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
+ SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
- SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
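The two four-term sums above are one eight-pixel row of a sum of absolute differences, split in half so the additions can pair. Compactly, and assuming DSP_OP_ABS_DIFF(a,b) is just the absolute difference its name suggests (helper name illustrative):

#include <stdlib.h>

static ogg_uint32_t oc_row_sad8(const unsigned char *_src1,
 const unsigned char *_src2){
  ogg_uint32_t sad;
  int          i;
  sad=0;
  for(i=0;i<8;i++)sad+=abs((int)_src1[i]-(int)_src2[i]);
  return sad;
}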
@@ -132,7 +132,7 @@
SadValue[5] += abs(Src1[5] - Src2[5]);
SadValue[6] += abs(Src1[6] - Src2[6]);
SadValue[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
@@ -146,18 +146,18 @@
SadValue2[5] += abs(Src1[5] - Src2[5]);
SadValue2[6] += abs(Src1[6] - Src2[6]);
SadValue2[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
-
+
for ( i = 0; i < 8; i++ ){
if ( SadValue[i] > MaxSad )
MaxSad = SadValue[i];
if ( SadValue2[i] > MaxSad )
MaxSad = SadValue2[i];
}
-
+
return MaxSad;
}
@@ -186,7 +186,7 @@
}
static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
+ unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres)
{
ogg_uint32_t i;
@@ -300,23 +300,23 @@
DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
/* Step to next row of block. */
SrcData += SrcStride;
RefDataPtr += RefStride;
@@ -382,7 +382,6 @@
void dsp_init(DspFunctions *funcs)
{
- /* TH_DEBUG("setting dsp functions to C defaults.\n"); */
funcs->save_fpu = nop;
funcs->restore_fpu = nop;
funcs->sub8x8 = sub8x8__c;
Modified: trunk/theora/lib/enc/dsp.h
===================================================================
--- trunk/theora/lib/enc/dsp.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/dsp.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -36,19 +36,19 @@
void (*sub8x8avg2) (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine);
+ ogg_uint32_t ReconPixelsPerLine);
- void (*copy8x8) (unsigned char *src, unsigned char *dest,
+ void (*copy8x8) (unsigned char *src, unsigned char *dest,
ogg_uint32_t stride);
- void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+ void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
ogg_uint32_t LineStep);
- void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr,
+ void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr,
ogg_int16_t *ChangePtr, ogg_uint32_t LineStep);
- void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1,
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
ogg_uint32_t LineStep);
void (*fdct_short) (ogg_int16_t *InputData, ogg_int16_t *OutputData);
@@ -62,7 +62,7 @@
unsigned char *ptr2, ogg_uint32_t stride2);
ogg_uint32_t (*sad8x8_thres) (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
+ unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres);
ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride,
@@ -78,19 +78,19 @@
ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr1,
unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
-
+
void (*LoopFilter) (PB_INSTANCE *pbi, int FLimit);
void (*FilterVert) (unsigned char * PixelPtr,
ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
- void (*IDctSlow) (ogg_int16_t *InputData,
+ void (*IDctSlow) (ogg_int16_t *InputData,
ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
- void (*IDct3) (ogg_int16_t *InputData,
+ void (*IDct3) (ogg_int16_t *InputData,
ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
-
- void (*IDct10) (ogg_int16_t *InputData,
+
+ void (*IDct10) (ogg_int16_t *InputData,
ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
} DspFunctions;
Modified: trunk/theora/lib/enc/encode.c
===================================================================
--- trunk/theora/lib/enc/encode.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/encode.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -28,10 +28,10 @@
#define HIGHBITDUPPED(X) (((ogg_int16_t) X) >> 15)
static ogg_uint32_t QuadCodeComponent ( CP_INSTANCE *cpi,
- ogg_uint32_t FirstSB,
- ogg_uint32_t SBRows,
- ogg_uint32_t SBCols,
- ogg_uint32_t PixelsPerLine){
+ ogg_uint32_t FirstSB,
+ ogg_uint32_t SBRows,
+ ogg_uint32_t SBCols,
+ ogg_uint32_t PixelsPerLine){
ogg_int32_t FragIndex; /* Fragment number */
ogg_uint32_t MB, B; /* Macro-Block, Block indices */
@@ -49,7 +49,7 @@
for ( SBcol=0; SBcol<SBCols; SBcol++ ) {
/* Check its four Macro-Blocks */
/* 'Macro-Block' is a misnomer in the chroma planes; this is
- really just a Hilbert curve iterator */
+ really just a Hilbert curve iterator */
for ( MB=0; MB<4; MB++ ) {
if ( QuadMapToMBTopLeft(cpi->pb.BlockMap,SB,MB) >= 0 ) {
@@ -359,31 +359,15 @@
/* Add the appropriate mode entropy token. */
ModeIndex = SchemeList[cpi->ModeList[i]];
oggpackB_write( opb, ModeBitPatterns[ModeIndex],
- (ogg_uint32_t)ModeBitLengths[ModeIndex] );
+ (ogg_uint32_t)ModeBitLengths[ModeIndex] );
}
}else{
/* Fall back to MODE_BITS per entry */
for ( i = 0; i < cpi->ModeListCount; i++)
/* Add the appropriate mode entropy token. */
- oggpackB_write( opb, cpi->ModeList[i], MODE_BITS );
+ oggpackB_write( opb, cpi->ModeList[i], MODE_BITS );
}
-
-#ifdef _TH_DEBUG_
- TH_DEBUG("mode encode scheme = %d\n",(int)BestScheme);
- if ( BestScheme == 0 ) {
- TH_DEBUG("mode scheme list = { ");
- for ( j = 0; j < MAX_MODES; j++ )
- TH_DEBUG("%d ",(int)BestModeSchemes[j]);
- TH_DEBUG("}\n");
- }
- TH_DEBUG("mode list = { ");
- for ( i = 0; i < cpi->ModeListCount; i++) {
- if((i&0x1f)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%d ",cpi->ModeList[i]);
- }
- TH_DEBUG("\n}\n");
-#endif
+
}
static void PackMotionVectors (CP_INSTANCE *cpi) {
@@ -422,15 +406,6 @@
(ogg_uint32_t)MvBitsPtr[cpi->MVList[i].y] );
}
-#ifdef _TH_DEBUG_
- TH_DEBUG("motion vectors = {");
- for ( i = 0; i < (ogg_int32_t)cpi->MvListCount; i++ ) {
- if((i&0x7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%+03d,%+03d ",cpi->MVList[i].x,cpi->MVList[i].y);
- }
- TH_DEBUG("\n}\n");
-#endif
}
static void PackEOBRun( CP_INSTANCE *cpi) {
@@ -905,17 +880,6 @@
}
}
-#ifdef _TH_DEBUG_
- {
- int j;
- for ( i = 0; i < cpi->pb.CodedBlockIndex; i++ ) {
- FragIndex = cpi->pb.CodedBlockList[i];
- for(j=0;j<64;j++)
- cpi->pb.QFragQUAN[FragIndex][j] = cpi->pb.QFragData[FragIndex][j];
- }
- }
-#endif
-
/* Pack DC tokens and adjust the ones we couldn't predict 2d */
for ( i = 0; i < cpi->pb.CodedBlockIndex; i++ ) {
/* Get the linear index for the current coded fragment. */
@@ -1013,7 +977,7 @@
cpi->pb.FragCodingMethod[cpi->pb.YPlaneFragments +
cpi->pb.UVPlaneFragments + UVFragOffset] =
cpi->MBCodingMode;
- }
+ }
}
/* Next Super-Block */
@@ -1391,7 +1355,7 @@
cpi->MBCodingMode = CODE_INTER_PLUS_MV;
SetMBMotionVectorsAndMode(cpi,YFragIndex,UFragIndex,
- VFragIndex,&InterMVect);
+ VFragIndex,&InterMVect);
/* Update Prior last mv with last mv */
PriorLastInterMVect.x = LastInterMVect.x;
@@ -1407,7 +1371,7 @@
cpi->MBCodingMode = CODE_GOLDEN_MV;
SetMBMotionVectorsAndMode(cpi,YFragIndex,UFragIndex,
- VFragIndex,&GFMVect);
+ VFragIndex,&GFMVect);
/* Note last inter GF MV for future use */
LastGFMVect.x = GFMVect.x;
@@ -1463,7 +1427,7 @@
cpi->MBCodingMode = CODE_INTRA;
SetMBMotionVectorsAndMode(cpi,YFragIndex,UFragIndex,
- VFragIndex,&ZeroVect);
+ VFragIndex,&ZeroVect);
}
@@ -1487,17 +1451,11 @@
void WriteFrameHeader( CP_INSTANCE *cpi) {
ogg_uint32_t i;
oggpack_buffer *opb=cpi->oggbuffer;
-
- TH_DEBUG("\n>>>> beginning frame %ld\n\n",dframe);
-
/* Output the frame type (base/key frame or inter frame) */
oggpackB_write( opb, cpi->pb.FrameType, 1 );
- TH_DEBUG("frame type = video, %s\n",cpi->pb.FrameType?"predicted":"key");
-
/* Write out details of the current value of Q... variable resolution. */
for ( i = 0; i < Q_TABLE_SIZE; i++ ) {
if ( cpi->pb.ThisFrameQualityValue == cpi->pb.QThreshTable[i] ) {
- TH_DEBUG("frame quality = { %d }\n",i);
oggpackB_write( opb, i, 6 );
break;
}
Modified: trunk/theora/lib/enc/encoder_quant.c
===================================================================
--- trunk/theora/lib/enc/encoder_quant.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/encoder_quant.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -20,12 +20,6 @@
#include "codec_internal.h"
#include "quant_lookup.h"
-#ifdef _TH_DEBUG_
-#include <stdio.h>
-extern FILE *debugout;
-extern long dframe;
-#endif
-
#define OC_QUANT_MAX (1024<<2)
static const unsigned DC_QUANT_MIN[2]={4<<2,8<<2};
static const unsigned AC_QUANT_MIN[2]={2<<2,4<<2};
@@ -41,9 +35,9 @@
void WriteQTables(PB_INSTANCE *pbi,oggpack_buffer* _opb) {
-
- th_quant_info *_qinfo = &pbi->quant_info;
-
+
+ th_quant_info *_qinfo = &pbi->quant_info;
+
const th_quant_ranges *qranges;
const th_quant_base *base_mats[2*3*64];
int indices[2][3][64];
@@ -58,7 +52,7 @@
int plj;
int bmi;
int i;
-
+
/*Unlike the scale tables, we can't assume the maximum value will be in
index 0, so search for it here.*/
i=_qinfo->loop_filter_limits[0];
@@ -149,133 +143,57 @@
th_quant_info *qinfo = &pbi->quant_info;
pbi->QThreshTable = pbi->quant_info.ac_scale;
-
+
for(qti=0;qti<2;qti++){
for(pli=0;pli<3;pli++){
int qi; /* quality index */
int qri; /* range iterator */
-
+
for(qi=0,qri=0; qri<=qinfo->qi_ranges[qti][pli].nranges; qri++){
- th_quant_base base;
-
- ogg_uint32_t q;
- int qi_start;
- int qi_end;
- int ci;
- memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
- sizeof(base));
-
- qi_start=qi;
- if(qri==qinfo->qi_ranges[qti][pli].nranges)
- qi_end=qi+1;
- else
- qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
-
- /* Iterate over quality indicies in this range */
- for(;;){
-
- /*Scale DC the coefficient from the proper table.*/
- q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
- q=OC_CLAMPI(DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- pbi->quant_tables[qti][pli][qi][0]=(ogg_uint16_t)q;
-
- /*Now scale AC coefficients from the proper table.*/
- for(ci=1;ci<64;ci++){
- q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
- q=OC_CLAMPI(AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
- pbi->quant_tables[qti][pli][qi][ci]=(ogg_uint16_t)q;
- }
-
- if(++qi>=qi_end)break;
-
- /*Interpolate the next base matrix.*/
- for(ci=0;ci<64;ci++){
- base[ci]=(unsigned char)
- ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
- (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
- +qinfo->qi_ranges[qti][pli].sizes[qri])/
- (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
- }
- }
- }
- }
- }
+ th_quant_base base;
-#ifdef _TH_DEBUG_
- int i, j, k, l;
+ ogg_uint32_t q;
+ int qi_start;
+ int qi_end;
+ int ci;
+ memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
+ sizeof(base));
- /* dump the static tables */
- {
- int i, j, k, l, m;
- TH_DEBUG("loop filter limits = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",qinfo->loop_filter_limits[i]);
- }
- TH_DEBUG("\n}\n\n");
+ qi_start=qi;
+ if(qri==qinfo->qi_ranges[qti][pli].nranges)
+ qi_end=qi+1;
+ else
+ qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
- TH_DEBUG("ac scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",qinfo->ac_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
+ /* Iterate over quality indices in this range */
+ for(;;){
- TH_DEBUG("dc scale = {");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<16;i++,j++)
- TH_DEBUG("%3d ",qinfo->dc_scale[i]);
- }
- TH_DEBUG("\n}\n\n");
+ /*Scale the DC coefficient from the proper table.*/
+ q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
+ q=OC_CLAMPI(DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ pbi->quant_tables[qti][pli][qi][0]=(ogg_uint16_t)q;
- for(k=0;k<2;k++)
- for(l=0;l<3;l++){
- char *name[2][3]={
- {"intra Y bases","intra U bases", "intra V bases"},
- {"inter Y bases","inter U bases", "inter V bases"}
- };
+ /*Now scale AC coefficients from the proper table.*/
+ for(ci=1;ci<64;ci++){
+ q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
+ q=OC_CLAMPI(AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+ pbi->quant_tables[qti][pli][qi][ci]=(ogg_uint16_t)q;
+ }
- th_quant_ranges *r = &qinfo->qi_ranges[k][l];
- TH_DEBUG("%s = {\n",name[k][l]);
- TH_DEBUG(" ranges = %d\n",r->nranges);
- TH_DEBUG(" intervals = { ");
- for(i=0;i<r->nranges;i++)
- TH_DEBUG("%3d ",r->sizes[i]);
- TH_DEBUG("}\n");
- TH_DEBUG("\n matricies = { ");
- for(m=0;m<r->nranges+1;m++){
- TH_DEBUG("\n { ");
- for(i=0;i<64;){
- TH_DEBUG("\n ");
- for(j=0;j<8;i++,j++)
- TH_DEBUG("%3d ",r->base_matrices[m][i]);
- }
- TH_DEBUG("\n }");
- }
- TH_DEBUG("\n }\n");
- }
- }
+ if(++qi>=qi_end)break;
- /* dump the calculated quantizer tables */
- for(i=0;i<2;i++){
- for(j=0;j<3;j++){
- for(k=0;k<64;k++){
- TH_DEBUG("quantizer table [%s][%s][Q%d] = {",
- (i==0?"intra":"inter"),(j==0?"Y":(j==1?"U":"V")),k);
- for(l=0;l<64;l++){
- if((l&7)==0)
- TH_DEBUG("\n ");
- TH_DEBUG("%4d ",pbi->quant_tables[i][j][k][l]);
- }
- TH_DEBUG("}\n");
+ /*Interpolate the next base matrix.*/
+ for(ci=0;ci<64;ci++){
+ base[ci]=(unsigned char)
+ ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+ (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+ +qinfo->qi_ranges[qti][pli].sizes[qri])/
+ (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
+ }
+ }
}
}
}
-#endif
-
}
static void BuildZigZagIndex(PB_INSTANCE *pbi){
@@ -289,17 +207,17 @@
}
static void init_quantizer ( CP_INSTANCE *cpi,
- unsigned char QIndex ){
+ unsigned char QIndex ){
int i;
double ZBinFactor;
double RoundingFactor;
-
+
double temp_fp_quant_coeffs;
double temp_fp_quant_round;
double temp_fp_ZeroBinSize;
PB_INSTANCE *pbi = &cpi->pb;
-
-
+
+
const ogg_uint16_t * temp_Y_coeffs;
const ogg_uint16_t * temp_U_coeffs;
const ogg_uint16_t * temp_V_coeffs;
@@ -307,22 +225,22 @@
const ogg_uint16_t * temp_Inter_U_coeffs;
const ogg_uint16_t * temp_Inter_V_coeffs;
ogg_uint16_t scale_factor = cpi->pb.quant_info.ac_scale[QIndex];
-
+
/* Notes on setup of quantisers. The initial multiplication by
the scale factor is done in the ogg_int32_t domain to ensure that the
precision in the quantiser is the same as in the inverse
quantiser where all calculations are integer. The "<< 2" is a
normalisation factor for the forward DCT transform. */
-
+
temp_Y_coeffs = pbi->quant_tables[0][0][QIndex];
temp_U_coeffs = pbi->quant_tables[0][1][QIndex];
temp_V_coeffs = pbi->quant_tables[0][2][QIndex];
temp_Inter_Y_coeffs = pbi->quant_tables[1][0][QIndex];
temp_Inter_U_coeffs = pbi->quant_tables[1][1][QIndex];
temp_Inter_V_coeffs = pbi->quant_tables[1][2][QIndex];
-
+
ZBinFactor = 0.9;
-
+
switch(cpi->pb.info.sharpness){
case 0:
ZBinFactor = 0.65;
@@ -393,7 +311,7 @@
pbi->fp_ZeroBinSize_Inter_U[0]= (0.5 + temp_fp_ZeroBinSize);
temp_fp_quant_coeffs= 1.0 / temp_fp_quant_coeffs;
pbi->fp_quant_Inter_U_coeffs[0]= (0.5 + SHIFT16 * temp_fp_quant_coeffs);
-
+
/* Inter V */
temp_fp_quant_coeffs = temp_Inter_V_coeffs[0];
temp_fp_quant_round = temp_fp_quant_coeffs * RoundingFactor;
@@ -402,8 +320,8 @@
pbi->fp_ZeroBinSize_Inter_V[0]= (0.5 + temp_fp_ZeroBinSize);
temp_fp_quant_coeffs= 1.0 / temp_fp_quant_coeffs;
pbi->fp_quant_Inter_V_coeffs[0]= (0.5 + SHIFT16 * temp_fp_quant_coeffs);
-
+
for ( i = 1; i < 64; i++ ){
/* Intra Y */
temp_fp_quant_coeffs = temp_Y_coeffs[i];
@@ -469,7 +387,7 @@
void select_quantiser(PB_INSTANCE *pbi, int type) {
/* select a quantiser according to what plane has to be coded in what
* mode. Could be extended to a more sophisticated scheme. */
-
+
switch(type) {
case BLOCK_Y:
pbi->fquant_coeffs = pbi->fp_quant_Y_coeffs;
@@ -494,12 +412,12 @@
case BLOCK_INTER_U:
pbi->fquant_coeffs = pbi->fp_quant_Inter_U_coeffs;
pbi->fquant_round = pbi->fp_quant_Inter_U_round;
- pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_U;
+ pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_U;
break;
case BLOCK_INTER_V:
pbi->fquant_coeffs = pbi->fp_quant_Inter_V_coeffs;
pbi->fquant_round = pbi->fp_quant_Inter_V_round;
- pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_V;
+ pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_Inter_V;
break;
}
}
@@ -523,7 +441,7 @@
/* Note that we add half the divisor to effect rounding on positive numbers */
for( i = 0; i < VFRAGPIXELS; i++) {
-
+
int col;
/* Iterate through columns */
for( col = 0; col < 8; col++) {
@@ -538,7 +456,7 @@
quantized_list[ZigZagPtr[col]] = ( val < -511 ) ? -511 : val;
}
}
-
+
FquantRoundPtr += 8;
FquantCoeffsPtr += 8;
FquantZBinSizePtr += 8;
@@ -548,9 +466,9 @@
}
static void init_dequantizer ( PB_INSTANCE *pbi,
- unsigned char QIndex ){
+ unsigned char QIndex ){
int i, j;
-
+
ogg_uint16_t * InterY_coeffs;
ogg_uint16_t * InterU_coeffs;
ogg_uint16_t * InterV_coeffs;
@@ -564,7 +482,7 @@
InterY_coeffs = pbi->quant_tables[1][0][QIndex];
InterU_coeffs = pbi->quant_tables[1][1][QIndex];
InterV_coeffs = pbi->quant_tables[1][2][QIndex];
-
+
/* invert the dequant index into the quant index;
the decoder (dxer) uses a different order than the encoder (cxer). */
BuildZigZagIndex(pbi);
@@ -606,7 +524,7 @@
else if (NewQIndex < 0) NewQIndex = 0;
pbi->FrameQIndex = NewQIndex;
-
+
qscale = pbi->quant_info.ac_scale[NewQIndex];
pbi->ThisFrameQualityValue = qscale;
@@ -624,7 +542,7 @@
qscale = pbi->quant_info.ac_scale[Q_TABLE_SIZE-1];
else if ( qscale > pbi->quant_info.ac_scale[0] )
qscale = pbi->quant_info.ac_scale[0];
-
+
/* Set the inter/intra decision control variables. */
pbi->FrameQIndex = Q_TABLE_SIZE - 1;
while ((ogg_int32_t) pbi->FrameQIndex >= 0 ) {
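
For reference while reading the re-indented quantizer setup above: each entry of quant_tables[qti][pli][qi] is a base-matrix value, linearly interpolated inside its qi range, then scaled by the DC or AC scale for that quality index and clamped to the legal range. A minimal C sketch of that per-entry arithmetic, reusing the constants defined near the top of this file; it is an illustration, not code from the patch:

  static ogg_uint16_t quant_entry(ogg_uint32_t scale,unsigned char base,
   unsigned qmin){
    /* scale is dc_scale[qi] or ac_scale[qi]; qmin is DC_QUANT_MIN[qti] or
       AC_QUANT_MIN[qti]; base is the (interpolated) base-matrix entry. */
    ogg_uint32_t q=(scale*base/100)<<2;  /* percentage scale, then <<2 normalization */
    return (ogg_uint16_t)OC_CLAMPI(qmin,q,OC_QUANT_MAX);
  }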
Modified: trunk/theora/lib/enc/encoder_toplevel.c
===================================================================
--- trunk/theora/lib/enc/encoder_toplevel.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/encoder_toplevel.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -26,18 +26,13 @@
#include "dsp.h"
#include "codec_internal.h"
-#ifdef _TH_DEBUG_
-FILE *debugout=NULL;
-long dframe=0;
-#endif
-
#define A_TABLE_SIZE 29
#define DF_CANDIDATE_WINDOW 5
/*
- * th_quant_info for VP3
+ * th_quant_info for VP3
*/
-
+
/*The default quantization parameters used by VP3.1.*/
static const int OC_VP31_RANGE_SIZES[1]={63};
static const th_quant_base OC_VP31_BASES_INTRA_Y[2]={
@@ -897,10 +892,6 @@
CP_INSTANCE *cpi;
-#ifdef _TH_DEBUG_
- debugout=fopen("theoraenc-debugout.txt","w");
-#endif
-
memset(th, 0, sizeof(*th));
/*Currently only the 4:2:0 format is supported.*/
if(c->pixelformat!=OC_PF_420)return OC_IMPL;
@@ -1043,7 +1034,7 @@
current clip. */
cpi->ThisIsFirstFrame = 1;
cpi->readyflag = 1;
-
+
cpi->pb.HeadersWritten = 0;
/*We overload this flag to track header output.*/
cpi->doneflag=-3;
@@ -1111,7 +1102,7 @@
if(cpi->LastKeyFrame >= (ogg_uint32_t)
cpi->pb.info.keyframe_frequency_force)
cpi->ThisIsKeyFrame = 1;
-
+
if ( cpi->ThisIsKeyFrame ) {
CompressKeyFrame(cpi);
cpi->ThisIsKeyFrame = 0;
@@ -1131,10 +1122,6 @@
((cpi->CurrentFrame - cpi->LastKeyFrame)<<cpi->pb.keyframe_granule_shift)+
cpi->LastKeyFrame - 1;
-#ifdef _TH_DEBUG_
- dframe++;
-#endif
-
return 0;
}
@@ -1170,7 +1157,7 @@
static void _tp_writelsbint(oggpack_buffer *opb, long value)
{
- oggpackB_write(opb, value&0xFF, 8);
+ oggpackB_write(opb, value&0xFF, 8);
oggpackB_write(opb, value>>8&0xFF, 8);
oggpackB_write(opb, value>>16&0xFF, 8);
oggpackB_write(opb, value>>24&0xFF, 8);
@@ -1197,7 +1184,7 @@
/* Applications use offset_y to mean offset from the top of the image; the
* meaning in the bitstream is the opposite (from the bottom). Transform.
*/
- offset_y = cpi->pb.info.height - cpi->pb.info.frame_height -
+ offset_y = cpi->pb.info.height - cpi->pb.info.frame_height -
cpi->pb.info.offset_y;
oggpackB_write(cpi->oggbuffer,offset_y,8);
@@ -1321,11 +1308,6 @@
_ogg_free(cpi);
}
-#ifdef _TH_DEBUG_
- fclose(debugout);
- debugout=NULL;
-#endif
-
memset(th,0,sizeof(*th));
}
@@ -1377,59 +1359,59 @@
CP_INSTANCE *cpi;
PB_INSTANCE *pbi;
int value;
-
+
if(th == NULL)
return TH_EFAULT;
cpi = th->internal_encode;
pbi = &cpi->pb;
-
+
switch(req) {
case TH_ENCCTL_SET_QUANT_PARAMS:
if( ( buf==NULL&&buf_sz!=0 )
- || ( buf!=NULL&&buf_sz!=sizeof(th_quant_info) )
- || cpi->pb.HeadersWritten ){
+ || ( buf!=NULL&&buf_sz!=sizeof(th_quant_info) )
+ || cpi->pb.HeadersWritten ){
return TH_EINVAL;
}
-
+
memcpy(&pbi->quant_info, buf, sizeof(th_quant_info));
InitQTables(pbi);
-
+
return 0;
case TH_ENCCTL_SET_VP3_COMPATIBLE:
if(cpi->pb.HeadersWritten)
return TH_EINVAL;
-
+
memcpy(&pbi->quant_info, &TH_VP31_QUANT_INFO, sizeof(th_quant_info));
InitQTables(pbi);
-
+
return 0;
case TH_ENCCTL_SET_SPLEVEL:
if(buf == NULL || buf_sz != sizeof(int))
return TH_EINVAL;
-
+
memcpy(&value, buf, sizeof(int));
-
+
switch(value) {
case 0:
cpi->MotionCompensation = 1;
pbi->info.quick_p = 0;
break;
-
+
case 1:
cpi->MotionCompensation = 1;
pbi->info.quick_p = 1;
break;
-
+
case 2:
cpi->MotionCompensation = 0;
pbi->info.quick_p = 1;
break;
-
+
default:
- return TH_EINVAL;
+ return TH_EINVAL;
}
-
+
return 0;
case TH_ENCCTL_GET_SPLEVEL_MAX:
value = 2;
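
A concrete example of the offset_y transform in the header-writing hunk above (hypothetical numbers, not taken from the patch): for a 720x480 picture with a 704x448 visible frame and an application-supplied offset_y of 8 from the top, the value written to the bitstream is the offset measured from the bottom edge:

  offset_y = 480 - 448 - 8;  /* == 24: same crop window, counted from the bottom */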
Modified: trunk/theora/lib/enc/frarray.c
===================================================================
--- trunk/theora/lib/enc/frarray.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/frarray.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -123,11 +123,6 @@
memset( cpi->PartiallyCodedFlags, 0, cpi->pb.SuperBlocks );
memset( cpi->BlockCodedFlags, 0, cpi->pb.UnitFragments);
-#ifdef _TH_DEBUG_
- unsigned char blockraster[cpi->pb.UnitFragments];
- memset(blockraster,0,sizeof(blockraster));
-#endif
-
for( SB = 0; SB < cpi->pb.SuperBlocks; SB++ ) {
/* Check for coded blocks and macro-blocks */
for ( MB=0; MB<4; MB++ ) {
@@ -144,10 +139,6 @@
cpi->pb.SBCodedFlags[SB] = 1; /* SB at least partly coded */
cpi->BlockCodedFlags[BListIndex] = 1; /* Block is coded */
-#ifdef _TH_DEBUG_
- blockraster[DfBlockIndex]=1;
-#endif
-
}else{
cpi->pb.SBFullyFlags[SB] = 0; /* SB not fully coded */
cpi->BlockCodedFlags[BListIndex] = 0; /* Block is not coded */
@@ -170,77 +161,16 @@
}
}
-#ifdef _TH_DEBUG_
- // assuming 4:2:0 right now
- TH_DEBUG("predicted (partially coded frame)\n");
- TH_DEBUG("superblock coded flags = {");
- int x,y;
- i=0;
-
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+31)/32;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+31)/32;x++,i++)
- TH_DEBUG("%x", ((cpi->pb.SBFullyFlags[i]!=0)|
- (cpi->PartiallyCodedFlags[i]!=0)));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+63)/64;x++,i++)
- TH_DEBUG("%x", ((cpi->pb.SBFullyFlags[i]!=0)|
- (cpi->PartiallyCodedFlags[i]!=0)));
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+63)/64;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+63)/64;x++,i++)
- TH_DEBUG("%x", ((cpi->pb.SBFullyFlags[i]!=0)|
- (cpi->PartiallyCodedFlags[i]!=0)));
- }
- TH_DEBUG("\n}\n");
-
- if(i!=cpi->pb.SuperBlocks)
- TH_DEBUG("WARNING! superblock count, raster %d != flat %d\n",
- i,cpi->pb.SuperBlocks);
-
- TH_DEBUG("block coded flags = {");
-
- i=0;
-
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+7)/8;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+7)/8;x++,i++)
- TH_DEBUG("%x", blockraster[i]);
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+15)/16;x++,i++)
- TH_DEBUG("%x", blockraster[i]);
- }
- TH_DEBUG("\n ");
- for(y=0;y< (cpi->ScanConfig.VideoFrameHeight+15)/16;y++){
- TH_DEBUG("\n ");
- for(x=0;x< (cpi->ScanConfig.VideoFrameWidth+15)/16;x++,i++)
- TH_DEBUG("%x", blockraster[i]);
- }
- TH_DEBUG("\n}\n");
-
- if(i!=cpi->pb.UnitFragments)
- TH_DEBUG("WARNING! block count, raster %d != flat %d\n",
- i,cpi->pb.UnitFragments);
-#endif
-
/* Code list of partially coded Super-Block. */
val = cpi->PartiallyCodedFlags[0];
oggpackB_write( cpi->oggbuffer, (ogg_uint32_t)val, 1);
- i = 0;
+ i = 0;
while ( i < cpi->pb.SuperBlocks ) {
run_count = 0;
- while ( (i<cpi->pb.SuperBlocks) &&
- (cpi->PartiallyCodedFlags[i]==val) &&
- run_count<4129 ) {
+ while ( (i<cpi->pb.SuperBlocks) &&
+ (cpi->PartiallyCodedFlags[i]==val) &&
+ run_count<4129 ) {
i++;
run_count++;
}
@@ -251,13 +181,13 @@
if(run_count >= 4129 && i < cpi->pb.SuperBlocks ){
val = cpi->PartiallyCodedFlags[i];
oggpackB_write( cpi->oggbuffer, (ogg_uint32_t)val, 1);
-
+
}else
val = ( val == 0 ) ? 1 : 0;
}
/* RLC Super-Block fully/not coded. */
- i = 0;
+ i = 0;
/* Skip partially coded blocks */
while( (i < cpi->pb.SuperBlocks) && cpi->PartiallyCodedFlags[i] )
@@ -269,9 +199,9 @@
while ( i < cpi->pb.SuperBlocks ) {
run_count = 0;
- while ( (i < cpi->pb.SuperBlocks) &&
- (cpi->pb.SBFullyFlags[i] == val) &&
- run_count < 4129) {
+ while ( (i < cpi->pb.SuperBlocks) &&
+ (cpi->pb.SBFullyFlags[i] == val) &&
+ run_count < 4129) {
i++;
/* Skip partially coded blocks */
while( (i < cpi->pb.SuperBlocks) && cpi->PartiallyCodedFlags[i] )
@@ -311,6 +241,3 @@
}
}
}
-
-
-
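
Because the hunks above are mostly whitespace-only reindentation, the run-length loop they touch is easier to follow in outline. A sketch of its control flow (encode_run is a hypothetical stand-in for the run-length token writer, which lies outside the hunks shown):

  val=flags[0];
  oggpackB_write(opb,(ogg_uint32_t)val,1);
  i=0;
  while(i<nflags){
    run_count=0;
    while(i<nflags&&flags[i]==val&&run_count<4129){
      i++;
      run_count++;
    }
    encode_run(opb,run_count);  /* hypothetical helper */
    if(run_count>=4129&&i<nflags){
      /* A maximal-length run does not toggle the value; re-send the next bit. */
      val=flags[i];
      oggpackB_write(opb,(ogg_uint32_t)val,1);
    }else val=(val==0)?1:0;
  }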
Modified: trunk/theora/lib/enc/frinit.c
===================================================================
--- trunk/theora/lib/enc/frinit.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/frinit.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -141,14 +141,6 @@
if(pbi->FragCoefEOB) _ogg_free(pbi->FragCoefEOB);
if(pbi->skipped_display_fragments) _ogg_free(pbi->skipped_display_fragments);
if(pbi->QFragData) _ogg_free(pbi->QFragData);
-#ifdef _TH_DEBUG_
- if(pbi->QFragTIME) _ogg_free(pbi->QFragTIME);
- if(pbi->QFragFREQ) _ogg_free(pbi->QFragFREQ);
- if(pbi->QFragQUAN) _ogg_free(pbi->QFragQUAN);
- pbi->QFragTIME = 0;
- pbi->QFragFREQ = 0;
- pbi->QFragQUAN = 0;
-#endif
if(pbi->TokenList) _ogg_free(pbi->TokenList);
if(pbi->FragCodingMethod) _ogg_free(pbi->FragCodingMethod);
if(pbi->FragCoordinates) _ogg_free(pbi->FragCoordinates);
@@ -243,19 +235,6 @@
pbi->QFragData =
_ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragData));
-#ifdef _TH_DEBUG_
-
- pbi->QFragTIME =
- _ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragTIME));
-
- pbi->QFragFREQ =
- _ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragFREQ));
-
- pbi->QFragQUAN =
- _ogg_malloc(pbi->UnitFragments * sizeof(*pbi->QFragQUAN));
-
-#endif
-
pbi->TokenList =
_ogg_malloc(pbi->UnitFragments * sizeof(*pbi->TokenList));
Modified: trunk/theora/lib/enc/mcomp.c
===================================================================
--- trunk/theora/lib/enc/mcomp.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/mcomp.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -111,7 +111,7 @@
RefDataPtr1, RefPixelsPerLine);
}else{
DiffVal = dsp_inter8x8_err_xy2 (cpi->dsp, NewDataPtr, PixelsPerLine,
- RefDataPtr1,
+ RefDataPtr1,
RefDataPtr2, RefPixelsPerLine);
}
@@ -133,11 +133,11 @@
if ( RefOffset == 0 ) {
/* Simple case as for non 0.5 pixel */
- DiffVal += dsp_sad8x8 (cpi->dsp, SrcData, PixelsPerLine,
+ DiffVal += dsp_sad8x8 (cpi->dsp, SrcData, PixelsPerLine,
RefDataPtr1, RefPixelsPerLine);
} else {
- DiffVal += dsp_sad8x8_xy2_thres (cpi->dsp, SrcData, PixelsPerLine,
- RefDataPtr1,
+ DiffVal += dsp_sad8x8_xy2_thres (cpi->dsp, SrcData, PixelsPerLine,
+ RefDataPtr1,
RefDataPtr2, RefPixelsPerLine, BestSoFar);
}
@@ -729,7 +729,7 @@
dsp_save_fpu (cpi->dsp);
- /* For the moment the 4MV mode is only deemed to be valid
+ /* For the moment the 4MV mode is only deemed to be valid
if all four Y blocks are to be updated */
/* This may be adapted later. */
if ( cpi->pb.display_fragments[FragIndex] &&
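
The RefOffset test above selects between a whole-pel SAD and a half-pel SAD that averages two reference pointers. In plain C the half-pel variant amounts to roughly the following (a sketch only; the MMX version later in this patch truncates the average, the MMXEXT version rounds it up, and both also take an early-out threshold argument):

  static ogg_uint32_t sad8x8_xy2_c(const unsigned char *src,ogg_uint32_t src_stride,
   const unsigned char *ref1,const unsigned char *ref2,ogg_uint32_t ref_stride){
    ogg_uint32_t sad=0;
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        int p=(ref1[j]+ref2[j])>>1;  /* half-pel reference sample */
        int d=src[j]-p;
        sad+=d<0?-d:d;
      }
      src+=src_stride;
      ref1+=ref_stride;
      ref2+=ref_stride;
    }
    return sad;
  }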
Modified: trunk/theora/lib/enc/x86_32/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dct_decode_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/dct_decode_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -27,7 +27,7 @@
0x0004000400040004LL;
static void loop_filter_v(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
_pix-=_ystride*2;
__asm__ __volatile__(
@@ -210,7 +210,7 @@
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
static void loop_filter_h4(unsigned char *_pix,long _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
long edi;
__asm__ __volatile__(
@@ -343,12 +343,12 @@
}
static void loop_filter_h(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
_pix-=2;
loop_filter_h4(_pix,_ystride,_ll);
loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
}
-
+
static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){
int j;
ogg_int16_t __attribute__((aligned(8))) ll[4];
@@ -359,7 +359,7 @@
ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
for ( j = 0; j < 3 ; j++){
- ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_begin = bp;
ogg_uint32_t *bp_end;
int stride;
int h;
@@ -376,23 +376,23 @@
stride = pbi->UVStride;
break;
}
-
+
while(bp<bp_end){
ogg_uint32_t *bp_left = bp;
ogg_uint32_t *bp_right = bp + h;
while(bp<bp_right){
- if(cp[0]){
- if(bp>bp_left)
- loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp_left>bp_begin)
- loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp+1<bp_right && !cp[1])
- loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
- if(bp+h<bp_end && !cp[h])
- loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
- }
- bp++;
- cp++;
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp_left>bp_begin)
+ loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
+ }
+ bp++;
+ cp++;
}
}
}
Modified: trunk/theora/lib/enc/x86_32/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dsp_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/dsp_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -50,12 +50,12 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
SUB_LOOP
SUB_LOOP
SUB_LOOP
@@ -68,7 +68,7 @@
"+r" (ReconPtr),
"+r" (DctInputPtr)
: "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
+ "m" (ReconPixelsPerLine)
: "memory"
);
}
@@ -86,16 +86,16 @@
" movq %%mm2, 8(%1) \n\t" /* write answer out */ \
/* Increment pointers */ \
" add $16, %1 \n\t" \
- " add %2, %0 \n\t"
+ " add %2, %0 \n\t"
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
+ ogg_uint32_t PixelsPerLine)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
SUB_128_LOOP
SUB_128_LOOP
@@ -140,18 +140,18 @@
" add $16, %3 \n\t" \
" add %4, %0 \n\t" \
" add %5, %1 \n\t" \
- " add %5, %2 \n\t"
+ " add %5, %2 \n\t"
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
SUB_AVG2_LOOP
SUB_AVG2_LOOP
SUB_AVG2_LOOP
@@ -165,7 +165,7 @@
"+r" (ReconPtr2),
"+r" (DctInputPtr)
: "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
+ "m" (ReconPixelsPerLine)
: "memory"
);
}
@@ -177,15 +177,15 @@
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
@@ -194,7 +194,7 @@
" movq %%mm0, %%mm2 \n\t"
" movq %%mm1, %%mm3 \n\t"
- " psrlq $32, %%mm2 \n\t" /* fold and add */
+ " psrlq $32, %%mm2 \n\t" /* fold and add */
" psrlq $32, %%mm3 \n\t"
" paddw %%mm2, %%mm0 \n\t"
" paddw %%mm3, %%mm1 \n\t"
@@ -206,13 +206,13 @@
" paddw %%mm3, %%mm1 \n\t"
" psubusw %%mm0, %%mm1 \n\t"
- " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
+ " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
" movd %%mm1, %0 \n\t"
" andl $0xffff, %0 \n\t"
: "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
:
: "memory"
);
@@ -220,80 +220,80 @@
}
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 1b \n\t"
- " mov $4, %%edi \n\t" /* 4 rows */
+ " mov $4, %%edi \n\t" /* 4 rows */
"2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 2b \n\t"
" psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
- " psubusw %%mm4, %%mm5 \n\t"
- " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
- " psubusw %%mm5, %%mm7 \n\t"
- " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
+ " psubusw %%mm4, %%mm5 \n\t"
+ " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
+ " psubusw %%mm5, %%mm7 \n\t"
+ " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
" movq %%mm7, %%mm6 \n\t"
" psrlq $32, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
" movq %%mm7, %%mm6 \n\t"
" psrlq $16, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
" movd %%mm7, %0 \n\t"
" andl $0xffff, %0 \n\t"
: "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
: "r" (stride)
: "memory", "edi"
);
@@ -302,29 +302,29 @@
}
#define SAD_LOOP \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" movq %%mm0, %%mm2 \n\t" \
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
" por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \
" movq %%mm0, %%mm1 \n\t" \
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
- " add %3, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+ unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
SAD_LOOP
SAD_LOOP
SAD_LOOP
@@ -343,8 +343,8 @@
" andl $0xffff, %0 \n\t"
: "=m" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" (stride1),
"r" (stride2)
: "memory"
@@ -354,33 +354,33 @@
}
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
+ " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
" paddb %%mm5, %%mm5 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- " mov $8, %%edi \n\t" /* 8 rows */
+
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " mov $8, %%edi \n\t" /* 8 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm2 \n\t"
- " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
+ " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
" movq %%mm2, %%mm1 \n\t"
" pand %%mm3, %%mm1 \n\t"
" pxor %%mm2, %%mm3 \n\t"
@@ -390,18 +390,18 @@
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %5, %2 \n\t" /* Inc pointer into ref data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -416,9 +416,9 @@
" andl $0xffff, %0 \n\t"
: "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -440,7 +440,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
" punpcklbw %%mm6, %%mm0 \n\t"
@@ -451,11 +451,11 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %3, %2 \n\t" /* Inc pointer into src data */
+ " add %3, %2 \n\t" /* Inc pointer into src data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -477,7 +477,7 @@
: "=r" (XSum),
"=r" (XXSum),
- "+r" (DataPtr)
+ "+r" (DataPtr)
: "r" (Stride)
: "edi", "memory"
);
@@ -487,7 +487,7 @@
}
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
@@ -500,7 +500,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
" movq %%mm1, %%mm3 \n\t"
@@ -518,12 +518,12 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %2 \n\t" /* Inc pointer into src data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -545,8 +545,8 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr)
+ "+r" (SrcData),
+ "+r" (RefDataPtr)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -557,8 +557,8 @@
}
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
@@ -566,17 +566,17 @@
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
+ " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
" paddb %%mm4, %%mm4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
+ " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
" movq %%mm2, %%mm1 \n\t"
" pand %%mm3, %%mm1 \n\t"
" pxor %%mm2, %%mm3 \n\t"
@@ -600,13 +600,13 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -628,9 +628,9 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
+ "+r" (SrcData),
"+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -649,7 +649,6 @@
void dsp_mmx_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
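
For orientation, col_sad8x8__mmx above accumulates per-column absolute-difference sums separately for the top and bottom halves of the 8x8 block and returns the largest of the sixteen partial sums. A plain-C rendering of that behaviour, inferred from the asm comments (sketch only, not code from the patch):

  static ogg_uint32_t col_sad8x8_c(const unsigned char *src1,const unsigned char *src2,
   ogg_uint32_t stride){
    ogg_uint32_t sad[2][8]={{0}};
    ogg_uint32_t max_sad=0;
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        int d=src1[j]-src2[j];
        sad[i>>2][j]+=d<0?-d:d;  /* rows 0-3 and rows 4-7 kept separate */
      }
      src1+=stride;
      src2+=stride;
    }
    for(i=0;i<2;i++)for(j=0;j<8;j++)if(sad[i][j]>max_sad)max_sad=sad[i][j];
    return max_sad;
  }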
Modified: trunk/theora/lib/enc/x86_32/dsp_mmxext.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dsp_mmxext.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/dsp_mmxext.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -23,23 +23,23 @@
#if defined(USE_ASM)
#define SAD_MMXEXT_LOOP \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" psadbw %%mm1, %%mm0 \n\t" \
- " add %3, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+ unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
SAD_MMXEXT_LOOP
SAD_MMXEXT_LOOP
SAD_MMXEXT_LOOP
@@ -48,15 +48,15 @@
SAD_MMXEXT_LOOP
SAD_MMXEXT_LOOP
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" (stride1),
"r" (stride2)
: "memory"
@@ -66,23 +66,23 @@
}
#define SAD_TRES_LOOP \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" psadbw %%mm1, %%mm0 \n\t" \
- " add %3, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
SAD_TRES_LOOP
SAD_TRES_LOOP
@@ -96,8 +96,8 @@
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" (stride1),
"r" (stride2)
: "memory"
@@ -107,28 +107,28 @@
}
#define SAD_XY2_TRES \
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
" movq (%2), %%mm1 \n\t" \
" movq (%3), %%mm2 \n\t" \
" pavgb %%mm2, %%mm1 \n\t" \
" psadbw %%mm1, %%mm0 \n\t" \
\
- " add %4, %1 \n\t" /* Inc pointer into the new data */ \
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
- " add %5, %2 \n\t" /* Inc pointer into ref data */ \
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %5, %2 \n\t" /* Inc pointer into ref data */ \
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
SAD_XY2_TRES
SAD_XY2_TRES
SAD_XY2_TRES
@@ -140,9 +140,9 @@
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "memory"
@@ -150,7 +150,7 @@
return DiffVal;
}
-
+
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
ogg_uint32_t MaxSad;
@@ -170,8 +170,8 @@
" andl $0xffff, %0 \n\t"
: "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
:
: "memory"
);
@@ -180,56 +180,56 @@
}
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
" .p2align 4 \n\t"
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 1b \n\t"
- " mov $4, %%edi \n\t" /* 4 rows */
+ " mov $4, %%edi \n\t" /* 4 rows */
"2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%edi \n\t"
" jnz 2b \n\t"
@@ -247,8 +247,8 @@
" andl $0xffff, %0 \n\t"
: "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
: "r" (stride)
: "memory", "edi"
);
@@ -257,8 +257,8 @@
}
static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
ogg_uint32_t XSum;
ogg_uint32_t XXSum;
@@ -272,10 +272,10 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%edi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
" pavgb %%mm2, %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
@@ -294,13 +294,13 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
" dec %%edi \n\t"
" jnz 1b \n\t"
@@ -322,9 +322,9 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
+ "+r" (SrcData),
"+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
: "edi", "memory"
@@ -336,7 +336,6 @@
void dsp_mmxext_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmxext dsp functions.\n");
funcs->row_sad8 = row_sad8__mmxext;
funcs->col_sad8x8 = col_sad8x8__mmxext;
funcs->sad8x8 = sad8x8__mmxext;
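
The MMXEXT variants above lean on psadbw, which sums the absolute differences of eight byte pairs in a single instruction; each SAD_MMXEXT_LOOP iteration is therefore equivalent to roughly this per-row C (sketch):

  for(j=0;j<8;j++){
    int d=ptr1[j]-ptr2[j];
    DiffVal+=d<0?-d:d;
  }
  ptr1+=stride1;
  ptr2+=stride2;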
Modified: trunk/theora/lib/enc/x86_32/fdct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/fdct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/fdct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -59,7 +59,7 @@
" psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
" movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
" paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
\
" psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
/* ------------------------------------------------------------------- */ \
@@ -85,7 +85,7 @@
" pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
\
" psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
" paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
\
" movq %%mm3," #ip0 " \n\t" \
@@ -136,16 +136,16 @@
" movq %%mm1, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
" paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
\
" movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
+ " movq %%mm7, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
" paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
@@ -234,10 +234,10 @@
" paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
\
" paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
+ " movq %%mm3," #ip5 " \n\t"
#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
" movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
" movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
" movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
@@ -251,9 +251,9 @@
" movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
" punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
" movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
" movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
" movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
" punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
" punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
@@ -285,7 +285,7 @@
" movq %%mm2," #op2 " \n\t"
-/* This performs a 2D Forward DCT on an 8x8 block with short
+/* This performs a 2D Forward DCT on an 8x8 block with short
coefficients. We try to do the truncation to match the C
version. */
static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
@@ -299,23 +299,23 @@
* we will transpose the block of data to two 4x8 blocks???
*/
Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
" emms \n\t"
-
+
: "+r" (InputData),
"+r" (OutputData)
: "r" (temp),
@@ -333,7 +333,6 @@
/* install our implementation in the function table */
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
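
The macro sequence in fdct_short__mmx above implements the separable 2-D transform as transpose, 1-D pass, transpose, 1-D pass, with each transpose handled as two 4x8 halves. In outline (a sketch; transpose_8x8 and fdct_1d_8 stand in for the Transpose_mmx and Fdct_mmx macros and are not functions in this file):

  transpose_8x8(InputData,OutputData);
  for(r=0;r<8;r++)fdct_1d_8(OutputData+r*8);  /* first 1-D pass */
  transpose_8x8(OutputData,OutputData);
  for(r=0;r<8;r++)fdct_1d_8(OutputData+r*8);  /* second 1-D pass */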
Modified: trunk/theora/lib/enc/x86_32/idct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/idct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/idct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -401,7 +401,7 @@
" paddsw "r0","r0"\n" \
" movq "r1","I(1)"\n" /* save R1 */ \
" paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
- "#end RowIDCT" \
+ "#end RowIDCT" \
);
// end RowIDCT macro (8 + 38 = 46 cycles)
@@ -465,7 +465,7 @@
" movq "r5","J(5)"\n" /* store NR5 at J5 */ \
" movq "r7","J(7)"\n" /* store NR7 at J7 */ \
" movq "r0","I(0)"\n" /* store NR0 at I0 */ \
- "#end ColumnIDCT\n" \
+ "#end ColumnIDCT\n" \
);
// end ColumnIDCT macro (38 + 19 = 57 cycles)
@@ -559,7 +559,7 @@
" movq "r4","I(3)"\n" \
\
" movq "r2","I(2)"\n" \
- "#end Transpose\n" \
+ "#end Transpose\n" \
);
// end Transpose macro (19 cycles).
@@ -1013,7 +1013,7 @@
" paddsw "r0","r0"\n" \
" movq "r1","I(1)"\n" /* save R1 */ \
" paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
- "#end RowIDCT_10\n" \
+ "#end RowIDCT_10\n" \
);
// end RowIDCT macro (8 + 38 = 46 cycles)
@@ -1060,7 +1060,7 @@
" movq "r7","J(7)"\n" /* store NR7 at J7 */ \
\
" movq "r0","I(0)"\n" /* store NR0 at I0 */ \
- "#end ColumnIDCT_10\n" \
+ "#end ColumnIDCT_10\n" \
);
// end ColumnIDCT macro (38 + 19 = 57 cycles)
/* --------------------------------------------------------------- */
@@ -1389,7 +1389,7 @@
);
ASM(
- "movq (%eax), "r0"\n"
+ "movq (%eax), "r0"\n"
"pmullw (%esi), "r0"\n" /* r0 = 03 02 01 00 */
"movq "M(0)", "r2"\n" /* r2 = __ __ __ FF */
"movq "r0", "r3"\n" /* r3 = 03 02 01 00 */
@@ -1444,7 +1444,6 @@
/* install our implementation in the function table */
void dsp_mmx_idct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx idct functions.\n");
funcs->IDctSlow = IDctSlow__mmx;
funcs->IDct10 = IDct10__mmx;
funcs->IDct3 = IDct3__mmx;
Modified: trunk/theora/lib/enc/x86_32/recon_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/recon_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32/recon_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -22,8 +22,8 @@
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
+ unsigned char *dest,
+ unsigned int stride)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -35,14 +35,14 @@
" movq (%1, %2, 2), %%mm2 \n\t"
" movq (%1, %%edi), %%mm3 \n\t"
- " lea (%1, %2, 4), %1 \n\t"
+ " lea (%1, %2, 4), %1 \n\t"
" movq %%mm0, (%0) \n\t"
" movq %%mm1, (%0, %2) \n\t"
" movq %%mm2, (%0, %2, 2) \n\t"
" movq %%mm3, (%0, %%edi) \n\t"
- " lea (%0, %2, 4), %0 \n\t"
+ " lea (%0, %2, 4), %0 \n\t"
" movq (%1), %%mm0 \n\t"
" movq (%1, %2), %%mm1 \n\t"
@@ -61,7 +61,7 @@
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -69,11 +69,11 @@
" movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
" lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
+ "1: \n\t"
" movq (%1), %%mm2 \n\t" /* First four input values */
" packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t"
+ " por %%mm0, %%mm0 \n\t"
" pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
" lea 16(%1), %1 \n\t" /* Step source buffer */
" cmp %%edi, %1 \n\t" /* are we done */
@@ -91,7 +91,7 @@
}
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -127,8 +127,8 @@
}
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
" .p2align 4 \n\t"
@@ -173,7 +173,6 @@
void dsp_mmx_recon_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
funcs->recon_inter8x8 = recon_inter8x8__mmx;
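
As the comments in recon_intra8x8__mmx above note, the packsswb/pxor pair converts the signed residual to unsigned pixels; per block this is equivalent to the following plain C (sketch only):

  for(row=0;row<8;row++){
    for(col=0;col<8;col++){
      int v=ChangePtr[row*8+col];
      if(v<-128)v=-128;
      else if(v>127)v=127;  /* packsswb: signed saturation to 8 bits */
      ReconPtr[col]=(unsigned char)(v+128);  /* pxor with 0x80 == add 128 */
    }
    ReconPtr+=LineStep;
  }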
Modified: trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -32,7 +32,7 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
//Make non-zero to use the C-version
@@ -59,192 +59,192 @@
__asm {
align 16
- pxor mm7, mm7
+ pxor mm7, mm7
mov eax, FiltPtr
mov ebx, ReconPtr
mov edx, DctInputPtr
- /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/
-
+ /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/
+
/* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
/* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
/* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
-
+
};
-
+
#endif
}
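[Annotation: the unrolled MSVC block above performs the same operation as the disabled C branch of sub8x8__mmx: each of the eight rows of FiltPtr and ReconPtr is widened from 8 to 16 bits and their difference is written to DctInputPtr as the DCT input residual. A minimal C sketch of that operation, using the parameter names from the signature; the _ref suffix and loop form are illustrative only and not part of this patch (ogg_int16_t/ogg_uint32_t come from the headers already included by this file):

  static void sub8x8_ref(unsigned char *FiltPtr,unsigned char *ReconPtr,
   ogg_int16_t *DctInputPtr,ogg_uint32_t PixelsPerLine,
   ogg_uint32_t ReconPixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      /*Widen to 16 bits and subtract the predictor row from the source row.*/
      for(j=0;j<8;j++){
        DctInputPtr[j]=(ogg_int16_t)FiltPtr[j]-(ogg_int16_t)ReconPtr[j];
      }
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr+=ReconPixelsPerLine;
    }
  }
]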
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
+ ogg_uint32_t PixelsPerLine)
{
#if 0
@@ -273,142 +273,142 @@
__asm {
align 16
- pxor mm7, mm7
+ pxor mm7, mm7
mov eax, FiltPtr
mov ebx, DctInputPtr
- movq mm1, V128
+ movq mm1, V128
- /* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 1 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 2 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 3 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 4 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 5 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 6 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 7 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
- /* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
+ /* ITERATION 8 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
/* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
/* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
/* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ add ebx, 16
+ add eax, PixelsPerLine
};
-
+
#endif
}
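[Annotation: sub8x8_128__mmx handles the intra case, where there is no predictor. Per the comments in the block above, each source sample has the constant 128 subtracted (via the V128 constant) before being stored as a 16-bit DCT input. A hedged C sketch under that reading; the _ref name is illustrative only:

  static void sub8x8_128_ref(unsigned char *FiltPtr,ogg_int16_t *DctInputPtr,
   ogg_uint32_t PixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      /*Intra blocks have no predictor; recenter the samples around zero.*/
      for(j=0;j<8;j++)DctInputPtr[j]=(ogg_int16_t)FiltPtr[j]-128;
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
    }
  }
]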
@@ -418,7 +418,7 @@
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
#if 0
@@ -453,251 +453,251 @@
mov ecx, ReconPtr2
mov edx, DctInputPtr
- /* ITERATION 1 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
+ /* ITERATION 1 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 2 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 2 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 3 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 3 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 4 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 4 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 5 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 5 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 6 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 6 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 7 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 7 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
- /* ITERATION 8 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 8 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
};
-
+
#endif
}
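[Annotation: sub8x8avg2__mmx differs from sub8x8__mmx only in that the predictor is the average of two reconstruction buffers (the half-pel case). Note that the asm averages with paddw followed by psrlw 1, so it truncates rather than rounds. A C sketch under that reading; the _ref name is illustrative only:

  static void sub8x8avg2_ref(unsigned char *FiltPtr,unsigned char *ReconPtr1,
   unsigned char *ReconPtr2,ogg_int16_t *DctInputPtr,
   ogg_uint32_t PixelsPerLine,ogg_uint32_t ReconPixelsPerLine){
    int i,j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        /*Truncating average of the two predictors, as in the psrlw-by-1 above.*/
        int pred=(ReconPtr1[j]+ReconPtr2[j])>>1;
        DctInputPtr[j]=(ogg_int16_t)FiltPtr[j]-(ogg_int16_t)pred;
      }
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr1+=ReconPixelsPerLine;
      ReconPtr2+=ReconPixelsPerLine;
    }
  }
]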
@@ -708,15 +708,15 @@
ogg_uint32_t SadValue;
ogg_uint32_t SadValue1;
- SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
- DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
- DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
- DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+ SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
+ DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+ DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+ DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
- SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
- DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
- DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
- DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
+ DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+ DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+ DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
@@ -725,54 +725,54 @@
#else
ogg_uint32_t MaxSad;
-
+
__asm {
align 16
mov ebx, Src1
mov ecx, Src2
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* zero out mm7 for unpack */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ;
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* zero out mm7 for unpack */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ;
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */
- punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */
+ punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */
+ punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */
- movq mm2, mm0 ;
- movq mm3, mm1 ;
- psrlq mm2, 32 ; /* fold and add */
- psrlq mm3, 32 ;
- paddw mm0, mm2 ;
- paddw mm1, mm3 ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
- psrlq mm2, 16 ;
- psrlq mm3, 16 ;
- paddw mm0, mm2 ;
- paddw mm1, mm3 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+ psrlq mm2, 32 ; /* fold and add */
+ psrlq mm3, 32 ;
+ paddw mm0, mm2 ;
+ paddw mm1, mm3 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+ psrlq mm2, 16 ;
+ psrlq mm3, 16 ;
+ paddw mm0, mm2 ;
+ paddw mm1, mm3 ;
- psubusw mm1, mm0 ;
- paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */
- movd eax, mm1 ;
+ psubusw mm1, mm0 ;
+ paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */
+ movd eax, mm1 ;
and eax, 0xffff
mov MaxSad, eax
};
return MaxSad;
-
-
-
-
+
+
+
+
#endif
}
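[Annotation: the psubusw/paddw pair near the end of the block above is the standard MMX substitute for an unsigned 16-bit maximum, since plain MMX has no pmaxuw: saturating subtraction clamps a-b at zero, so adding b back gives max(a,b) in every lane. A scalar illustration of the identity, not part of the patch:

  static unsigned short max_u16(unsigned short a,unsigned short b){
    /*psubusw: unsigned saturating subtract, clamped at zero.*/
    unsigned short d=a>b?(unsigned short)(a-b):0;
    /*paddw: adding b back yields max(a,b) in both cases.*/
    return (unsigned short)(d+b);
  }
]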
@@ -780,7 +780,7 @@
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
#if 0
@@ -798,7 +798,7 @@
SadValue[5] += abs(Src1[5] - Src2[5]);
SadValue[6] += abs(Src1[6] - Src2[6]);
SadValue[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
@@ -812,18 +812,18 @@
SadValue2[5] += abs(Src1[5] - Src2[5]);
SadValue2[6] += abs(Src1[6] - Src2[6]);
SadValue2[7] += abs(Src1[7] - Src2[7]);
-
+
Src1 += stride;
Src2 += stride;
}
-
+
for ( i = 0; i < 8; i++ ){
if ( SadValue[i] > MaxSad )
MaxSad = SadValue[i];
if ( SadValue2[i] > MaxSad )
MaxSad = SadValue2[i];
}
-
+
return MaxSad;
#else
ogg_uint32_t MaxSad;
@@ -834,69 +834,69 @@
mov ebx, Src1
mov ecx, Src2
- pxor mm3, mm3 ; /* zero out mm3 for unpack */
- pxor mm4, mm4 ; /* mm4 low sum */
- pxor mm5, mm5 ; /* mm5 high sum */
- pxor mm6, mm6 ; /* mm6 low sum */
- pxor mm7, mm7 ; /* mm7 high sum */
- mov edi, 4 ; /* 4 rows */
- label_1: ;
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ; /* take 8 bytes */
+ pxor mm3, mm3 ; /* zero out mm3 for unpack */
+ pxor mm4, mm4 ; /* mm4 low sum */
+ pxor mm5, mm5 ; /* mm5 high sum */
+ pxor mm6, mm6 ; /* mm6 low sum */
+ pxor mm7, mm7 ; /* mm7 high sum */
+ mov edi, 4 ; /* 4 rows */
+ label_1: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
- paddw mm4, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
- paddw mm5, mm1 ; /* accumulate difference... */
- add ebx, stride ; /* Inc pointer into the new data */
- add ecx, stride ; /* Inc pointer into the new data */
+ punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
+ paddw mm4, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
+ paddw mm5, mm1 ; /* accumulate difference... */
+ add ebx, stride ; /* Inc pointer into the new data */
+ add ecx, stride ; /* Inc pointer into the new data */
- dec edi ;
- jnz label_1 ;
+ dec edi ;
+ jnz label_1 ;
- mov edi, 4 ; /* 4 rows */
- label_2: ;
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ; /* take 8 bytes */
+ mov edi, 4 ; /* 4 rows */
+ label_2: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
- paddw mm6, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
- paddw mm7, mm1 ; /* accumulate difference... */
- add ebx, stride ; /* Inc pointer into the new data */
- add ecx, stride ; /* Inc pointer into the new data */
+ punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
+ paddw mm6, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add ebx, stride ; /* Inc pointer into the new data */
+ add ecx, stride ; /* Inc pointer into the new data */
- dec edi ;
- jnz label_2 ;
+ dec edi ;
+ jnz label_2 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
- psubusw mm5, mm4 ;
- paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
- psubusw mm7, mm5 ;
- paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
- movq mm6, mm7 ;
- psrlq mm6, 32 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
- movq mm6, mm7 ;
- psrlq mm6, 16 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
- movd eax, mm7 ;
- and eax, 0xffff ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
+ psubusw mm5, mm4 ;
+ paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
+ psubusw mm7, mm5 ;
+ paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
+ movq mm6, mm7 ;
+ psrlq mm6, 32 ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
+ movq mm6, mm7 ;
+ psrlq mm6, 16 ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
+ movd eax, mm7 ;
+ and eax, 0xffff ;
mov MaxSad, eax
};
@@ -908,7 +908,7 @@
}
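[Annotation: col_sad8x8__mmx accumulates per-column absolute differences in 16-bit lanes (mm4/mm5 for the first four rows, mm6/mm7 for the last four), takes the lane-wise maximum with the same saturating-subtract trick shown above, and then folds the four surviving lanes to one with psrlq 32 and psrlq 16. A scalar sketch of that final fold, assuming the four lanes have been extracted into an array; illustrative only:

  static unsigned short fold_max4(const unsigned short lane[4]){
    /*Mirrors the psrlq 32 / psrlq 16 reduction tree in the block above.*/
    unsigned short a=lane[0]>lane[2]?lane[0]:lane[2];
    unsigned short b=lane[1]>lane[3]?lane[1]:lane[3];
    return a>b?a:b;
  }
]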
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+ unsigned char *ptr2, ogg_uint32_t stride2)
{
#if 0
@@ -940,177 +940,177 @@
mov ebx, ptr1
mov edx, ptr2
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
-
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* mm7 contains the result */
+
; /* ITERATION 1 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 2 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 3 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 4 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 5 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 6 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 7 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ITERATION 8 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
; /* ------ */
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddw mm7, mm0 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 16 ;
+ paddw mm7, mm0 ;
+ movd eax, mm7 ;
+ and eax, 0xffff ;
mov DiffVal, eax
};
return DiffVal;
-
+
#endif
}
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
#if 0
ogg_uint32_t i;
@@ -1142,9 +1142,9 @@
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
{
#if 0
ogg_uint32_t i;
@@ -1181,58 +1181,58 @@
mov edx, RefDataPtr2
- pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
- paddb mm5, mm5 ;
- ;
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
- mov edi, 8 ; /* 8 rows */
- loop_start: ;
- movq mm0, [ebx] ; /* take 8 bytes */
+ pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
+ paddb mm5, mm5 ;
+ ;
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* mm7 contains the result */
+ mov edi, 8 ; /* 8 rows */
+ loop_start: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
- movq mm2, [ecx] ;
- movq mm3, [edx] ; /* take average of mm2 and mm3 */
- movq mm1, mm2 ;
- pand mm1, mm3 ;
- pxor mm3, mm2 ;
- pand mm3, mm5 ;
- psrlq mm3, 1 ;
- paddb mm1, mm3 ;
+ movq mm2, [ecx] ;
+ movq mm3, [edx] ; /* take average of mm2 and mm3 */
+ movq mm1, mm2 ;
+ pand mm1, mm3 ;
+ pxor mm3, mm2 ;
+ pand mm3, mm5 ;
+ psrlq mm3, 1 ;
+ paddb mm1, mm3 ;
- movq mm2, mm0 ;
+ movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, SrcStride ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add ecx, RefStride ; /* Inc pointer into ref data */
- add edx, RefStride ; /* Inc pointer into ref data */
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, SrcStride ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add ecx, RefStride ; /* Inc pointer into ref data */
+ add edx, RefStride ; /* Inc pointer into ref data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddw mm7, mm0 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 16 ;
+ paddw mm7, mm0 ;
+ movd eax, mm7 ;
+ and eax, 0xffff ;
mov DiffVal, eax
};
return DiffVal;
-
+
#endif
}
@@ -1277,45 +1277,45 @@
mov ecx, DataPtr
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start:
- movq mm0, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start:
+ movq mm0, [ecx] ; /* take 8 bytes */
+ movq mm2, mm0 ;
- punpcklbw mm0, mm6 ;
- punpckhbw mm2, mm6 ;
+ punpcklbw mm0, mm6 ;
+ punpckhbw mm2, mm6 ;
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
- add ecx, Stride ; /* Inc pointer into src data */
+ add ecx, Stride ; /* Inc pointer into src data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov eax, edi ;
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov eax, edi ;
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd ebx, mm7 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd ebx, mm7 ;
mov XSum, eax
mov XXSum, ebx;
@@ -1324,13 +1324,13 @@
/* Compute population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ) );
-
+
#endif
}
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
#if 0
@@ -1355,23 +1355,23 @@
DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
XSum += DiffVal;
XXSum += DiffVal*DiffVal;
-
+
/* Step to next row of block. */
SrcData += SrcStride;
RefDataPtr += RefStride;
@@ -1390,53 +1390,53 @@
mov ecx, SrcData
mov edx, RefDataPtr
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start: ;
- movq mm0, [ecx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start: ;
+ movq mm0, [ecx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
- punpcklbw mm0, mm6 ;
- punpcklbw mm1, mm6 ;
- punpckhbw mm2, mm6 ;
- punpckhbw mm3, mm6 ;
+ punpcklbw mm0, mm6 ;
+ punpcklbw mm1, mm6 ;
+ punpckhbw mm2, mm6 ;
+ punpckhbw mm3, mm6 ;
- psubsw mm0, mm1 ;
- psubsw mm2, mm3 ;
+ psubsw mm0, mm1 ;
+ psubsw mm2, mm3 ;
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
- add ecx, SrcStride ; /* Inc pointer into src data */
- add edx, RefStride ; /* Inc pointer into ref data */
+ add ecx, SrcStride ; /* Inc pointer into src data */
+ add edx, RefStride ; /* Inc pointer into ref data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov eax, edi ;
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov eax, edi ;
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd ebx, mm7 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd ebx, mm7 ;
mov XSum, eax
mov XXSum, ebx
@@ -1446,13 +1446,13 @@
/* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
-
+
#endif
}
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
#if 0
ogg_uint32_t i;
@@ -1512,65 +1512,65 @@
mov ecx, RefDataPtr1
mov edx, RefDataPtr2
- pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
- paddb mm4, mm4 ;
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start: ;
- movq mm0, [ebx] ; /* take 8 bytes */
+ pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
+ paddb mm4, mm4 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
- movq mm2, [ecx] ;
- movq mm3, [edx] ; /* take average of mm2 and mm3 */
- movq mm1, mm2 ;
- pand mm1, mm3 ;
- pxor mm3, mm2 ;
- pand mm3, mm4 ;
- psrlq mm3, 1 ;
- paddb mm1, mm3 ;
+ movq mm2, [ecx] ;
+ movq mm3, [edx] ; /* take average of mm2 and mm3 */
+ movq mm1, mm2 ;
+ pand mm1, mm3 ;
+ pxor mm3, mm2 ;
+ pand mm3, mm4 ;
+ psrlq mm3, 1 ;
+ paddb mm1, mm3 ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
- punpcklbw mm0, mm6 ;
- punpcklbw mm1, mm6 ;
- punpckhbw mm2, mm6 ;
- punpckhbw mm3, mm6 ;
+ punpcklbw mm0, mm6 ;
+ punpcklbw mm1, mm6 ;
+ punpckhbw mm2, mm6 ;
+ punpckhbw mm3, mm6 ;
- psubsw mm0, mm1 ;
- psubsw mm2, mm3 ;
+ psubsw mm0, mm1 ;
+ psubsw mm2, mm3 ;
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
- add ebx, SrcStride ; /* Inc pointer into src data */
- add ecx, RefStride ; /* Inc pointer into ref data */
- add edx, RefStride ; /* Inc pointer into ref data */
+ add ebx, SrcStride ; /* Inc pointer into src data */
+ add ecx, RefStride ; /* Inc pointer into ref data */
+ add edx, RefStride ; /* Inc pointer into ref data */
- dec edi ;
- jnz loop_start ;
+ dec edi ;
+ jnz loop_start ;
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */
};
return (( (XXSum<<6) - XSum*XSum ));
@@ -1589,7 +1589,6 @@
void dsp_mmx_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
Modified: trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32_vs/fdct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -35,51 +35,51 @@
mov edx, OutputData2
- movq mm0, [eax] ; /* mm0 = a0 a1 a2 a3 */
- movq mm4, [ebx] ; /* mm4 = e4 e5 e6 e7 */
- movq mm1, [16 + eax] ; /* mm1 = b0 b1 b2 b3 */
- movq mm5, [16 + ebx] ; /* mm5 = f4 f5 f6 f7 */
- movq mm2, [32 + eax] ; /* mm2 = c0 c1 c2 c3 */
- movq mm6, [32 + ebx] ; /* mm6 = g4 g5 g6 g7 */
- movq mm3, [48 + eax] ; /* mm3 = d0 d1 d2 d3 */
- movq [16 + ecx], mm1 ; /* save b0 b1 b2 b3 */
- movq mm7, [48 + ebx] ; /* mm7 = h0 h1 h2 h3 */
- ; /* Transpose 2x8 block */
- movq mm1, mm4 ; /* mm1 = e3 e2 e1 e0 */
- punpcklwd mm4, mm5 ; /* mm4 = f1 e1 f0 e0 */
- movq [ecx], mm0 ; /* save a3 a2 a1 a0 */
- punpckhwd mm1, mm5 ; /* mm1 = f3 e3 f2 e2 */
- movq mm0, mm6 ; /* mm0 = g3 g2 g1 g0 */
- punpcklwd mm6, mm7 ; /* mm6 = h1 g1 h0 g0 */
- movq mm5, mm4 ; /* mm5 = f1 e1 f0 e0 */
- punpckldq mm4, mm6 ; /* mm4 = h0 g0 f0 e0 = MM4 */
- punpckhdq mm5, mm6 ; /* mm5 = h1 g1 f1 e1 = MM5 */
- movq mm6, mm1 ; /* mm6 = f3 e3 f2 e2 */
- movq [edx], mm4 ;
- punpckhwd mm0, mm7 ; /* mm0 = h3 g3 h2 g2 */
- movq [16 + edx], mm5 ;
- punpckhdq mm6, mm0 ; /* mm6 = h3 g3 f3 e3 = MM7 */
- movq mm4, [ecx] ; /* mm4 = a3 a2 a1 a0 */
- punpckldq mm1, mm0 ; /* mm1 = h2 g2 f2 e2 = MM6 */
- movq mm5, [16 + ecx] ; /* mm5 = b3 b2 b1 b0 */
- movq mm0, mm4 ; /* mm0 = a3 a2 a1 a0 */
- movq [48 + edx], mm6 ;
- punpcklwd mm0, mm5 ; /* mm0 = b1 a1 b0 a0 */
- movq [32 + edx], mm1 ;
- punpckhwd mm4, mm5 ; /* mm4 = b3 a3 b2 a2 */
- movq mm5, mm2 ; /* mm5 = c3 c2 c1 c0 */
- punpcklwd mm2, mm3 ; /* mm2 = d1 c1 d0 c0 */
- movq mm1, mm0 ; /* mm1 = b1 a1 b0 a0 */
- punpckldq mm0, mm2 ; /* mm0 = d0 c0 b0 a0 = MM0 */
- punpckhdq mm1, mm2 ; /* mm1 = d1 c1 b1 a1 = MM1 */
- movq mm2, mm4 ; /* mm2 = b3 a3 b2 a2 */
- movq [ecx], mm0 ;
- punpckhwd mm5, mm3 ; /* mm5 = d3 c3 d2 c2 */
- movq [16 + ecx], mm1 ;
- punpckhdq mm4, mm5 ; /* mm4 = d3 c3 b3 a3 = MM3 */
- punpckldq mm2, mm5 ; /* mm2 = d2 c2 b2 a2 = MM2 */
- movq [48 + ecx], mm4 ;
- movq [32 + ecx], mm2 ;
+ movq mm0, [eax] ; /* mm0 = a0 a1 a2 a3 */
+ movq mm4, [ebx] ; /* mm4 = e4 e5 e6 e7 */
+ movq mm1, [16 + eax] ; /* mm1 = b0 b1 b2 b3 */
+ movq mm5, [16 + ebx] ; /* mm5 = f4 f5 f6 f7 */
+ movq mm2, [32 + eax] ; /* mm2 = c0 c1 c2 c3 */
+ movq mm6, [32 + ebx] ; /* mm6 = g4 g5 g6 g7 */
+ movq mm3, [48 + eax] ; /* mm3 = d0 d1 d2 d3 */
+ movq [16 + ecx], mm1 ; /* save b0 b1 b2 b3 */
+ movq mm7, [48 + ebx] ; /* mm7 = h0 h1 h2 h3 */
+ ; /* Transpose 2x8 block */
+ movq mm1, mm4 ; /* mm1 = e3 e2 e1 e0 */
+ punpcklwd mm4, mm5 ; /* mm4 = f1 e1 f0 e0 */
+ movq [ecx], mm0 ; /* save a3 a2 a1 a0 */
+ punpckhwd mm1, mm5 ; /* mm1 = f3 e3 f2 e2 */
+ movq mm0, mm6 ; /* mm0 = g3 g2 g1 g0 */
+ punpcklwd mm6, mm7 ; /* mm6 = h1 g1 h0 g0 */
+ movq mm5, mm4 ; /* mm5 = f1 e1 f0 e0 */
+ punpckldq mm4, mm6 ; /* mm4 = h0 g0 f0 e0 = MM4 */
+ punpckhdq mm5, mm6 ; /* mm5 = h1 g1 f1 e1 = MM5 */
+ movq mm6, mm1 ; /* mm6 = f3 e3 f2 e2 */
+ movq [edx], mm4 ;
+ punpckhwd mm0, mm7 ; /* mm0 = h3 g3 h2 g2 */
+ movq [16 + edx], mm5 ;
+ punpckhdq mm6, mm0 ; /* mm6 = h3 g3 f3 e3 = MM7 */
+ movq mm4, [ecx] ; /* mm4 = a3 a2 a1 a0 */
+ punpckldq mm1, mm0 ; /* mm1 = h2 g2 f2 e2 = MM6 */
+ movq mm5, [16 + ecx] ; /* mm5 = b3 b2 b1 b0 */
+ movq mm0, mm4 ; /* mm0 = a3 a2 a1 a0 */
+ movq [48 + edx], mm6 ;
+ punpcklwd mm0, mm5 ; /* mm0 = b1 a1 b0 a0 */
+ movq [32 + edx], mm1 ;
+ punpckhwd mm4, mm5 ; /* mm4 = b3 a3 b2 a2 */
+ movq mm5, mm2 ; /* mm5 = c3 c2 c1 c0 */
+ punpcklwd mm2, mm3 ; /* mm2 = d1 c1 d0 c0 */
+ movq mm1, mm0 ; /* mm1 = b1 a1 b0 a0 */
+ punpckldq mm0, mm2 ; /* mm0 = d0 c0 b0 a0 = MM0 */
+ punpckhdq mm1, mm2 ; /* mm1 = d1 c1 b1 a1 = MM1 */
+ movq mm2, mm4 ; /* mm2 = b3 a3 b2 a2 */
+ movq [ecx], mm0 ;
+ punpckhwd mm5, mm3 ; /* mm5 = d3 c3 d2 c2 */
+ movq [16 + ecx], mm1 ;
+ punpckhdq mm4, mm5 ; /* mm4 = d3 c3 b3 a3 = MM3 */
+ punpckldq mm2, mm5 ; /* mm2 = d2 c2 b2 a2 = MM2 */
+ movq [48 + ecx], mm4 ;
+ movq [32 + ecx], mm2 ;
};
@@ -96,208 +96,208 @@
mov eax, InputData1
mov ebx, InputData2
mov ecx, temp
- movq mm0, [eax] ;
- movq mm1, [16 + eax] ;
- movq mm2, [48 + eax] ;
- movq mm3, [16 + ebx] ;
- movq mm4, mm0 ;
- movq mm5, mm1 ;
- movq mm6, mm2 ;
- movq mm7, mm3 ;
- ;
- paddsw mm0, [48 + ebx] ; /* mm0 = ip0 + ip7 = is07 */
- paddsw mm1, [32 + eax] ; /* mm1 = ip1 + ip2 = is12 */
- paddsw mm2, [ebx] ; /* mm2 = ip3 + ip4 = is34 */
- paddsw mm3, [32 + ebx] ; /* mm3 = ip5 + ip6 = is56 */
- psubsw mm4, [48 + ebx] ; /* mm4 = ip0 - ip7 = id07 */
- psubsw mm5, [32 + eax] ; /* mm5 = ip1 - ip2 = id12 */
- ;
- psubsw mm0, mm2 ; /* mm0 = is07 - is34 */
- ;
- paddsw mm2, mm2 ;
- ;
- psubsw mm6, [ebx] ; /* mm6 = ip3 - ip4 = id34 */
- ;
- paddsw mm2, mm0 ; /* mm2 = is07 + is34 = is0734 */
- psubsw mm1, mm3 ; /* mm1 = is12 - is56 */
- movq [ecx], mm0 ; /* Save is07 - is34 to free mm0; */
- paddsw mm3, mm3 ;
- paddsw mm3, mm1 ; /* mm3 = is12 + 1s56 = is1256 */
- ;
- psubsw mm7, [32 + ebx] ; /* mm7 = ip5 - ip6 = id56 */
- ; /* ------------------------------------------------------------------- */
- psubsw mm5, mm7 ; /* mm5 = id12 - id56 */
- paddsw mm7, mm7 ;
- paddsw mm7, mm5 ; /* mm7 = id12 + id56 */
- ; /* ------------------------------------------------------------------- */
- psubsw mm2, mm3 ; /* mm2 = is0734 - is1256 */
- paddsw mm3, mm3 ;
- ;
- movq mm0, mm2 ; /* make a copy */
- paddsw mm3, mm2 ; /* mm3 = is0734 + is1256 */
- ;
- pmulhw mm0, xC4S4 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
- paddw mm0, mm2 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) */
- psrlw mm2, 15 ;
- paddw mm0, mm2 ; /* Truncate mm0, now it is op[4] */
- ;
- movq mm2, mm3 ;
- movq [ebx], mm0 ; /* save ip4, now mm0,mm2 are free */
- ;
- movq mm0, mm3 ;
- pmulhw mm3, xC4S4 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
- ;
- psrlw mm2, 15 ;
- paddw mm3, mm0 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) */
- paddw mm3, mm2 ; /* Truncate mm3, now it is op[0] */
- ;
- movq [eax], mm3 ;
- ; /* ------------------------------------------------------------------- */
- movq mm3, [ecx] ; /* mm3 = irot_input_y */
- pmulhw mm3, xC2S6 ; /* mm3 = xC2S6 * irot_input_y - irot_input_y */
- ;
- movq mm2, [ecx] ;
- movq mm0, mm2 ;
- ;
- psrlw mm2, 15 ; /* mm3 = xC2S6 * irot_input_y */
- paddw mm3, mm0 ;
- ;
- paddw mm3, mm2 ; /* Truncated */
- movq mm0, mm5 ;
- ;
- movq mm2, mm5 ;
- pmulhw mm0, xC6S2 ; /* mm0 = xC6S2 * irot_input_x */
- ;
- psrlw mm2, 15 ;
- paddw mm0, mm2 ; /* Truncated */
- ;
- paddsw mm3, mm0 ; /* ip[2] */
- movq [32 + eax], mm3 ; /* Save ip2 */
- ;
- movq mm0, mm5 ;
- movq mm2, mm5 ;
- ;
- pmulhw mm5, xC2S6 ; /* mm5 = xC2S6 * irot_input_x - irot_input_x */
- psrlw mm2, 15 ;
- ;
- movq mm3, [ecx] ;
- paddw mm5, mm0 ; /* mm5 = xC2S6 * irot_input_x */
- ;
- paddw mm5, mm2 ; /* Truncated */
- movq mm2, mm3 ;
- ;
- pmulhw mm3, xC6S2 ; /* mm3 = xC6S2 * irot_input_y */
- psrlw mm2, 15 ;
- ;
- paddw mm3, mm2 ; /* Truncated */
- psubsw mm3, mm5 ;
- ;
- movq [32 + ebx], mm3 ;
- ; /* ------------------------------------------------------------------- */
- movq mm0, xC4S4 ;
- movq mm2, mm1 ;
- movq mm3, mm1 ;
- ;
- pmulhw mm1, mm0 ; /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
- psrlw mm2, 15 ;
- ;
- paddw mm1, mm3 ; /* mm0 = xC4S4 * ( is12 - is56 ) */
- paddw mm1, mm2 ; /* Truncate mm1, now it is icommon_product1 */
- ;
- movq mm2, mm7 ;
- movq mm3, mm7 ;
- ;
- pmulhw mm7, mm0 ; /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
- psrlw mm2, 15 ;
- ;
- paddw mm7, mm3 ; /* mm7 = xC4S4 * ( id12 + id56 ) */
- paddw mm7, mm2 ; /* Truncate mm7, now it is icommon_product2 */
- ; /* ------------------------------------------------------------------- */
- pxor mm0, mm0 ; /* Clear mm0 */
- psubsw mm0, mm6 ; /* mm0 = - id34 */
- ;
- psubsw mm0, mm7 ; /* mm0 = - ( id34 + idcommon_product2 ) */
- paddsw mm6, mm6 ;
- paddsw mm6, mm0 ; /* mm6 = id34 - icommon_product2 */
- ;
- psubsw mm4, mm1 ; /* mm4 = id07 - icommon_product1 */
- paddsw mm1, mm1 ;
- paddsw mm1, mm4 ; /* mm1 = id07 + icommon_product1 */
- ; /* ------------------------------------------------------------------- */
- movq mm7, xC1S7 ;
- movq mm2, mm1 ;
- ;
- movq mm3, mm1 ;
- pmulhw mm1, mm7 ; /* mm1 = xC1S7 * irot_input_x - irot_input_x */
- ;
- movq mm7, xC7S1 ;
- psrlw mm2, 15 ;
- ;
- paddw mm1, mm3 ; /* mm1 = xC1S7 * irot_input_x */
- paddw mm1, mm2 ; /* Trucated */
- ;
- pmulhw mm3, mm7 ; /* mm3 = xC7S1 * irot_input_x */
- paddw mm3, mm2 ; /* Truncated */
- ;
- movq mm5, mm0 ;
- movq mm2, mm0 ;
- ;
- movq mm7, xC1S7 ;
- pmulhw mm0, mm7 ; /* mm0 = xC1S7 * irot_input_y - irot_input_y */
- ;
- movq mm7, xC7S1 ;
- psrlw mm2, 15 ;
- ;
- paddw mm0, mm5 ; /* mm0 = xC1S7 * irot_input_y */
- paddw mm0, mm2 ; /* Truncated */
- ;
- pmulhw mm5, mm7 ; /* mm5 = xC7S1 * irot_input_y */
- paddw mm5, mm2 ; /* Truncated */
- ;
- psubsw mm1, mm5 ; /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
- paddsw mm3, mm0 ; /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */
- ;
- movq [16 + eax], mm1 ;
- movq [48 + ebx], mm3 ;
- ; /* ------------------------------------------------------------------- */
- movq mm0, xC3S5 ;
- movq mm1, xC5S3 ;
- ;
- movq mm5, mm6 ;
- movq mm7, mm6 ;
- ;
- movq mm2, mm4 ;
- movq mm3, mm4 ;
- ;
- pmulhw mm4, mm0 ; /* mm4 = xC3S5 * irot_input_x - irot_input_x */
- pmulhw mm6, mm1 ; /* mm6 = xC5S3 * irot_input_y - irot_input_y */
- ;
- psrlw mm2, 15 ;
- psrlw mm5, 15 ;
- ;
- paddw mm4, mm3 ; /* mm4 = xC3S5 * irot_input_x */
- paddw mm6, mm7 ; /* mm6 = xC5S3 * irot_input_y */
- ;
- paddw mm4, mm2 ; /* Truncated */
- paddw mm6, mm5 ; /* Truncated */
- ;
- psubsw mm4, mm6 ; /* ip3 */
- movq [48 + eax], mm4 ;
- ;
- movq mm4, mm3 ;
- movq mm6, mm7 ;
- ;
- pmulhw mm3, mm1 ; /* mm3 = xC5S3 * irot_input_x - irot_input_x */
- pmulhw mm7, mm0 ; /* mm7 = xC3S5 * irot_input_y - irot_input_y */
- ;
- paddw mm4, mm2 ;
- paddw mm6, mm5 ;
- ;
- paddw mm3, mm4 ; /* mm3 = xC5S3 * irot_input_x */
- paddw mm7, mm6 ; /* mm7 = xC3S5 * irot_input_y */
- ;
- paddw mm3, mm7 ; /* ip5 */
- movq [16 + ebx], mm3 ;
+ movq mm0, [eax] ;
+ movq mm1, [16 + eax] ;
+ movq mm2, [48 + eax] ;
+ movq mm3, [16 + ebx] ;
+ movq mm4, mm0 ;
+ movq mm5, mm1 ;
+ movq mm6, mm2 ;
+ movq mm7, mm3 ;
+ ;
+ paddsw mm0, [48 + ebx] ; /* mm0 = ip0 + ip7 = is07 */
+ paddsw mm1, [32 + eax] ; /* mm1 = ip1 + ip2 = is12 */
+ paddsw mm2, [ebx] ; /* mm2 = ip3 + ip4 = is34 */
+ paddsw mm3, [32 + ebx] ; /* mm3 = ip5 + ip6 = is56 */
+ psubsw mm4, [48 + ebx] ; /* mm4 = ip0 - ip7 = id07 */
+ psubsw mm5, [32 + eax] ; /* mm5 = ip1 - ip2 = id12 */
+ ;
+ psubsw mm0, mm2 ; /* mm0 = is07 - is34 */
+ ;
+ paddsw mm2, mm2 ;
+ ;
+ psubsw mm6, [ebx] ; /* mm6 = ip3 - ip4 = id34 */
+ ;
+ paddsw mm2, mm0 ; /* mm2 = is07 + is34 = is0734 */
+ psubsw mm1, mm3 ; /* mm1 = is12 - is56 */
+ movq [ecx], mm0 ; /* Save is07 - is34 to free mm0; */
+ paddsw mm3, mm3 ;
+ paddsw mm3, mm1 ; /* mm3 = is12 + 1s56 = is1256 */
+ ;
+ psubsw mm7, [32 + ebx] ; /* mm7 = ip5 - ip6 = id56 */
+ ; /* ------------------------------------------------------------------- */
+ psubsw mm5, mm7 ; /* mm5 = id12 - id56 */
+ paddsw mm7, mm7 ;
+ paddsw mm7, mm5 ; /* mm7 = id12 + id56 */
+ ; /* ------------------------------------------------------------------- */
+ psubsw mm2, mm3 ; /* mm2 = is0734 - is1256 */
+ paddsw mm3, mm3 ;
+ ;
+ movq mm0, mm2 ; /* make a copy */
+ paddsw mm3, mm2 ; /* mm3 = is0734 + is1256 */
+ ;
+ pmulhw mm0, xC4S4 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
+ paddw mm0, mm2 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) */
+ psrlw mm2, 15 ;
+ paddw mm0, mm2 ; /* Truncate mm0, now it is op[4] */
+ ;
+ movq mm2, mm3 ;
+ movq [ebx], mm0 ; /* save ip4, now mm0,mm2 are free */
+ ;
+ movq mm0, mm3 ;
+ pmulhw mm3, xC4S4 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
+ ;
+ psrlw mm2, 15 ;
+ paddw mm3, mm0 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) */
+ paddw mm3, mm2 ; /* Truncate mm3, now it is op[0] */
+ ;
+ movq [eax], mm3 ;
+ ; /* ------------------------------------------------------------------- */
+ movq mm3, [ecx] ; /* mm3 = irot_input_y */
+ pmulhw mm3, xC2S6 ; /* mm3 = xC2S6 * irot_input_y - irot_input_y */
+ ;
+ movq mm2, [ecx] ;
+ movq mm0, mm2 ;
+ ;
+ psrlw mm2, 15 ; /* mm3 = xC2S6 * irot_input_y */
+ paddw mm3, mm0 ;
+ ;
+ paddw mm3, mm2 ; /* Truncated */
+ movq mm0, mm5 ;
+ ;
+ movq mm2, mm5 ;
+ pmulhw mm0, xC6S2 ; /* mm0 = xC6S2 * irot_input_x */
+ ;
+ psrlw mm2, 15 ;
+ paddw mm0, mm2 ; /* Truncated */
+ ;
+ paddsw mm3, mm0 ; /* ip[2] */
+ movq [32 + eax], mm3 ; /* Save ip2 */
+ ;
+ movq mm0, mm5 ;
+ movq mm2, mm5 ;
+ ;
+ pmulhw mm5, xC2S6 ; /* mm5 = xC2S6 * irot_input_x - irot_input_x */
+ psrlw mm2, 15 ;
+ ;
+ movq mm3, [ecx] ;
+ paddw mm5, mm0 ; /* mm5 = xC2S6 * irot_input_x */
+ ;
+ paddw mm5, mm2 ; /* Truncated */
+ movq mm2, mm3 ;
+ ;
+ pmulhw mm3, xC6S2 ; /* mm3 = xC6S2 * irot_input_y */
+ psrlw mm2, 15 ;
+ ;
+ paddw mm3, mm2 ; /* Truncated */
+ psubsw mm3, mm5 ;
+ ;
+ movq [32 + ebx], mm3 ;
+ ; /* ------------------------------------------------------------------- */
+ movq mm0, xC4S4 ;
+ movq mm2, mm1 ;
+ movq mm3, mm1 ;
+ ;
+ pmulhw mm1, mm0 ; /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
+ psrlw mm2, 15 ;
+ ;
+ paddw mm1, mm3 ; /* mm0 = xC4S4 * ( is12 - is56 ) */
+ paddw mm1, mm2 ; /* Truncate mm1, now it is icommon_product1 */
+ ;
+ movq mm2, mm7 ;
+ movq mm3, mm7 ;
+ ;
+ pmulhw mm7, mm0 ; /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
+ psrlw mm2, 15 ;
+ ;
+ paddw mm7, mm3 ; /* mm7 = xC4S4 * ( id12 + id56 ) */
+ paddw mm7, mm2 ; /* Truncate mm7, now it is icommon_product2 */
+ ; /* ------------------------------------------------------------------- */
+ pxor mm0, mm0 ; /* Clear mm0 */
+ psubsw mm0, mm6 ; /* mm0 = - id34 */
+ ;
+ psubsw mm0, mm7 ; /* mm0 = - ( id34 + idcommon_product2 ) */
+ paddsw mm6, mm6 ;
+ paddsw mm6, mm0 ; /* mm6 = id34 - icommon_product2 */
+ ;
+ psubsw mm4, mm1 ; /* mm4 = id07 - icommon_product1 */
+ paddsw mm1, mm1 ;
+ paddsw mm1, mm4 ; /* mm1 = id07 + icommon_product1 */
+ ; /* ------------------------------------------------------------------- */
+ movq mm7, xC1S7 ;
+ movq mm2, mm1 ;
+ ;
+ movq mm3, mm1 ;
+ pmulhw mm1, mm7 ; /* mm1 = xC1S7 * irot_input_x - irot_input_x */
+ ;
+ movq mm7, xC7S1 ;
+ psrlw mm2, 15 ;
+ ;
+ paddw mm1, mm3 ; /* mm1 = xC1S7 * irot_input_x */
+ paddw mm1, mm2 ; /* Trucated */
+ ;
+ pmulhw mm3, mm7 ; /* mm3 = xC7S1 * irot_input_x */
+ paddw mm3, mm2 ; /* Truncated */
+ ;
+ movq mm5, mm0 ;
+ movq mm2, mm0 ;
+ ;
+ movq mm7, xC1S7 ;
+ pmulhw mm0, mm7 ; /* mm0 = xC1S7 * irot_input_y - irot_input_y */
+ ;
+ movq mm7, xC7S1 ;
+ psrlw mm2, 15 ;
+ ;
+ paddw mm0, mm5 ; /* mm0 = xC1S7 * irot_input_y */
+ paddw mm0, mm2 ; /* Truncated */
+ ;
+ pmulhw mm5, mm7 ; /* mm5 = xC7S1 * irot_input_y */
+ paddw mm5, mm2 ; /* Truncated */
+ ;
+ psubsw mm1, mm5 ; /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
+ paddsw mm3, mm0 ; /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */
+ ;
+ movq [16 + eax], mm1 ;
+ movq [48 + ebx], mm3 ;
+ ; /* ------------------------------------------------------------------- */
+ movq mm0, xC3S5 ;
+ movq mm1, xC5S3 ;
+ ;
+ movq mm5, mm6 ;
+ movq mm7, mm6 ;
+ ;
+ movq mm2, mm4 ;
+ movq mm3, mm4 ;
+ ;
+ pmulhw mm4, mm0 ; /* mm4 = xC3S5 * irot_input_x - irot_input_x */
+ pmulhw mm6, mm1 ; /* mm6 = xC5S3 * irot_input_y - irot_input_y */
+ ;
+ psrlw mm2, 15 ;
+ psrlw mm5, 15 ;
+ ;
+ paddw mm4, mm3 ; /* mm4 = xC3S5 * irot_input_x */
+ paddw mm6, mm7 ; /* mm6 = xC5S3 * irot_input_y */
+ ;
+ paddw mm4, mm2 ; /* Truncated */
+ paddw mm6, mm5 ; /* Truncated */
+ ;
+ psubsw mm4, mm6 ; /* ip3 */
+ movq [48 + eax], mm4 ;
+ ;
+ movq mm4, mm3 ;
+ movq mm6, mm7 ;
+ ;
+ pmulhw mm3, mm1 ; /* mm3 = xC5S3 * irot_input_x - irot_input_x */
+ pmulhw mm7, mm0 ; /* mm7 = xC3S5 * irot_input_y - irot_input_y */
+ ;
+ paddw mm4, mm2 ;
+ paddw mm6, mm5 ;
+ ;
+ paddw mm3, mm4 ; /* mm3 = xC5S3 * irot_input_x */
+ paddw mm7, mm6 ; /* mm7 = xC3S5 * irot_input_y */
+ ;
+ paddw mm3, mm7 ; /* ip5 */
+ movq [16 + ebx], mm3 ;
};
@@ -329,6 +329,5 @@
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
Modified: trunk/theora/lib/enc/x86_32_vs/recon_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/recon_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_32_vs/recon_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -21,44 +21,44 @@
static const unsigned __int64 V128 = 0x8080808080808080;
static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
+ unsigned char *dest,
+ unsigned int stride)
{
//Is this even the fastest way to do this?
__asm {
- align 16
+ align 16
mov eax, src
mov ebx, dest
mov ecx, stride
- lea edi, [ecx + ecx * 2]
- movq mm0, [eax]
- movq mm1, [eax + ecx]
- movq mm2, [eax + ecx * 2]
- movq mm3, [eax + edi]
- lea eax, [eax + ecx * 4]
- movq [ebx], mm0
- movq [ebx + ecx], mm1
- movq [ebx + ecx * 2], mm2
- movq [ebx + edi], mm3
- lea ebx, [ebx + ecx * 4]
- movq mm0, [eax]
- movq mm1, [eax + ecx]
- movq mm2, [eax + ecx * 2]
- movq mm3, [eax + edi]
- movq [ebx], mm0
- movq [ebx + ecx], mm1
- movq [ebx + ecx * 2], mm2
- movq [ebx + edi], mm3
+ lea edi, [ecx + ecx * 2]
+ movq mm0, [eax]
+ movq mm1, [eax + ecx]
+ movq mm2, [eax + ecx * 2]
+ movq mm3, [eax + edi]
+ lea eax, [eax + ecx * 4]
+ movq [ebx], mm0
+ movq [ebx + ecx], mm1
+ movq [ebx + ecx * 2], mm2
+ movq [ebx + edi], mm3
+ lea ebx, [ebx + ecx * 4]
+ movq mm0, [eax]
+ movq mm1, [eax + ecx]
+ movq mm2, [eax + ecx * 2]
+ movq mm3, [eax + edi]
+ movq [ebx], mm0
+ movq [ebx + ecx], mm1
+ movq [ebx + ecx * 2], mm2
+ movq [ebx + edi], mm3
};
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ ogg_uint32_t LineStep)
{
__asm {
@@ -68,28 +68,28 @@
mov ebx, ChangePtr
mov ecx, LineStep
- movq mm0, V128
+ movq mm0, V128
- lea edi, [128 + ebx]
- loop_start:
- movq mm2, [ebx]
+ lea edi, [128 + ebx]
+ loop_start:
+ movq mm2, [ebx]
- packsswb mm2, [8 + ebx]
- por mm0, mm0
- pxor mm2, mm0
- lea ebx, [16 + ebx]
- cmp ebx, edi
+ packsswb mm2, [8 + ebx]
+ por mm0, mm0
+ pxor mm2, mm0
+ lea ebx, [16 + ebx]
+ cmp ebx, edi
- movq [eax], mm2
+ movq [eax], mm2
- lea eax, [eax + ecx]
- jc loop_start
+ lea eax, [eax + ecx]
+ jc loop_start
};
-
+
}
@@ -97,7 +97,7 @@
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm {
@@ -108,29 +108,29 @@
mov ebx, ChangePtr
mov ecx, LineStep
mov edx, RefPtr
-
- pxor mm0, mm0
- lea edi, [128 + ebx]
+ pxor mm0, mm0
+ lea edi, [128 + ebx]
+
loop_start:
- movq mm2, [edx]
+ movq mm2, [edx]
- movq mm4, [ebx]
- movq mm3, mm2
- movq mm5, [8 + ebx]
- punpcklbw mm2, mm0
- paddsw mm2, mm4
- punpckhbw mm3, mm0
- paddsw mm3, mm5
- add edx, ecx
- packuswb mm2, mm3
- lea ebx, [16 + ebx]
- cmp ebx, edi
+ movq mm4, [ebx]
+ movq mm3, mm2
+ movq mm5, [8 + ebx]
+ punpcklbw mm2, mm0
+ paddsw mm2, mm4
+ punpckhbw mm3, mm0
+ paddsw mm3, mm5
+ add edx, ecx
+ packuswb mm2, mm3
+ lea ebx, [16 + ebx]
+ cmp ebx, edi
- movq [eax], mm2
+ movq [eax], mm2
- lea eax, [eax + ecx]
- jc loop_start
+ lea eax, [eax + ecx]
+ jc loop_start
};
}
@@ -139,8 +139,8 @@
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
{
__asm {
align 16
@@ -149,36 +149,36 @@
mov ebx, ChangePtr
mov ecx, RefPtr1
mov edx, RefPtr2
-
- pxor mm0, mm0
- lea edi, [128 + ebx]
+ pxor mm0, mm0
+ lea edi, [128 + ebx]
+
loop_start:
- movq mm2, [ecx]
- movq mm4, [edx]
+ movq mm2, [ecx]
+ movq mm4, [edx]
- movq mm3, mm2
- punpcklbw mm2, mm0
- movq mm5, mm4
- movq mm6, [ebx]
- punpckhbw mm3, mm0
- movq mm7, [8 + ebx]
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- paddw mm2, mm4
- paddw mm3, mm5
- psrlw mm2, 1
- psrlw mm3, 1
- paddw mm2, mm6
- paddw mm3, mm7
- lea ebx, [16 + ebx]
- packuswb mm2, mm3
- add ecx, LineStep
- add edx, LineStep
- movq [eax], mm2
- add eax, LineStep
- cmp ebx, edi
- jc loop_start
+ movq mm3, mm2
+ punpcklbw mm2, mm0
+ movq mm5, mm4
+ movq mm6, [ebx]
+ punpckhbw mm3, mm0
+ movq mm7, [8 + ebx]
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ paddw mm2, mm4
+ paddw mm3, mm5
+ psrlw mm2, 1
+ psrlw mm3, 1
+ paddw mm2, mm6
+ paddw mm3, mm7
+ lea ebx, [16 + ebx]
+ packuswb mm2, mm3
+ add ecx, LineStep
+ add edx, LineStep
+ movq [eax], mm2
+ add eax, LineStep
+ cmp ebx, edi
+ jc loop_start
};
@@ -189,7 +189,6 @@
void dsp_mmx_recon_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
funcs->recon_inter8x8 = recon_inter8x8__mmx;
Modified: trunk/theora/lib/enc/x86_64/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dct_decode_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/dct_decode_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -27,7 +27,7 @@
0x0004000400040004LL;
static void loop_filter_v(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
_pix-=_ystride*2;
__asm__ __volatile__(
@@ -210,7 +210,7 @@
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
static void loop_filter_h4(unsigned char *_pix,long _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
long esi;
long edi;
__asm__ __volatile__(
@@ -343,12 +343,12 @@
}
static void loop_filter_h(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
+ const ogg_int16_t *_ll){
_pix-=2;
loop_filter_h4(_pix,_ystride,_ll);
loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
}
-
+
static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){
int j;
ogg_int16_t __attribute__((aligned(8))) ll[4];
@@ -359,7 +359,7 @@
ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
for ( j = 0; j < 3 ; j++){
- ogg_uint32_t *bp_begin = bp;
+ ogg_uint32_t *bp_begin = bp;
ogg_uint32_t *bp_end;
int stride;
int h;
@@ -376,23 +376,23 @@
stride = pbi->UVStride;
break;
}
-
+
while(bp<bp_end){
ogg_uint32_t *bp_left = bp;
ogg_uint32_t *bp_right = bp + h;
while(bp<bp_right){
- if(cp[0]){
- if(bp>bp_left)
- loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp_left>bp_begin)
- loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
- if(bp+1<bp_right && !cp[1])
- loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
- if(bp+h<bp_end && !cp[h])
- loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
- }
- bp++;
- cp++;
+ if(cp[0]){
+ if(bp>bp_left)
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp_left>bp_begin)
+ loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
+ if(bp+1<bp_right && !cp[1])
+ loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
+ if(bp+h<bp_end && !cp[h])
+ loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
+ }
+ bp++;
+ cp++;
}
}
}
Modified: trunk/theora/lib/enc/x86_64/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dsp_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/dsp_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -32,12 +32,12 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
".rept 8 \n\t"
" movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
@@ -64,20 +64,20 @@
"+r" (ReconPtr),
"+r" (DctInputPtr)
: "r" ((ogg_uint64_t)PixelsPerLine),
- "r" ((ogg_uint64_t)ReconPixelsPerLine)
+ "r" ((ogg_uint64_t)ReconPixelsPerLine)
: "memory"
);
}
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
+ ogg_uint32_t PixelsPerLine)
{
ogg_uint64_t ppl = PixelsPerLine;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
".rept 8 \n\t"
@@ -107,12 +107,12 @@
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
".rept 8 \n\t"
" movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
@@ -149,7 +149,7 @@
"+r" (ReconPtr2),
"+r" (DctInputPtr)
: "r" ((ogg_uint64_t)PixelsPerLine),
- "r" ((ogg_uint64_t)ReconPixelsPerLine)
+ "r" ((ogg_uint64_t)ReconPixelsPerLine)
: "memory"
);
}
@@ -167,7 +167,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%rdi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
" punpcklbw %%mm6, %%mm0 \n\t"
@@ -178,11 +178,11 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %3, %2 \n\t" /* Inc pointer into src data */
+ " add %3, %2 \n\t" /* Inc pointer into src data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
@@ -204,7 +204,7 @@
: "=r" (XSum),
"=r" (XXSum),
- "+r" (DataPtr)
+ "+r" (DataPtr)
: "r" ((ogg_uint64_t)Stride)
: "rdi", "memory"
);
@@ -214,7 +214,7 @@
}
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
ogg_uint64_t XSum;
ogg_uint64_t XXSum;
@@ -227,7 +227,7 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%rdi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
" movq %%mm1, %%mm3 \n\t"
@@ -245,12 +245,12 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %2 \n\t" /* Inc pointer into src data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
@@ -272,8 +272,8 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr)
+ "+r" (SrcData),
+ "+r" (RefDataPtr)
: "r" ((ogg_uint64_t)SrcStride),
"r" ((ogg_uint64_t)RefStride)
: "rdi", "memory"
@@ -292,7 +292,6 @@
void dsp_mmx_init(DspFunctions *funcs)
{
- TH_DEBUG("setting accelerated x86_64 mmx dsp functions.\n");
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
Modified: trunk/theora/lib/enc/x86_64/dsp_mmxext.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dsp_mmxext.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/dsp_mmxext.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -31,26 +31,26 @@
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
".rept 7 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
".endr \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" ((ogg_uint64_t)stride1),
"r" ((ogg_uint64_t)stride2)
: "memory"
@@ -60,29 +60,29 @@
}
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
".endr \n\t"
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
+ "+r" (ptr1),
+ "+r" (ptr2)
: "r" ((ogg_uint64_t)stride1),
"r" ((ogg_uint64_t)stride2)
: "memory"
@@ -100,25 +100,25 @@
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" movq (%3), %%mm2 \n\t"
" pavgb %%mm2, %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %5, %2 \n\t" /* Inc pointer into ref data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
".endr \n\t"
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (SrcData),
+ "+r" (RefDataPtr1),
+ "+r" (RefDataPtr2)
: "r" ((ogg_uint64_t)SrcStride),
"r" ((ogg_uint64_t)RefStride)
: "memory"
@@ -126,7 +126,7 @@
return DiffVal;
}
-
+
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
ogg_uint32_t MaxSad;
@@ -146,8 +146,8 @@
" andl $0xffff, %0 \n\t"
: "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
:
: "memory"
);
@@ -156,56 +156,56 @@
}
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint32_t stride)
{
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%rdi \n\t" /* 4 rows */
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%rdi \n\t" /* 4 rows */
"1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
- " mov $4, %%rdi \n\t" /* 4 rows */
+ " mov $4, %%rdi \n\t" /* 4 rows */
"2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
" movq %%mm0, %%mm1 \n\t"
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
" dec %%rdi \n\t"
" jnz 2b \n\t"
@@ -223,8 +223,8 @@
" andl $0xffff, %0 \n\t"
: "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
+ "+r" (Src1),
+ "+r" (Src2)
: "r" ((ogg_uint64_t)stride)
: "memory", "rdi"
);
@@ -248,10 +248,10 @@
" pxor %%mm7, %%mm7 \n\t"
" mov $8, %%rdi \n\t"
"1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
" pavgb %%mm2, %%mm1 \n\t"
" movq %%mm0, %%mm2 \n\t"
@@ -270,13 +270,13 @@
" pmaddwd %%mm0, %%mm0 \n\t"
" pmaddwd %%mm2, %%mm2 \n\t"
-
+
" paddd %%mm0, %%mm7 \n\t"
" paddd %%mm2, %%mm7 \n\t"
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
" dec %%rdi \n\t"
" jnz 1b \n\t"
@@ -298,9 +298,9 @@
: "=m" (XSum),
"=m" (XXSum),
- "+r" (SrcData),
+ "+r" (SrcData),
"+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
+ "+r" (RefDataPtr2)
: "r" ((ogg_uint64_t)SrcStride),
"r" ((ogg_uint64_t)RefStride)
: "rdi", "memory"
@@ -312,7 +312,6 @@
void dsp_mmxext_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accerated x86_64 mmxext dsp functions.\n");
funcs->row_sad8 = row_sad8__mmxext;
funcs->col_sad8x8 = col_sad8x8__mmxext;
funcs->sad8x8 = sad8x8__mmxext;
Modified: trunk/theora/lib/enc/x86_64/fdct_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/fdct_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/fdct_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -62,7 +62,7 @@
" psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
" movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
" paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
\
" psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
/* ------------------------------------------------------------------- */ \
@@ -88,7 +88,7 @@
" pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
\
" psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
" paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
\
" movq %%mm3," #ip0 " \n\t" \
@@ -139,16 +139,16 @@
" movq %%mm1, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
" paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
\
" movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
+ " movq %%mm7, %%mm3 \n\t" \
\
" pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
\
" paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
" paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
@@ -237,10 +237,10 @@
" paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
\
" paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
+ " movq %%mm3," #ip5 " \n\t"
#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
" movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
" movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
" movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
@@ -254,9 +254,9 @@
" movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
" punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
" movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
" movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
" movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
" punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
" punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
@@ -302,23 +302,23 @@
* we will transpose the block of data to two 4x8 blocks???
*/
Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
" emms \n\t"
-
+
: "+r" (InputData),
"+r" (OutputData)
: "r" (temp),
@@ -336,7 +336,6 @@
/* install our implementation in the function table */
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_64 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
Modified: trunk/theora/lib/enc/x86_64/recon_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/recon_mmx.c 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/enc/x86_64/recon_mmx.c 2008-08-04 18:37:55 UTC (rev 15153)
@@ -37,14 +37,14 @@
" movq (%1, %2, 2), %%mm2 \n\t"
" movq (%1, %%rdi), %%mm3 \n\t"
- " lea (%1, %2, 4), %1 \n\t"
+ " lea (%1, %2, 4), %1 \n\t"
" movq %%mm0, (%0) \n\t"
" movq %%mm1, (%0, %2) \n\t"
" movq %%mm2, (%0, %2, 2) \n\t"
" movq %%mm3, (%0, %%rdi) \n\t"
- " lea (%0, %2, 4), %0 \n\t"
+ " lea (%0, %2, 4), %0 \n\t"
" movq (%1), %%mm0 \n\t"
" movq (%1, %2), %%mm1 \n\t"
@@ -71,11 +71,11 @@
" movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
" lea 128(%1), %%rdi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
+ "1: \n\t"
" movq (%1), %%mm2 \n\t" /* First four input values */
" packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t"
+ " por %%mm0, %%mm0 \n\t"
" pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
" lea 16(%1), %1 \n\t" /* Step source buffer */
" cmp %%rdi, %1 \n\t" /* are we done */
@@ -175,7 +175,6 @@
void dsp_mmx_recon_init(DspFunctions *funcs)
{
- TH_DEBUG("enabling accelerated x86_64 mmx recon functions.\n");
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
funcs->recon_inter8x8 = recon_inter8x8__mmx;
Modified: trunk/theora/lib/internal.h
===================================================================
--- trunk/theora/lib/internal.h 2008-08-04 12:43:26 UTC (rev 15152)
+++ trunk/theora/lib/internal.h 2008-08-04 18:37:55 UTC (rev 15153)
@@ -27,16 +27,6 @@
# include "dec/huffman.h"
# include "dec/quant.h"
-/* debug macros */
-#ifdef _TH_DEBUG_
-#include <stdio.h>
-extern long dframe;
-extern FILE *debugout;
-#define TH_DEBUG(...) fprintf(debugout, __VA_ARGS__)
-#else
-#define TH_DEBUG(...)
-#endif
-
/*Thank you Microsoft, I know the order of operations.*/
# if defined(_MSC_VER)
# pragma warning(disable:4554) /* order of operations */
@@ -238,14 +228,6 @@
oc_border_info *border;
/*The motion vector used for this fragment.*/
oc_mv mv;
-
-#ifdef _TH_DEBUG_
- int quant[64];
- int freq[64];
- int time[64];
- int recon[64];
- int loop[64];
-#endif
}oc_fragment;
@@ -296,77 +278,77 @@
/*Common state information between the encoder and decoder.*/
struct oc_theora_state{
/*The stream information.*/
- th_info info;
+ th_info info;
/*Table for shared accelerated functions.*/
- oc_base_opt_vtable opt_vtable;
+ oc_base_opt_vtable opt_vtable;
/*CPU flags to detect the presence of extended instruction sets.*/
- ogg_uint32_t cpu_flags;
+ ogg_uint32_t cpu_flags;
/*The fragment plane descriptions.*/
- oc_fragment_plane fplanes[3];
+ oc_fragment_plane fplanes[3];
/*The total number of fragments in a single frame.*/
- int nfrags;
+ int nfrags;
/*The list of fragments, indexed in image order.*/
- oc_fragment *frags;
+ oc_fragment *frags;
/*The total number of super blocks in a single frame.*/
- int nsbs;
+ int nsbs;
/*The list of super blocks, indexed in image order.*/
- oc_sb *sbs;
+ oc_sb *sbs;
/*The number of macro blocks in the X direction.*/
- int nhmbs;
+ int nhmbs;
/*The number of macro blocks in the Y direction.*/
- int nvmbs;
+ int nvmbs;
/*The total number of macro blocks.*/
- int nmbs;
+ int nmbs;
/*The list of macro blocks, indexed in super block order.
That is, the macro block corresponding to the macro block mbi in (luma
plane) super block sbi is (sbi<<2|mbi).*/
- oc_mb *mbs;
+ oc_mb *mbs;
/*The list of coded fragments, in coded order.*/
- int *coded_fragis;
+ int *coded_fragis;
/*The number of coded fragments in each plane.*/
- int ncoded_fragis[3];
+ int ncoded_fragis[3];
/*The list of uncoded fragments.
This just past the end of the list, which is in reverse order, and
uses the same block of allocated storage as the coded_fragis list.*/
- int *uncoded_fragis;
+ int *uncoded_fragis;
/*The number of uncoded fragments in each plane.*/
- int nuncoded_fragis[3];
+ int nuncoded_fragis[3];
/*The list of coded macro blocks in the Y plane, in coded order.*/
- int *coded_mbis;
+ int *coded_mbis;
/*The number of coded macro blocks in the Y plane.*/
- int ncoded_mbis;
+ int ncoded_mbis;
/*A copy of the image data used to fill the input pointers in each fragment.
If the data pointers or strides change, these input pointers must be
re-populated.*/
- th_ycbcr_buffer input;
+ th_ycbcr_buffer input;
/*The number of unique border patterns.*/
- int nborders;
+ int nborders;
/*The storage for the border info for all border fragments.
This data is pointed to from the appropriate fragments.*/
- oc_border_info borders[16];
+ oc_border_info borders[16];
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
- int ref_frame_idx[3];
+ int ref_frame_idx[3];
/*The actual buffers used for the previously decoded frames.*/
- th_ycbcr_buffer ref_frame_bufs[3];
+ th_ycbcr_buffer ref_frame_bufs[3];
/*The storage for the reference frame buffers.*/
- unsigned char *ref_frame_data;
+ unsigned char *ref_frame_data;
/*The frame number of the last keyframe.*/
- ogg_int64_t keyframe_num;
+ ogg_int64_t keyframe_num;
/*The frame number of the current frame.*/
- ogg_int64_t curframe_num;
+ ogg_int64_t curframe_num;
/*The granpos of the current frame.*/
- ogg_int64_t granpos;
+ ogg_int64_t granpos;
/*The type of the current frame.*/
- int frame_type;
+ int frame_type;
/*The quality indices of the current frame.*/
- int qis[3];
+ int qis[3];
/*The number of quality indices used in the current frame.*/
- int nqis;
+ int nqis;
/*The dequantization tables.*/
- oc_quant_table *dequant_tables[2][3];
- oc_quant_tables dequant_table_data[2][3];
+ oc_quant_table *dequant_tables[2][3];
+ oc_quant_tables dequant_table_data[2][3];
/*Loop filter strength parameters.*/
- unsigned char loop_filter_limits[64];
+ unsigned char loop_filter_limits[64];
};