[xiph-commits] r11560 - in branches/theora-playtime: lib lib/x86_32_vs win32/VS2005/encoder_example win32/VS2005/libtheora

Sun Jun 11 10:57:25 PDT 2006

Author: illiminable
Date: 2006-06-11 10:56:24 -0700 (Sun, 11 Jun 2006)
New Revision: 11560

Added:
   branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
Modified:
   branches/theora-playtime/lib/codec_internal.h
   branches/theora-playtime/lib/dct_decode.c
   branches/theora-playtime/lib/dct_encode.c
   branches/theora-playtime/lib/dsp.c
   branches/theora-playtime/lib/dsp.h
   branches/theora-playtime/lib/encoder_toplevel.c
   branches/theora-playtime/lib/idct.c
   branches/theora-playtime/lib/quant.c
   branches/theora-playtime/lib/reconstruct.c
   branches/theora-playtime/lib/scan.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.h
   branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
   branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Force alignment of arrays in pb_instance, and alignment of all structs containing pb_instances
* Add a macro, so you can have asm enabled, but sse2 forcibly disabled, i.e. just mmx if it's available
* sse2 implementation of quantize (still slightly incorrect, and not that much faster)
* Alignments of dct_codes



Modified: branches/theora-playtime/lib/codec_internal.h
===================================================================

--- branches/theora-playtime/lib/codec_internal.h	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/codec_internal.h	2006-06-11 17:56:24 UTC (rev 11560)
@@ -453,9 +453,9 @@
 
   ogg_int16_t   *DequantBuffer;
 
-  ogg_int32_t    fp_quant_InterUV_coeffs[64];
-  ogg_int32_t    fp_quant_InterUV_round[64];
-  ogg_int32_t    fp_ZeroBinSize_InterUV[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_InterUV_coeffs[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_InterUV_round[64];
+  __declspec(align(16)) ogg_int32_t    fp_ZeroBinSize_InterUV[64];
 
   ogg_int16_t   *TmpReconBuffer;
   ogg_int16_t   *TmpDataBuffer;
@@ -467,12 +467,12 @@
   /* Dequantiser and rounding tables */
   ogg_uint32_t   QThreshTable[Q_TABLE_SIZE];
   Q_LIST_ENTRY   DcScaleFactorTable[Q_TABLE_SIZE];
-  Q_LIST_ENTRY   Y_coeffs[64];
-  Q_LIST_ENTRY   U_coeffs[64];
-  Q_LIST_ENTRY   V_coeffs[64];
-  Q_LIST_ENTRY   InterY_coeffs[64];
-  Q_LIST_ENTRY   InterU_coeffs[64];
-  Q_LIST_ENTRY   InterV_coeffs[64];
+  __declspec(align(16)) Q_LIST_ENTRY   Y_coeffs[64];
+  __declspec(align(16)) Q_LIST_ENTRY   U_coeffs[64];
+  __declspec(align(16)) Q_LIST_ENTRY   V_coeffs[64];
+  __declspec(align(16)) Q_LIST_ENTRY   InterY_coeffs[64];
+  __declspec(align(16)) Q_LIST_ENTRY   InterU_coeffs[64];
+  __declspec(align(16)) Q_LIST_ENTRY   InterV_coeffs[64];
   Q_LIST_ENTRY  *dequant_Y_coeffs;
   Q_LIST_ENTRY  *dequant_U_coeffs;
   Q_LIST_ENTRY  *dequant_V_coeffs;
@@ -480,10 +480,10 @@
   Q_LIST_ENTRY  *dequant_InterU_coeffs;
   Q_LIST_ENTRY  *dequant_InterV_coeffs;
   Q_LIST_ENTRY  *dequant_coeffs;        /* currently active quantizer */
-  unsigned int   zigzag_index[64];
-  ogg_int32_t    quant_Y_coeffs[64];
-  ogg_int32_t    quant_UV_coeffs[64];
-  ogg_int32_t    fp_quant_Y_coeffs[64]; /* used in reiniting quantizers */
+  __declspec(align(16)) unsigned int   zigzag_index[64];
+  __declspec(align(16)) ogg_int32_t    quant_Y_coeffs[64];
+  __declspec(align(16)) ogg_int32_t    quant_UV_coeffs[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_Y_coeffs[64]; /* used in reiniting quantizers */
 
   HUFF_ENTRY    *HuffRoot_VP3x[NUM_HUFF_TABLES];
   ogg_uint32_t  *HuffCodeArray_VP3x[NUM_HUFF_TABLES];
@@ -491,14 +491,14 @@
   const unsigned char *ExtraBitLengths_VP3x;
 
   /* Quantiser and rounding tables */
-  ogg_int32_t    fp_quant_UV_coeffs[64];
-  ogg_int32_t    fp_quant_Inter_coeffs[64];
-  ogg_int32_t    fp_quant_Y_round[64];
-  ogg_int32_t    fp_quant_UV_round[64];
-  ogg_int32_t    fp_quant_Inter_round[64];
-  ogg_int32_t    fp_ZeroBinSize_Y[64];
-  ogg_int32_t    fp_ZeroBinSize_UV[64];
-  ogg_int32_t    fp_ZeroBinSize_Inter[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_UV_coeffs[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_Inter_coeffs[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_Y_round[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_UV_round[64];
+  __declspec(align(16)) ogg_int32_t    fp_quant_Inter_round[64];
+  __declspec(align(16)) ogg_int32_t    fp_ZeroBinSize_Y[64];
+  __declspec(align(16)) ogg_int32_t    fp_ZeroBinSize_UV[64];
+  __declspec(align(16)) ogg_int32_t    fp_ZeroBinSize_Inter[64];
   ogg_int32_t   *fquant_coeffs;
   ogg_int32_t   *fquant_round;
   ogg_int32_t   *fquant_ZbSize;
@@ -688,8 +688,12 @@
 
   /* instances (used for reconstructing buffers and to hold tokens etc.) */
   PP_INSTANCE       pp;   /* preprocessor */
-  PB_INSTANCE       pb;   /* playback */
 
+  /* ZEN::: Since there are things in PB_INSTANCE that are 16 byte aligned, we
+        have to make sure the base address is also 16 byte aligned or the alignment
+        won't work */
+  __declspec(align(16))PB_INSTANCE       pb;   /* playback */
+
   /* ogg bitpacker for use in packet coding, other API state */
   oggpack_buffer   *oggbuffer;
 #ifdef LIBOGG2  /* Remember, this is just until we drop libogg1 */

Modified: branches/theora-playtime/lib/dct_decode.c
===================================================================
--- branches/theora-playtime/lib/dct_decode.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dct_decode.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -1242,9 +1242,10 @@
   //if (cpu_flags & CPU_X86_MMX) {
   //  dsp_mmx_idct_init(funcs);
   //}
-
+#ifndef USE_NO_SSE2
   if (cpu_flags & CPU_X86_SSE2) {
     dsp_sse2_dct_decode_init(funcs);
   }
 #endif
+#endif
 }

Modified: branches/theora-playtime/lib/dct_encode.c
===================================================================
--- branches/theora-playtime/lib/dct_encode.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dct_encode.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -450,7 +450,7 @@
   dsp_fdct_short(cpi->dsp, cpi->DCTDataBuffer, cpi->DCT_codes );
 
   /* Quantize that transform data. */
-  quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
+  dsp_quant_quantize (cpi->dsp, &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
 
   if ( (cpi->pb.CodingMode == CODE_INTER_NO_MV) &&
        ( AllZeroDctData(cpi->pb.QFragData[FragIndex]) ) ) {

Modified: branches/theora-playtime/lib/dsp.c
===================================================================
--- branches/theora-playtime/lib/dsp.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dsp.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -412,6 +412,7 @@
   dsp_idct_init (funcs, cpuflags);
   dsp_dct_decode_init(funcs, cpuflags);
   dsp_scan_init(funcs, cpuflags);
+  dsp_quant_init(funcs, cpuflags);
 #if defined(USE_ASM)
   if (cpuflags & CPU_X86_MMX) {
     dsp_mmx_init(funcs);
@@ -423,12 +424,12 @@
     dsp_mmxext_init(funcs);
   }
 # endif
-
+#ifndef USE_NO_SSE2
   if (cpuflags & CPU_X86_SSE2) {
     dsp_sse2_init(funcs);
   }
+#endif
 
-
 #endif
 }
 

Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dsp.h	2006-06-11 17:56:24 UTC (rev 11560)
@@ -22,7 +22,10 @@
 
 
 struct PP_INSTANCE;
+struct PB_INSTANCE;
 
+//typedef ogg_int16_t Q_LIST_ENTRY;
+
 typedef unsigned long int ogg_uint64_t;
 
 typedef struct
@@ -96,12 +99,12 @@
              ogg_int16_t *QuantMatrix,
              ogg_int16_t * OutputData );
 
-  void (*IDctSlow)(  ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
+  void (*IDctSlow)(  ogg_int16_t /*Q_LIST_ENTRY*/ * InputData,
                 ogg_int16_t *QuantMatrix,
                 ogg_int16_t * OutputData );
 
   void (*dequant_slow)( ogg_int16_t * dequant_coeffs,
-                   ogg_int16_t * quantized_list,
+                   ogg_int16_t /*Q_LIST_ENTRY*/ * quantized_list,
                    ogg_int32_t * DCT_block);
 
   /* dct_decode */
@@ -114,18 +117,24 @@
                 ogg_int32_t *BoundingValuePtr);
 
 
-  void (*RowDiffScan)( struct PP_INSTANCE *ppi,
-                         unsigned char * YuvPtr1,
-                         unsigned char * YuvPtr2,
-                         ogg_int16_t   * YUVDiffsPtr,
-                         unsigned char * bits_map_ptr,
-                         signed char   * SgcPtr,
-                         signed char   * DispFragPtr,
-                         unsigned char * FDiffPixels,
-                         ogg_int32_t   * RowDiffsPtr,
+  /* Scan */
+  void (*RowDiffScan)( struct PP_INSTANCE *ppi,
+                         unsigned char * YuvPtr1,
+                         unsigned char * YuvPtr2,
+                         ogg_int16_t   * YUVDiffsPtr,
+                         unsigned char * bits_map_ptr,
+                         signed char   * SgcPtr,
+                         signed char   * DispFragPtr,
+                         unsigned char * FDiffPixels,
+                         ogg_int32_t   * RowDiffsPtr,
                          unsigned char * ChLocalsPtr, int EdgeRow );
 
+  /* Quant */
+  void (*quantize)( struct PB_INSTANCE *pbi,
+               ogg_int16_t * DCT_block,
+               ogg_int16_t /*Q_LIST_ENTRY*/ * quantized_list);
 
+
 } DspFunctions;
 
 extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
@@ -133,6 +142,7 @@
 extern void dsp_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
 extern void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
 extern void dsp_scan_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
+extern void dsp_quant_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
 
 
 void dsp_init(DspFunctions *funcs);
@@ -148,6 +158,7 @@
 extern void dsp_sse2_idct_init(DspFunctions *funcs);
 extern void dsp_sse2_dct_decode_init(DspFunctions *funcs);
 extern void dsp_sse2_scan_init(DspFunctions *funcs);
+extern void dsp_sse2_quant_init(DspFunctions *funcs);
 
 #endif
 
@@ -216,5 +227,8 @@
 #define dsp_scan_row_diff_scan(funcs, ptr1, ptr2, ptr3, ptr4, ptr5, ptr6, ptr7, ptr8, ptr9, ptr10, a1)  \
             (funcs.RowDiffScan(ptr1, ptr2, ptr3, ptr4, ptr5, ptr6, ptr7, ptr8, ptr9, ptr10, a1))
 
+#define dsp_quant_quantize(funcs, ptr1, ptr2, ptr3) /* call the quantize entry of the DspFunctions table 'funcs' */ \
+            ((funcs).quantize((ptr1), (ptr2), (ptr3)))
 
+
 #endif /* DSP_H */

Modified: branches/theora-playtime/lib/encoder_toplevel.c
===================================================================
--- branches/theora-playtime/lib/encoder_toplevel.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/encoder_toplevel.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -47,7 +47,7 @@
   if(cpi->MVList)
     _ogg_free(cpi->MVList);
   if(cpi->DCT_codes )
-    _ogg_free( cpi->DCT_codes );
+    _theora_16_byte_aligned_free( cpi->DCT_codes );
   if(cpi->DCTDataBuffer )
     _theora_16_byte_aligned_free( cpi->DCTDataBuffer);
   if(cpi->quantized_list)
@@ -137,7 +137,7 @@
     _ogg_malloc(cpi->pb.UnitFragments*
                 sizeof(cpi->MVList));
   cpi->DCT_codes =
-    _ogg_malloc(64*
+    _theora_16_byte_aligned_malloc(64*
                 sizeof(*cpi->DCT_codes));
   cpi->DCTDataBuffer =
     _theora_16_byte_aligned_malloc(64*
@@ -777,7 +777,12 @@
   memset(th, 0, sizeof(*th));
   /*Currently only the 4:2:0 format is supported.*/
   if(c->pixelformat!=OC_PF_420)return OC_IMPL;
-  th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
+  
+  /* th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi)); */
+  /* ZEN::: Need 16 byte alignment and calloc doesn't cut it */
+  th->internal_encode=cpi=_theora_16_byte_aligned_malloc(sizeof(*cpi));
+  memset(cpi, 0, sizeof(*cpi));
+  /* Equivalent of calloc */
 
   dsp_static_init (&cpi->dsp);
   memcpy (&cpi->pb.dsp, &cpi->dsp, sizeof(DspFunctions));
@@ -1216,6 +1221,7 @@
     
     oggpackB_writeclear(cpi->oggbuffer);
     _ogg_free(cpi->oggbuffer);
-    _ogg_free(cpi);
+    /*_ogg_free(cpi); */
+    _theora_16_byte_aligned_free(cpi);
   }
 }

Modified: branches/theora-playtime/lib/idct.c
===================================================================
--- branches/theora-playtime/lib/idct.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/idct.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -572,9 +572,10 @@
   //if (cpu_flags & CPU_X86_MMX) {
   //  dsp_mmx_idct_init(funcs);
   //}
-
+#ifndef USE_NO_SSE2
   if (cpu_flags & CPU_X86_SSE2) {
     dsp_sse2_idct_init(funcs);
   }
 #endif
+#endif
 }
\ No newline at end of file

Modified: branches/theora-playtime/lib/quant.c
===================================================================
--- branches/theora-playtime/lib/quant.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/quant.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -17,6 +17,8 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include "dsp.h"
+#include "cpu.h"
 #include "codec_internal.h"
 #include "quant_lookup.h"
 
@@ -560,7 +562,7 @@
   pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_InterUV;
 }
 
-void quantize( PB_INSTANCE *pbi,
+void quantize__c( PB_INSTANCE *pbi,
                ogg_int16_t * DCT_block,
                Q_LIST_ENTRY * quantized_list){
   ogg_uint32_t  i;              /* Row index */
@@ -871,3 +873,22 @@
   init_quantizer ( cpi, qscale, (unsigned char) pbi->FrameQIndex );
   init_dequantizer ( pbi, qscale, (unsigned char) pbi->FrameQIndex );
 }
+
+/* Fill in the quantize entry of the DSP dispatch table.
+ *
+ * funcs     - dispatch table to update.
+ * cpu_flags - CPU capability bits; only CPU_X86_SSE2 is examined here.
+ */
+void dsp_quant_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+  /* The portable C quantizer is installed unconditionally. */
+  funcs->quantize = quantize__c;
+
+#if defined(USE_ASM) && !defined(USE_NO_SSE2)
+  /* Builds with USE_ASM but without USE_NO_SSE2 let the SSE2 init routine
+     take over the entry when the CPU flags report SSE2. */
+  if ((cpu_flags & CPU_X86_SSE2) != 0) {
+    dsp_sse2_quant_init(funcs);
+  }
+#endif
+}
\ No newline at end of file

Modified: branches/theora-playtime/lib/reconstruct.c
===================================================================
--- branches/theora-playtime/lib/reconstruct.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/reconstruct.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -108,9 +108,10 @@
   if (cpu_flags & CPU_X86_MMX) {
     dsp_mmx_recon_init(funcs);
   }
-
+#ifndef USE_NO_SSE2
   if (cpu_flags & CPU_X86_SSE2) {
     dsp_sse2_recon_init(funcs);
   }
 #endif
+#endif
 }

Modified: branches/theora-playtime/lib/scan.c
===================================================================
--- branches/theora-playtime/lib/scan.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/scan.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -2313,9 +2313,10 @@
   //if (cpu_flags & CPU_X86_MMX) {
   //  dsp_mmx_scan_init(funcs);
   //}
-
+#ifndef USE_NO_SSE2
   if (cpu_flags & CPU_X86_SSE2) {
     dsp_sse2_scan_init(funcs);
   }
 #endif
+#endif
 }

Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-11 17:56:24 UTC (rev 11560)
@@ -16,7 +16,7 @@
 */
 
 extern unsigned __int64 GetCPUTime();
-#define PERF_DATA_ON
+//#define PERF_DATA_ON
 #ifdef PERF_DATA_ON
 
 

Added: branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/quant_sse2.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/x86_32_vs/quant_sse2.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -0,0 +1,453 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2005                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: quant.c 11442 2006-05-27 17:28:08Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include "codec_internal.h"
+#include "quant_lookup.h"
+
+#include "perf_helper.h"
+
+static unsigned __int64 perf_quant_time;
+static unsigned __int64 perf_quant_min;
+static unsigned __int64 perf_quant_count;
+
+
+void quantize__sse2( PB_INSTANCE *pbi,
+               ogg_int16_t * DCT_block,
+               Q_LIST_ENTRY * quantized_list){
+  /* Quantize the 64 coefficients of DCT_block with pbi's active fquant tables and store them in quantized_list at the positions given by pbi->zigzag_index. */
+#if 0
+  ogg_uint32_t  i;              /* Row index */
+  Q_LIST_ENTRY  val;            /* Quantised value. */
+
+  ogg_int32_t * FquantRoundPtr = pbi->fquant_round;
+  ogg_int32_t * FquantCoeffsPtr = pbi->fquant_coeffs;
+  ogg_int32_t * FquantZBinSizePtr = pbi->fquant_ZbSize;
+  ogg_int16_t * DCT_blockPtr = DCT_block;
+  ogg_uint32_t * ZigZagPtr = (ogg_uint32_t *)pbi->zigzag_index;
+  ogg_int32_t temp;
+
+  PERF_BLOCK_START();
+  /* Set the quantized_list to default to 0 */
+  memset( quantized_list, 0, 64 * sizeof(Q_LIST_ENTRY) );
+
+  /* Note that we add half divisor to effect rounding on positive number */
+  for( i = 0; i < VFRAGPIXELS; i++) {
+    /* Column 0  */
+    if ( DCT_blockPtr[0] >= FquantZBinSizePtr[0] ) {
+      temp = FquantCoeffsPtr[0] * ( DCT_blockPtr[0] + FquantRoundPtr[0] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[0]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[0] <= -FquantZBinSizePtr[0] ) {
+      temp = FquantCoeffsPtr[0] *
+        ( DCT_blockPtr[0] - FquantRoundPtr[0] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[0]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 1 */
+    if ( DCT_blockPtr[1] >= FquantZBinSizePtr[1] ) {
+      temp = FquantCoeffsPtr[1] *
+        ( DCT_blockPtr[1] + FquantRoundPtr[1] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[1]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[1] <= -FquantZBinSizePtr[1] ) {
+      temp = FquantCoeffsPtr[1] *
+        ( DCT_blockPtr[1] - FquantRoundPtr[1] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[1]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 2 */
+    if ( DCT_blockPtr[2] >= FquantZBinSizePtr[2] ) {
+      temp = FquantCoeffsPtr[2] *
+        ( DCT_blockPtr[2] + FquantRoundPtr[2] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[2]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[2] <= -FquantZBinSizePtr[2] ) {
+      temp = FquantCoeffsPtr[2] *
+        ( DCT_blockPtr[2] - FquantRoundPtr[2] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[2]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 3 */
+    if ( DCT_blockPtr[3] >= FquantZBinSizePtr[3] ) {
+      temp = FquantCoeffsPtr[3] *
+        ( DCT_blockPtr[3] + FquantRoundPtr[3] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[3]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[3] <= -FquantZBinSizePtr[3] ) {
+      temp = FquantCoeffsPtr[3] *
+        ( DCT_blockPtr[3] - FquantRoundPtr[3] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[3]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 4 */
+    if ( DCT_blockPtr[4] >= FquantZBinSizePtr[4] ) {
+      temp = FquantCoeffsPtr[4] *
+        ( DCT_blockPtr[4] + FquantRoundPtr[4] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[4]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[4] <= -FquantZBinSizePtr[4] ) {
+      temp = FquantCoeffsPtr[4] *
+        ( DCT_blockPtr[4] - FquantRoundPtr[4] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[4]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 5 */
+    if ( DCT_blockPtr[5] >= FquantZBinSizePtr[5] ) {
+      temp = FquantCoeffsPtr[5] *
+        ( DCT_blockPtr[5] + FquantRoundPtr[5] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[5]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[5] <= -FquantZBinSizePtr[5] ) {
+      temp = FquantCoeffsPtr[5] *
+        ( DCT_blockPtr[5] - FquantRoundPtr[5] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[5]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 6 */
+    if ( DCT_blockPtr[6] >= FquantZBinSizePtr[6] ) {
+      temp = FquantCoeffsPtr[6] *
+        ( DCT_blockPtr[6] + FquantRoundPtr[6] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[6]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[6] <= -FquantZBinSizePtr[6] ) {
+      temp = FquantCoeffsPtr[6] *
+        ( DCT_blockPtr[6] - FquantRoundPtr[6] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[6]] = ( val < -511 ) ? -511 : val;
+    }
+
+    /* Column 7 */
+    if ( DCT_blockPtr[7] >= FquantZBinSizePtr[7] ) {
+      temp = FquantCoeffsPtr[7] *
+        ( DCT_blockPtr[7] + FquantRoundPtr[7] ) ;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[7]] = ( val > 511 ) ? 511 : val;
+    } else if ( DCT_blockPtr[7] <= -FquantZBinSizePtr[7] ) {
+      temp = FquantCoeffsPtr[7] *
+        ( DCT_blockPtr[7] - FquantRoundPtr[7] ) + MIN16;
+      val = (Q_LIST_ENTRY) (temp>>16);
+      quantized_list[ZigZagPtr[7]] = ( val < -511 ) ? -511 : val;
+    }
+
+    FquantRoundPtr += 8;
+    FquantCoeffsPtr += 8;
+    FquantZBinSizePtr += 8;
+    DCT_blockPtr += 8;
+    ZigZagPtr += 8;
+  }
+
+  PERF_BLOCK_END("quantize C", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
+
+#else
+static __declspec(align(16)) unsigned short Some511s[8] = { 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF };
+static unsigned short* Some511sPtr = Some511s;
+
+static __declspec(align(16)) unsigned char temp[128];
+static unsigned char* temp_ptr = temp;
+
+
+  ogg_int32_t * FquantRoundPtr = pbi->fquant_round;     /* read with movdqa below, so must be 16-byte aligned */
+  ogg_int32_t * FquantCoeffsPtr = pbi->fquant_coeffs;   /* read with movdqa below, so must be 16-byte aligned */
+  ogg_int32_t * FquantZBinSizePtr = pbi->fquant_ZbSize; /* read with movdqa below, so must be 16-byte aligned */
+  ogg_int16_t * DCT_blockPtr = DCT_block;
+  ogg_uint32_t * ZigZagPtr = (ogg_uint32_t *)pbi->zigzag_index;
+  /* DCT_block is also read with movdqa, so the caller must pass a 16-byte aligned block. */
+  /* NOTE(review): temp is static, so every call shares one scratch buffer; confirm callers never run concurrently. */
+  PERF_BLOCK_START();
+
+
+  __asm {
+        align           16
+
+        mov     edi, temp_ptr
+        mov     esi, DCT_blockPtr
+        mov     eax, FquantRoundPtr
+        mov     edx, FquantCoeffsPtr
+        mov     ecx, Some511sPtr
+
+        movdqa  xmm7, [ecx]
+        pcmpeqw xmm6, xmm6      /* All 1's */
+        mov     ecx, FquantZBinSizePtr
+
+            push    ebx
+            mov     ebx, 8/* Loop counter */
+
+
+
+        /* Set to 0, might be better to do twice as many modq's than to unaligned write? */
+
+        /* 128 bytes worth of 0's */
+        //movdqu  [edi], xmm0
+        //movdqu  [edi + 16], xmm0
+        //movdqu  [edi + 32], xmm0
+        //movdqu  [edi + 48], xmm0
+        //movdqu  [edi + 64], xmm0
+        //movdqu  [edi + 80], xmm0
+        //movdqu  [edi + 96], xmm0
+        //movdqu  [edi + 112], xmm0
+
+    read_loop_start:
+        pxor    xmm0, xmm0
+
+        /* Read all 8x16 bits of the dct block */
+        movdqa  xmm1, [esi]
+
+
+        /* Load 8x32bits of the rounding values */
+        movdqa      xmm3, [eax]
+        movdqa      xmm4, [eax + 16]
+
+
+        /* Shrink them back to 16 bits */
+        packssdw    xmm3, xmm4
+
+
+        /* Load 8x32bits of the coeff values */
+        movdqa      xmm4, [edx]
+        movdqa      xmm5, [edx + 16]
+
+        /* Shrink the coeffs back to 16 bits */
+        packssdw    xmm4, xmm5
+        /* NOTE(review): packssdw saturates to the signed 16-bit range; confirm the 32-bit table entries always fit. */
+
+        /* Add the rounding to the dct in one register; word-sized lanes, because both operands hold 8 packed 16-bit values */
+        movdqa      xmm2, xmm1
+        paddw       xmm1, xmm3
+
+        /* Subtract in another (again per 16-bit lane) */
+        psubw       xmm2, xmm3
+
+        /* Multiply both sum and diff by the coeffs, keeping only the high word
+            (since in the C code we shift right by 16 bits ) */
+        pmulhw      xmm1, xmm4
+        pmulhw      xmm2, xmm4      /* TODO::: In the subtraction, have to do the round by adding 65535 (the C code adds MIN16 here) */
+
+
+        /* Now need to do the gt checks and mask in the appropriate result */
+            /* Check the summed results to see if any are over 511 */
+
+                /* Duplicate the summed values */
+                movdqa      xmm5, xmm1
+
+                /* Compare each word to 511, any that are >511 will have their word set to all 1s */
+                pcmpgtw     xmm5, xmm7
+
+                /* CHECK INSERTED CODE, to save reloading xmm7, assumes xmm3 was not holding a value at this point */
+                movdqa      xmm3, xmm7
+
+                /* Use the mask created to make a register with 511 in every place the original sum was >511 and 0 elsewhere */
+                pand        xmm3, xmm5
+
+                /* Flip the bits in the mask */
+                pxor        xmm5, xmm6
+
+                /* Now register has all the values that were less than or eq to 511 intact, but every where else is 0 */
+                pand        xmm1, xmm5
+
+                /* Now combine all the vals lt or eq to 511, from the original sums, with the register that has 511
+                        in all the places where the value was >511. This effectively performs an upper bound clipping.
+                        So every value that was >511 is now set to 511 */
+                por         xmm1, xmm3
+
+            /* Now do similar for the differences, to clip any value less than -511 */
+                /* Duplicate the differenced values */
+                movdqa      xmm5, xmm2
+
+
+                /* Subtract the 511s from 0 to get -511's */
+
+                psubw       xmm0, xmm7
+
+                /* See if -511 is greater than the value. If it is that word is all 1's. This is effectively
+                    a check to see if the value is less or equal to -511. Since the operation is the same for the
+                    equal case, it doesn't matter that we check for less or equal rather than just less than */
+
+
+                movdqa      xmm3, xmm0          /* xmm3 is now a duplicate of the -511's */
+                pcmpgtw     xmm0, xmm5
+
+                /* Create a mask on the -511s */
+                pand        xmm3, xmm0
+
+                /* Flip the bits in the mask */
+                pxor        xmm0, xmm6
+
+                /* Mask the values in the difference register */
+                pand        xmm2, xmm0
+
+                /* Combine them together */
+                por         xmm2, xmm3
+
+
+
+        /* By here, xmm1 contains the clipped 8 values of the sum, and xmm2 the clipped 8 values of the difference */
+
+
+        pxor        xmm0, xmm0
+        //mov         eax, FquantZBinSizePtr
+
+        /* Load 8x32bits of the fquantzbin values */
+        movdqa      xmm3, [ecx]
+        movdqa      xmm4, [ecx + 16]
+
+
+        /* Shrink them back to 16 bits */
+        packssdw    xmm3, xmm4
+
+        /* Load the DCT Block values 8x16 bits again */
+        movdqa      xmm4, [esi]
+
+        /* Find -fquantzbin by subtracting it from 0 */
+        psubw       xmm0, xmm3
+
+        /* Check if fquantzbin is greater than the dct_block value. If it's not, then use the summed register value */
+        pcmpgtw     xmm3, xmm4
+
+
+        /* Flip the mask */
+        pxor        xmm3, xmm6
+
+        /* Copy the mask to save for later */
+        movdqa      xmm5, xmm3
+
+        /* And the summed value register */
+        pand        xmm1, xmm3
+
+
+        /* Check if -fquantzbin is greater than dct_block. If it is, use the difference register value. NOTE(review): strict compare, the C code uses <= */
+        pcmpgtw     xmm0, xmm4
+
+        /* Or the mask with the other mask, so we have the combination, of all those using the sum and all those
+                using the difference, everything else will be set to zero later */
+        por         xmm5, xmm0
+
+        /* And the difference register to mask the appropriate values */
+        pand        xmm2, xmm0
+
+        /* Merge together the selected summed and differenced values */
+        por         xmm1, xmm2
+
+        /* Zero out everything that wasn't selected by the sum mask or the difference mask */
+        pand        xmm1, xmm5
+
+
+
+        /* Write these values out to the temp_space, after all 8 loops, 64 x 16 bit values are written,
+            later we can apply the zigzag write all at once */
+        movdqa      [edi], xmm1
+
+
+        /* Increment the pointer */
+        add         edi, 16     /* Temp output by 16 bytes */
+        add         esi, 16     /* DCT_Block by 16 bytes */
+        add         edx, 32     /* Fquantcoeffs by 32 bytes */
+        add         eax, 32     /* fquant round ptr by 32 bytes */
+        add         ecx, 32     /* fquant zbin by 32 bytes */
+
+    /* Update the loop variable */
+    sub         ebx, 1
+    jnz         read_loop_start
+
+
+
+        /* Now read through the temp output space and write using the zigzag pointer values */
+
+
+        mov     edx, quantized_list
+        mov     esi, ZigZagPtr
+        mov     ebx, 8
+
+        /* Put the temp output back to the start of the block */
+        sub     edi, 128
+
+    write_loop_start:
+        mov     ax, WORD PTR [edi]
+        mov     ecx, [esi]
+        mov     WORD PTR [edx + ecx*2], ax
+
+        mov     ax, WORD PTR [edi+2]
+        mov     ecx, [esi+4]
+        mov     WORD PTR [edx + ecx*2], ax
+
+        mov     ax, WORD PTR [edi+4]
+        mov     ecx, [esi+8]
+        mov     WORD PTR [edx + ecx*2], ax
+
+        mov     ax, WORD PTR [edi+6]
+        mov     ecx, [esi+12]
+        mov     WORD PTR [edx + ecx*2], ax
+
+
+        mov     ax, WORD PTR [edi+8]
+        mov     ecx, [esi+16]
+        mov     WORD PTR [edx + ecx*2], ax
+
+        mov     ax, WORD PTR [edi+10]
+        mov     ecx, [esi+20]
+        mov     WORD PTR [edx + ecx*2], ax
+
+        mov     ax, WORD PTR [edi+12]
+        mov     ecx, [esi+24]
+        mov     WORD PTR [edx + ecx*2], ax
+
+        mov     ax, WORD PTR [edi+14]
+        mov     ecx, [esi+28]
+        mov     WORD PTR [edx + ecx*2], ax
+
+            /* Advance all the pointers */
+            add     esi, 32
+            add     edi, 16
+
+    /* Check the loop counter */
+    sub         ebx, 1
+    jnz         write_loop_start
+
+
+    /* Restore ebx */
+    pop         ebx
+
+  }
+
+  PERF_BLOCK_END("quantize sse2", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
+
+#endif
+}
+
+
+/* Point the dispatch table's quantize entry at the SSE2 routine and reset
+   this file's timing counters.  Does nothing when USE_NO_SSE2 is defined. */
+void dsp_sse2_quant_init(DspFunctions *funcs)
+{
+#ifndef USE_NO_SSE2
+  TH_DEBUG("enabling accelerated x86_32 sse2 quant functions.\n");
+  funcs->quantize = quantize__sse2;
+
+  /* Timing statistics start from scratch; the minimum begins at the largest
+     unsigned value (-1 converted to unsigned __int64). */
+  perf_quant_count = 0;
+  perf_quant_time = 0;
+  perf_quant_min = (unsigned __int64)-1;
+#endif
+}
\ No newline at end of file

Modified: branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/scan_sse2.c	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/x86_32_vs/scan_sse2.c	2006-06-11 17:56:24 UTC (rev 11560)
@@ -929,8 +929,5 @@
 {
   TH_DEBUG("enabling accelerated x86_32 sse2 scan functions.\n");
   funcs->RowDiffScan = RowDiffScan__sse2;
-  //funcs->copy8x8 = copy8x8__sse2;
-  //funcs->recon_intra8x8 = recon_intra8x8__sse2;
-  //funcs->recon_inter8x8 = recon_inter8x8__sse2;
-  //funcs->recon_inter8x8_half = recon_inter8x8_half__sse2;
+
 }
\ No newline at end of file

Modified: branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj	2006-06-11 17:56:24 UTC (rev 11560)
@@ -119,13 +119,14 @@
 				InlineFunctionExpansion="2"
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="1"
+				WholeProgramOptimization="false"
 				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\..\libvorbis\include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="0"
 				UsePrecompiledHeader="0"
 				WarningLevel="4"
 				Detect64BitPortabilityProblems="true"
-				DebugInformationFormat="0"
+				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
@@ -139,7 +140,7 @@
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="1"
-				GenerateDebugInformation="false"
+				GenerateDebugInformation="true"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-11 17:56:24 UTC (rev 11560)
@@ -48,7 +48,7 @@
 				RuntimeLibrary="1"
 				EnableEnhancedInstructionSet="0"
 				UsePrecompiledHeader="0"
-				WarningLevel="3"
+				WarningLevel="4"
 				Detect64BitPortabilityProblems="true"
 				DebugInformationFormat="3"
 				DisableSpecificWarnings="4996"
@@ -129,6 +129,7 @@
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="1"
 				OmitFramePointers="true"
+				WholeProgramOptimization="false"
 				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
 				StringPooling="true"
@@ -157,7 +158,7 @@
 				OutputFile="$(OutDir)/libtheora.dll"
 				LinkIncremental="1"
 				ModuleDefinitionFile="..\..\libtheora.def"
-				GenerateDebugInformation="false"
+				GenerateDebugInformation="true"
 				SubSystem="2"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
@@ -490,6 +491,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\quant_sse2.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\x86_32_vs\recon_mmx.c"
 				>
 			</File>