[xiph-commits] r12826 - in trunk/theora: . lib lib/x86_32 lib/x86_64

Mon Apr 2 14:08:39 PDT 2007

Author: j
Date: 2007-04-02 14:08:28 -0700 (Mon, 02 Apr 2007)
New Revision: 12826

Added:
   trunk/theora/lib/x86_32/dct_decode_mmx.c
   trunk/theora/lib/x86_32/idct_mmx.c
   trunk/theora/lib/x86_64/dct_decode_mmx.c
   trunk/theora/lib/x86_64/idct_mmx.c
Modified:
   trunk/theora/configure.ac
   trunk/theora/lib/Makefile.am
   trunk/theora/lib/codec_internal.h
   trunk/theora/lib/cpu.h
   trunk/theora/lib/dct.c
   trunk/theora/lib/dct_decode.c
   trunk/theora/lib/dsp.c
   trunk/theora/lib/dsp.h
   trunk/theora/lib/idct.c
   trunk/theora/lib/mcomp.c
   trunk/theora/lib/reconstruct.c
Log:
* MMX loop filter based on Rudolf Marek's patch in
http://lists.xiph.org/pipermail/theora-dev/2005-August/002838.html
* MMX IDCT based on Rudolf Marek's patch in
http://lists.xiph.org/pipermail/theora-dev/2005-July/002816.html
and the code in http://svn.xiph.org/trunk/vp32/CoreLibs/CDXV/Vp31/dx/win32/
* change FiltBoundingValue to ogg_int16_t and reduce it to 256
entries. (It is safe if I read the spec correctly)
* comment out unused idct_short__c, remove unused LoopFilterLimitValuesV2

based on patch by Chih-Chung Chang <jochang <at> gmail.com>



Modified: trunk/theora/configure.ac
===================================================================

--- trunk/theora/configure.ac	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/configure.ac	2007-04-02 21:08:28 UTC (rev 12826)
@@ -24,7 +24,7 @@
 AC_SUBST(V_LIB_AGE)
 
 dnl Extra linker options (for version script)
-SHLIB_VERSION_ARG=""
+THEORA_LDFLAGS=""
 
 dnl --------------------------------------------------  
 dnl Check for programs
@@ -115,7 +115,10 @@
 		cpu_x86_32=yes 
 		cpu_optimization="32 bit x86"
 		AC_DEFINE([USE_ASM], [],  [make use of asm optimization])
-    	;;
+		if test "x$target_vendor" = "xapple"; then
+			THEORA_LDFLAGS="$THEORA_LDFLAGS  -Wl,-read_only_relocs,suppress"
+		fi
+		;;
 	x86_64)
 		cpu_x86_64=yes
 		cpu_optimization="64 bit x86"
@@ -125,6 +128,7 @@
 else
   cpu_optimization="disabled"
 fi
+
 AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
 AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
 
@@ -132,19 +136,20 @@
 AC_PROG_LD
 AC_PROG_LD_GNU
 if test "x$lt_cv_prog_gnu_ld" = "xyes"; then
-   SHLIB_VERSION_ARG="Wl,--version-script=Version_script"
+	SHLIB_VERSION_ARG="-Wl,--version-script=Version_script"
 
-   dnl Set extra linker options
-   case "$target_os" in
+	dnl Set extra linker options
+	case "$target_os" in
 	linux* | solaris* )
 		SHLIB_VERSION_ARG="-Wl,--version-script=Version_script"
 		;;
 	*)
 		;;
-   esac
+	esac
+	THEORA_LDFLAGS="$THEORA_LDFLAGS SHLIB_VERSION_ARG"
 fi
 
-AC_SUBST(SHLIB_VERSION_ARG)
+AC_SUBST(THEORA_LDFLAGS)
 
 dnl --------------------------------------------------
 dnl Checks for support libraries and headers

Modified: trunk/theora/lib/Makefile.am
===================================================================
--- trunk/theora/lib/Makefile.am	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/Makefile.am	2007-04-02 21:08:28 UTC (rev 12826)
@@ -1,14 +1,18 @@
 INCLUDES = -I$(top_srcdir)/include
 
 EXTRA_DIST = Version_script.in \
+	x86_32/dct_decode_mmx.c \
 	x86_32/dsp_mmx.c \
 	x86_32/dsp_mmxext.c \
 	x86_32/recon_mmx.c \
 	x86_32/fdct_mmx.c \
+	x86_32/idct_mmx.c \
+	x86_64/dct_decode_mmx.c \
 	x86_64/dsp_mmx.c \
 	x86_64/dsp_mmxext.c \
 	x86_64/recon_mmx.c \
 	x86_64/fdct_mmx.c \
+	x86_64/idct_mmx.c \
 	x86_32_vs/dsp_mmx.c \
 	x86_32_vs/fdct_mmx.c \
 	x86_32_vs/recon_mmx.c \
@@ -28,18 +32,22 @@
 if CPU_x86_64
 arch_dir = x86_64
 arch_sources= \
+	$(arch_dir)/dct_decode_mmx.c \
 	$(arch_dir)/dsp_mmx.c \
 	$(arch_dir)/dsp_mmxext.c \
 	$(arch_dir)/recon_mmx.c \
-	$(arch_dir)/fdct_mmx.c
+	$(arch_dir)/fdct_mmx.c \
+	$(arch_dir)/idct_mmx.c
 else
 if CPU_x86_32
 arch_dir = x86_32
 arch_sources= \
+	$(arch_dir)/dct_decode_mmx.c \
 	$(arch_dir)/dsp_mmx.c \
 	$(arch_dir)/dsp_mmxext.c \
 	$(arch_dir)/recon_mmx.c \
-	$(arch_dir)/fdct_mmx.c
+	$(arch_dir)/fdct_mmx.c \
+	$(arch_dir)/idct_mmx.c
 endif
 endif
 
@@ -81,7 +89,7 @@
 	dsp.h
 
 libtheora_la_CFLAGS = $(OGG_CFLAGS)
-libtheora_la_LDFLAGS = -version-info @V_LIB_CURRENT@:@V_LIB_REVISION@:@V_LIB_AGE@ @SHLIB_VERSION_ARG@
+libtheora_la_LDFLAGS = -version-info @V_LIB_CURRENT@:@V_LIB_REVISION@:@V_LIB_AGE@ @THEORA_LDFLAGS@
 libtheora_la_LIBADD = $(OGG_LIBS)
 
 debug:

Modified: trunk/theora/lib/codec_internal.h
===================================================================
--- trunk/theora/lib/codec_internal.h	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/codec_internal.h	2007-04-02 21:08:28 UTC (rev 12826)
@@ -453,7 +453,7 @@
 
   /* Loop filter bounding values */
   unsigned char  LoopFilterLimits[Q_TABLE_SIZE];
-  ogg_int32_t    FiltBoundingValue[512];
+  ogg_int16_t    FiltBoundingValue[256];
 
   /* Dequantiser and rounding tables */
   ogg_uint32_t   QThreshTable[Q_TABLE_SIZE];
@@ -706,15 +706,6 @@
 extern void InitPBInstance(PB_INSTANCE *pbi);
 extern void ClearPBInstance(PB_INSTANCE *pbi);
 
-
-extern void IDctSlow(  Q_LIST_ENTRY * InputData,
-                       ogg_int16_t *QuantMatrix,
-                       ogg_int16_t * OutputData ) ;
-
-extern void IDct10( Q_LIST_ENTRY * InputData,
-                    ogg_int16_t *QuantMatrix,
-                    ogg_int16_t * OutputData );
-
 extern void IDct1( Q_LIST_ENTRY * InputData,
                    ogg_int16_t *QuantMatrix,
                    ogg_int16_t * OutputData );

Modified: trunk/theora/lib/cpu.h
===================================================================
--- trunk/theora/lib/cpu.h	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/cpu.h	2007-04-02 21:08:28 UTC (rev 12826)
@@ -14,6 +14,8 @@
   last mod: $Id$
 
  ********************************************************************/
+#ifndef CPU_H
+#define CPU_H
 
 #include "codec_internal.h"
 
@@ -28,3 +30,4 @@
 
 ogg_uint32_t cpu_init (void);
 
+#endif

Modified: trunk/theora/lib/dct.c
===================================================================
--- trunk/theora/lib/dct.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/dct.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -16,8 +16,6 @@
  ********************************************************************/
 
 #include "codec_internal.h"
-#include "dsp.h"
-#include "cpu.h"
 
 static ogg_int32_t xC1S7 = 64277;
 static ogg_int32_t xC2S6 = 60547;
@@ -257,6 +255,8 @@
 void dsp_dct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
 {
   funcs->fdct_short = fdct_short__c;
+  dsp_dct_decode_init(funcs, cpu_flags);
+  dsp_idct_init(funcs, cpu_flags);
 #if defined(USE_ASM)
   if (cpu_flags & CPU_X86_MMX) {
     dsp_mmx_fdct_init(funcs);

Modified: trunk/theora/lib/dct_decode.c
===================================================================
--- trunk/theora/lib/dct_decode.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/dct_decode.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -18,7 +18,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include "codec_internal.h"
-#include "dsp.h"
 
 
 #define GOLDEN_FRAME_THRESH_Q   50
@@ -28,7 +27,7 @@
 #define PL 1
 #define HIGHBITDUPPED(X) (((signed short) X)  >> 15)
 
-/* in-loop filter tables. one of these is used in dct_decode.c */
+/* in-loop filter tables. */
 
 static const unsigned char LoopFilterLimitValuesV1[Q_TABLE_SIZE] = {
   30, 25, 20, 20, 15, 15, 14, 14,
@@ -41,27 +40,16 @@
   0,  0,  0,  0,  0,  0,  0,  0
 };
 
-static const unsigned char LoopFilterLimitValuesV2[Q_TABLE_SIZE] = {
-  30, 25, 20, 20, 15, 15, 14, 14,
-  13, 13, 12, 12, 11, 11, 10, 10,
-  9,  9,  8,  8,  7,  7,  7,  7,
-  6,  6,  6,  6,  5,  5,  5,  5,
-  4,  4,  4,  4,  3,  3,  3,  3,
-  2,  2,  2,  2,  2,  2,  2,  2,
-  2,  2,  2,  2,  2,  2,  2,  2,
-  1,  1,  1,  1,  1,  1,  1,  1
-};
-
 static const int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
 
 static void SetupBoundingValueArray_Generic(PB_INSTANCE *pbi,
                                             ogg_int32_t FLimit){
 
-  ogg_int32_t * BoundingValuePtr = pbi->FiltBoundingValue+256;
+  ogg_int16_t * BoundingValuePtr = pbi->FiltBoundingValue+127;
   ogg_int32_t i;
 
   /* Set up the bounding value array. */
-  memset ( pbi->FiltBoundingValue, 0, (512*sizeof(*pbi->FiltBoundingValue)) );
+  memset ( pbi->FiltBoundingValue, 0, (256*sizeof(*pbi->FiltBoundingValue)) );
   for ( i = 0; i < FLimit; i++ ){
     BoundingValuePtr[-i-FLimit] = (-FLimit+i);
     BoundingValuePtr[-i] = -i;
@@ -107,8 +95,6 @@
 void SetupLoopFilter(PB_INSTANCE *pbi){
   ogg_int32_t FLimit;
 
-  /* nb: this was using the V2 values rather than V1
-     we think is was a mistake; the results were not used */
   FLimit = pbi->LoopFilterLimits[pbi->FrameQIndex];
   SetupBoundingValueArray_Generic(pbi, FLimit);
 }
@@ -137,11 +123,14 @@
   case 0:case 1:
     IDct1( pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
     break;
-  case 2: case 3:case 4:case 5:case 6:case 7:case 8: case 9:case 10:
-    IDct10( pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
+  case 2: case 3:
+    dsp_IDct3(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
     break;
+  case 4:case 5:case 6:case 7:case 8: case 9:case 10:
+    dsp_IDct10(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
+    break;
   default:
-    IDctSlow( pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
+    dsp_IDctSlow(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
   }
 
   /* Convert fragment number to a pixel offset in a reconstruction buffer. */
@@ -217,11 +206,14 @@
   case 0:case 1:
     IDct1( pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
     break;
-  case 2: case 3:case 4:case 5:case 6:case 7:case 8: case 9:case 10:
-    IDct10( pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
+  case 2: case 3:
+    dsp_IDct3(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
     break;
+  case 4:case 5:case 6:case 7:case 8: case 9:case 10:
+    dsp_IDct10(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
+    break;
   default:
-    IDctSlow( pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
+    dsp_IDctSlow(pbi->dsp, pbi->quantized_list, pbi->dequant_coeffs, pbi->ReconDataBuffer );
   }
 
   /* Convert fragment number to a pixel offset in a reconstruction buffer. */
@@ -283,7 +275,7 @@
       /* Reconstruct the pixel dats from the reference frame and change data
          (no half pixel in this case as the two references were the same. */
       dsp_recon_inter8x8 (pbi->dsp,
-		  &pbi->ThisFrameRecon[ReconPixelIndex],
+          &pbi->ThisFrameRecon[ReconPixelIndex],
                   LastFrameRecPtr, pbi->ReconDataBuffer,
                   ReconPixelsPerLine);
     }else{
@@ -673,9 +665,9 @@
   }
 }
 
-static void FilterHoriz(unsigned char * PixelPtr,
+static void FilterHoriz__c(unsigned char * PixelPtr,
                         ogg_int32_t LineLength,
-                        ogg_int32_t *BoundingValuePtr){
+                        ogg_int16_t *BoundingValuePtr){
   ogg_int32_t j;
   ogg_int32_t FiltVal;
 
@@ -695,18 +687,16 @@
   }
 }
 
-static void FilterVert(unsigned char * PixelPtr,
+static void FilterVert__c(unsigned char * PixelPtr,
                 ogg_int32_t LineLength,
-                ogg_int32_t *BoundingValuePtr){
+                ogg_int16_t *BoundingValuePtr){
   ogg_int32_t j;
   ogg_int32_t FiltVal;
-
+  PixelPtr -= 2*LineLength;
   /* the math was correct, but negative array indicies are forbidden
      by ANSI/C99 and will break optimization on several modern
      compilers */
 
-  PixelPtr -= 2*LineLength;
-
   for ( j = 0; j < 8; j++ ) {
     FiltVal = ( (ogg_int32_t)PixelPtr[0] ) -
       ( (ogg_int32_t)PixelPtr[LineLength] * 3 ) +
@@ -725,7 +715,7 @@
 void LoopFilter(PB_INSTANCE *pbi){
   ogg_int32_t i;
 
-  ogg_int32_t * BoundingValuePtr=pbi->FiltBoundingValue+256;
+  ogg_int16_t * BoundingValuePtr=pbi->FiltBoundingValue+127;
   int FragsAcross=pbi->HFragments;
   int FromFragment,ToFragment;
   int FragsDown = pbi->VFragments;
@@ -790,14 +780,14 @@
       /* Filter right hand border only if the block to the right is
          not coded */
       if ( !pbi->display_fragments[ i + 1 ] ){
-        FilterHoriz(pbi->LastFrameRecon+
+        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                     pbi->recon_pixel_index_table[i]+6,
                     LineLength,BoundingValuePtr);
       }
 
       /* Bottom done if next row set */
       if( !pbi->display_fragments[ i + LineFragments] ){
-        FilterVert(pbi->LastFrameRecon+
+        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                    pbi->recon_pixel_index_table[i+LineFragments],
                    LineLength, BoundingValuePtr);
       }
@@ -809,21 +799,21 @@
     for ( n = 1 ; n < FragsAcross - 1 ; n++, i++) {
       if( pbi->display_fragments[i]){
         /* Filter Left edge always */
-        FilterHoriz(pbi->LastFrameRecon+
+        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                     pbi->recon_pixel_index_table[i]-2,
                     LineLength, BoundingValuePtr);
 
         /* Filter right hand border only if the block to the right is
            not coded */
         if ( !pbi->display_fragments[ i + 1 ] ){
-          FilterHoriz(pbi->LastFrameRecon+
+          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                       pbi->recon_pixel_index_table[i]+6,
                       LineLength, BoundingValuePtr);
         }
 
         /* Bottom done if next row set */
         if( !pbi->display_fragments[ i + LineFragments] ){
-          FilterVert(pbi->LastFrameRecon+
+          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                      pbi->recon_pixel_index_table[i + LineFragments],
                      LineLength, BoundingValuePtr);
         }
@@ -835,13 +825,13 @@
     /* Last Column */
     if( pbi->display_fragments[i]){
       /* Filter Left edge always */
-      FilterHoriz(pbi->LastFrameRecon+
+      dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                   pbi->recon_pixel_index_table[i] - 2 ,
                   LineLength, BoundingValuePtr);
 
       /* Bottom done if next row set */
       if( !pbi->display_fragments[ i + LineFragments] ){
-        FilterVert(pbi->LastFrameRecon+
+        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                    pbi->recon_pixel_index_table[i + LineFragments],
                    LineLength, BoundingValuePtr);
       }
@@ -859,21 +849,21 @@
          all fragments are intra */
       if( pbi->display_fragments[i]){
         /* TopRow is always done */
-        FilterVert(pbi->LastFrameRecon+
+        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                    pbi->recon_pixel_index_table[i],
                    LineLength, BoundingValuePtr);
 
         /* Filter right hand border only if the block to the right is
            not coded */
         if ( !pbi->display_fragments[ i + 1 ] ){
-          FilterHoriz(pbi->LastFrameRecon+
+          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                       pbi->recon_pixel_index_table[i] + 6,
                       LineLength, BoundingValuePtr);
         }
 
         /* Bottom done if next row set */
         if( !pbi->display_fragments[ i + LineFragments] ){
-          FilterVert(pbi->LastFrameRecon+
+          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                      pbi->recon_pixel_index_table[i + LineFragments],
                      LineLength, BoundingValuePtr);
         }
@@ -885,26 +875,26 @@
       for ( n = 1 ; n < FragsAcross - 1 ; n++, i++){
         if( pbi->display_fragments[i]){
           /* Filter Left edge always */
-          FilterHoriz(pbi->LastFrameRecon+
+          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                       pbi->recon_pixel_index_table[i] - 2,
                       LineLength, BoundingValuePtr);
 
           /* TopRow is always done */
-          FilterVert(pbi->LastFrameRecon+
+          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                      pbi->recon_pixel_index_table[i],
                      LineLength, BoundingValuePtr);
 
           /* Filter right hand border only if the block to the right
              is not coded */
           if ( !pbi->display_fragments[ i + 1 ] ){
-            FilterHoriz(pbi->LastFrameRecon+
+            dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                         pbi->recon_pixel_index_table[i] + 6,
                         LineLength, BoundingValuePtr);
           }
 
           /* Bottom done if next row set */
           if( !pbi->display_fragments[ i + LineFragments] ){
-            FilterVert(pbi->LastFrameRecon+
+            dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                        pbi->recon_pixel_index_table[i + LineFragments],
                        LineLength, BoundingValuePtr);
           }
@@ -915,18 +905,18 @@
       /* Last Column */
       if( pbi->display_fragments[i]){
         /* Filter Left edge always*/
-        FilterHoriz(pbi->LastFrameRecon+
+        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                     pbi->recon_pixel_index_table[i] - 2,
                     LineLength, BoundingValuePtr);
 
         /* TopRow is always done */
-        FilterVert(pbi->LastFrameRecon+
+        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                    pbi->recon_pixel_index_table[i],
                    LineLength, BoundingValuePtr);
 
         /* Bottom done if next row set */
         if( !pbi->display_fragments[ i + LineFragments] ){
-          FilterVert(pbi->LastFrameRecon+
+          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                      pbi->recon_pixel_index_table[i + LineFragments],
                      LineLength, BoundingValuePtr);
         }
@@ -944,14 +934,14 @@
     if( pbi->display_fragments[i]){
 
       /* TopRow is always done */
-      FilterVert(pbi->LastFrameRecon+
+      dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                  pbi->recon_pixel_index_table[i],
                  LineLength, BoundingValuePtr);
 
       /* Filter right hand border only if the block to the right is
          not coded */
       if ( !pbi->display_fragments[ i + 1 ] ){
-        FilterHoriz(pbi->LastFrameRecon+
+        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                     pbi->recon_pixel_index_table[i] + 6,
                     LineLength, BoundingValuePtr);
       }
@@ -963,19 +953,19 @@
     for ( n = 1 ; n < FragsAcross - 1 ; n++, i++){
       if( pbi->display_fragments[i]){
         /* Filter Left edge always */
-        FilterHoriz(pbi->LastFrameRecon+
+        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                     pbi->recon_pixel_index_table[i] - 2,
                     LineLength, BoundingValuePtr);
 
         /* TopRow is always done */
-        FilterVert(pbi->LastFrameRecon+
+        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                    pbi->recon_pixel_index_table[i],
                    LineLength, BoundingValuePtr);
 
         /* Filter right hand border only if the block to the right is
            not coded */
         if ( !pbi->display_fragments[ i + 1 ] ){
-          FilterHoriz(pbi->LastFrameRecon+
+          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                       pbi->recon_pixel_index_table[i] + 6,
                       LineLength, BoundingValuePtr);
         }
@@ -986,12 +976,12 @@
     /* Last Column */
     if( pbi->display_fragments[i]){
       /* Filter Left edge always */
-      FilterHoriz(pbi->LastFrameRecon+
+      dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
                   pbi->recon_pixel_index_table[i] - 2,
                   LineLength, BoundingValuePtr);
 
       /* TopRow is always done */
-      FilterVert(pbi->LastFrameRecon+
+      dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
                  pbi->recon_pixel_index_table[i],
                  LineLength, BoundingValuePtr);
 
@@ -1222,3 +1212,14 @@
     UpdateUMVBorder(pbi, pbi->GoldenFrame);
   }
 }
+
+void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+  funcs->FilterVert = FilterVert__c;
+  funcs->FilterHoriz = FilterHoriz__c;
+#if defined(USE_ASM)
+  if (cpu_flags & CPU_X86_MMX) {
+    dsp_mmx_dct_decode_init(funcs);
+  }
+#endif
+}

Modified: trunk/theora/lib/dsp.c
===================================================================
--- trunk/theora/lib/dsp.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/dsp.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -16,8 +16,6 @@
  ********************************************************************/
 
 #include <stdlib.h>
-#include "cpu.h"
-#include "dsp.h"
 #include "codec_internal.h"
 
 #define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)

Modified: trunk/theora/lib/dsp.h
===================================================================
--- trunk/theora/lib/dsp.h	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/dsp.h	2007-04-02 21:08:28 UTC (rev 12826)
@@ -18,7 +18,8 @@
 #ifndef DSP_H
 #define DSP_H
 
-#include <theora/theora.h>
+#include "theora/theora.h"
+#include "cpu.h"
 
 typedef struct
 {
@@ -77,10 +78,27 @@
   ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
 		                 unsigned char *RefDataPtr1,
 			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
+			         
+  void (*FilterHoriz) (unsigned char * PixelPtr,
+                ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
+
+  void (*FilterVert) (unsigned char * PixelPtr,
+                 ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
+
+   void (*IDctSlow) (ogg_int16_t *InputData, 
+                  ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
+
+    void (*IDct3) (ogg_int16_t *InputData, 
+                   ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
+                   
+    void (*IDct10) (ogg_int16_t *InputData, 
+                  ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
 } DspFunctions;
 
 extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
 extern void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
+extern void dsp_dct_decode_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
+extern void dsp_idct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
 
 void dsp_init(DspFunctions *funcs);
 void dsp_static_init(DspFunctions *funcs);
@@ -89,6 +107,8 @@
 extern void dsp_mmxext_init(DspFunctions *funcs);
 extern void dsp_mmx_fdct_init(DspFunctions *funcs);
 extern void dsp_mmx_recon_init(DspFunctions *funcs);
+extern void dsp_mmx_dct_decode_init(DspFunctions *funcs);
+extern void dsp_mmx_idct_init(DspFunctions *funcs);
 #endif
 
 #define dsp_save_fpu(funcs) (funcs.save_fpu ())
@@ -132,5 +152,19 @@
 #define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \
 	(funcs.inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2))
 
+#define dsp_FilterHoriz(funcs, ptr1, ptr2, ptr3) \
+  (funcs.FilterHoriz(ptr1, ptr2, ptr3))
 
+#define dsp_FilterVert(funcs, ptr1, ptr2, ptr3) \
+  (funcs.FilterVert(ptr1, ptr2, ptr3))
+
+#define dsp_IDctSlow(funcs, ptr1, ptr2, ptr3) \
+    (funcs.IDctSlow(ptr1, ptr2, ptr3))
+
+#define dsp_IDct3(funcs, ptr1, ptr2, ptr3) \
+    (funcs.IDctSlow(ptr1, ptr2, ptr3))
+
+#define dsp_IDct10(funcs, ptr1, ptr2, ptr3) \
+   (funcs.IDctSlow(ptr1, ptr2, ptr3))
+
 #endif /* DSP_H */

Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/idct.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -17,10 +17,10 @@
 
 #include <string.h>
 #include "codec_internal.h"
+
 #include "quant_lookup.h"
 
 #define IdctAdjustBeforeShift 8
-
 /* cos(n*pi/16) or sin(8-n)*pi/16) */
 #define xC1S7 64277
 #define xC2S6 60547
@@ -31,6 +31,7 @@
 #define xC7S1 12785
 
 /* compute the 16 bit signed 1D inverse DCT - spec version */
+/*
 static void idct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ) {
   ogg_int32_t t[8], r;
   ogg_int16_t *y = InputData;
@@ -108,6 +109,7 @@
   x[7] = r;
 
 }
+*/
 
 static void dequant_slow( ogg_int16_t * dequant_coeffs,
                    ogg_int16_t * quantized_list,
@@ -119,7 +121,7 @@
 
 
 
-void IDctSlow(  Q_LIST_ENTRY * InputData,
+void IDctSlow__c(  Q_LIST_ENTRY * InputData,
                 ogg_int16_t *QuantMatrix,
                 ogg_int16_t * OutputData ) {
   ogg_int32_t IntermediateData[64];
@@ -348,7 +350,7 @@
 
 }
 
-void IDct10( Q_LIST_ENTRY * InputData,
+void IDct10__c( Q_LIST_ENTRY * InputData,
              ogg_int16_t *QuantMatrix,
              ogg_int16_t * OutputData ){
   ogg_int32_t IntermediateData[64];
@@ -553,3 +555,15 @@
     OutputData[loop]=OutD;
 
 }
+
+void dsp_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+  funcs->IDctSlow = IDctSlow__c;
+  funcs->IDct10 = IDct10__c;
+  funcs->IDct3 = IDct10__c;
+#if defined(USE_ASM)
+  if (cpu_flags & CPU_X86_MMX) {
+    dsp_mmx_idct_init(funcs);
+  }
+#endif
+}

Modified: trunk/theora/lib/mcomp.c
===================================================================
--- trunk/theora/lib/mcomp.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/mcomp.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -17,7 +17,6 @@
 
 #include <stdlib.h>
 #include <stdio.h>
-#include "dsp.h"
 #include "codec_internal.h"
 
 /* Initialises motion compentsation. */

Modified: trunk/theora/lib/reconstruct.c
===================================================================
--- trunk/theora/lib/reconstruct.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/reconstruct.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -16,8 +16,6 @@
  ********************************************************************/
 
 #include "codec_internal.h"
-#include "dsp.h"
-#include "cpu.h"
 
 static void copy8x8__c (unsigned char *src,
 	                unsigned char *dest,

Added: trunk/theora/lib/x86_32/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/dct_decode_mmx.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/x86_32/dct_decode_mmx.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -0,0 +1,184 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL;
+
+#if defined(__APPLE__)
+#define MANGLE(x) "_"#x
+#else
+#define MANGLE(x) #x
+#endif
+
+static void FilterHoriz__mmx(unsigned char * PixelPtr,
+                        ogg_int32_t LineLength,
+                        ogg_int16_t *BoundingValuePtr){
+
+#define OC_LOOP_H_4x4                                                   \
+    __asm__ __volatile__(                                               \
+    "lea (%1,%1,2),%%esi\n"     /* esi = ystride*3 */                   \
+    "movd (%0), %%mm0\n"        /* 0 0 0 0 3 2 1 0 */                   \
+    "movd (%0,%1),%%mm1\n"      /* 0 0 0 0 7 6 5 4 */                   \
+    "movd (%0,%1,2),%%mm2\n"    /* 0 0 0 0 b a 9 8 */                   \
+    "movd (%0,%%esi),%%mm3\n"   /* 0 0 0 0 f e d c */                   \
+    "punpcklbw %%mm1,%%mm0\n"   /* mm0 = 7 3 6 2 5 1 4 0 */             \
+    "punpcklbw %%mm3,%%mm2\n"   /* mm2 = f b e a d 9 c 8 */             \
+    "movq %%mm0,%%mm1\n"        /* mm1 = 7 3 6 2 5 1 4 0 */             \
+    "punpcklwd %%mm2,%%mm1\n"   /* mm1 = d 9 5 1 c 8 4 0 */             \
+    "punpckhwd %%mm2,%%mm0\n"   /* mm0 = f b 7 3 e a 6 2 */             \
+    "pxor %%mm7,%%mm7\n"                                                \
+    "movq %%mm1,%%mm5\n"        /* mm5 = d 9 5 1 c 8 4 0 */             \
+    "punpckhbw %%mm7,%%mm5\n"   /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/     \
+    "punpcklbw %%mm7,%%mm1\n"   /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/     \
+    "movq %%mm0,%%mm3\n"        /* mm3 = f b 7 3 e a 6 2 */             \
+    "punpckhbw %%mm7,%%mm3\n"   /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/     \
+    "punpcklbw %%mm7,%%mm0\n"       /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
+                                                                        \
+    "psubw %%mm3,%%mm1\n"       /* mm1 = pix[0]-pix[3] mm1 - mm3 */     \
+    "movq %%mm0,%%mm7\n"        /* mm7 = pix[2]*/                       \
+    "psubw %%mm5,%%mm0\n"       /* mm0 = pix[2]-pix[1] mm0 - mm5*/      \
+    "PMULLW "MANGLE(V3)",%%mm0\n" /* *3 */                              \
+    "paddw %%mm0,%%mm1\n"         /* mm1 has f[0] ... f[4]*/            \
+    "paddw "MANGLE(V804)",%%mm1\n"/* add 4 */ /* add 256 after shift */ \
+    "psraw $3,%%mm1\n"          /* >>3 */                               \
+    " pextrw $0,%%mm1,%%esi\n"  /* In MM1 we have 4 f coefs (16bits) */ \
+    " pextrw $1,%%mm1,%%edi\n"  /* now perform MM4 = *(_bv+ f) */       \
+    " pinsrw $0,(%2,%%esi,2),%%mm4\n"                                   \
+    " pextrw $2,%%mm1,%%esi\n"                                          \
+    " pinsrw $1,(%2,%%edi,2),%%mm4\n"                                   \
+    " pextrw $3,%%mm1,%%edi\n"                                          \
+    " pinsrw $2,(%2,%%esi,2),%%mm4\n"                                   \
+    " pinsrw $3,(%2,%%edi,2),%%mm4\n" /* new f vals loaded */           \
+    "pxor %%mm0,%%mm0\n"                                                \
+    " paddw %%mm4,%%mm5\n"      /*(pix[1]+f);*/                         \
+    " psubw %%mm4,%%mm7\n"      /* (pix[2]-f); */                       \
+    " packuswb %%mm0,%%mm5\n"   /* mm5 = x x x x newpix1 */             \
+    " packuswb %%mm0,%%mm7\n"   /* mm7 = x x x x newpix2 */             \
+    " punpcklbw %%mm7,%%mm5\n"  /* 2 1 2 1 2 1 2 1 */                   \
+    " movd %%mm5,%%eax\n"       /* eax = newpix21 */                    \
+    " movw %%ax,1(%0)\n"                                                \
+    " psrlq $32,%%mm5\n"        /* why is so big stall here ? */        \
+    " shrl $16,%%eax\n"                                                 \
+    " lea 1(%0,%1,2),%%edi\n"                                           \
+    " movw %%ax,1(%0,%1,1)\n"                                           \
+    " movd %%mm5,%%eax\n"       /* eax = newpix21 high part */          \
+    " lea (%1,%1,2),%%esi\n"                                            \
+    " movw %%ax,(%%edi)\n"                                              \
+    " shrl $16,%%eax\n"                                                 \
+    " movw %%ax,1(%0,%%esi)\n"                                          \
+    :                                                                   \
+    : "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256)      \
+    : "esi", "edi" , "memory", "eax"                                    \
+    );
+
+    OC_LOOP_H_4x4
+    PixelPtr += LineLength*4;
+    OC_LOOP_H_4x4
+    __asm__ __volatile__("emms\n");
+}
+
+static void FilterVert__mmx(unsigned char * PixelPtr,
+                ogg_int32_t LineLength,
+                ogg_int16_t *BoundingValuePtr){
+    __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
+    "movq (%0),%%mm7\n"         /* mm7 = pix[0..7] */
+    "lea (%1,%1,2),%%esi\n"     /* esi = ystride*3 */
+    "movq (%0,%%esi),%%mm4\n"   /* mm4 = pix[0..7+ystride*3] */
+    "movq %%mm7,%%mm6\n"        /* mm6 = pix[0..7] */
+    "punpcklbw %%mm0,%%mm6\n"   /* expand unsigned pix[0..3] to 16 bits */
+    "movq %%mm4,%%mm5\n"
+    "punpckhbw %%mm0,%%mm7\n"   /* expand unsigned pix[4..7] to 16 bits */
+    "punpcklbw %%mm0,%%mm4\n"   /* expand other arrays too */
+    "punpckhbw %%mm0,%%mm5\n"
+    "psubw %%mm4,%%mm6\n"       /* mm6 = mm6 - mm4 */
+    "psubw %%mm5,%%mm7\n"       /* mm7 = mm7 - mm5 */
+                /* mm7:mm6 = _p[0]-_p[ystride*3] */
+    "movq (%0,%1),%%mm4\n"      /* mm4 = pix[0..7+ystride] */
+    "movq %%mm4,%%mm5\n"
+    "movq (%0,%1,2),%%mm2\n"    /* mm2 = pix[0..7+ystride*2] */
+    "movq %%mm2,%%mm3\n"
+    "movq %%mm2,%%mm1\n"        //ystride*2
+    "punpckhbw %%mm0,%%mm5\n"
+    "punpcklbw %%mm0,%%mm4\n"
+    "punpckhbw %%mm0,%%mm3\n"
+    "punpcklbw %%mm0,%%mm2\n"
+    "psubw %%mm5,%%mm3\n"
+    "psubw %%mm4,%%mm2\n"
+                /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */
+    "PMULLW "MANGLE(V3)",%%mm3\n"    /* *3 */
+    "PMULLW "MANGLE(V3)",%%mm2\n"    /* *3 */
+    "paddw %%mm7,%%mm3\n"            /* highpart */
+    "paddw %%mm6,%%mm2\n"            /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]);  */
+    "paddw "MANGLE(V804)",%%mm3\n"   /* add 4 */ /* add 256 after shift */
+    "paddw "MANGLE(V804)",%%mm2\n"   /* add 4 */ /* add 256 after shift */
+    "psraw $3,%%mm3\n"               /* >>3 f coefs high */
+    "psraw $3,%%mm2\n"               /* >>3 f coefs low */
+
+    " pextrw $0,%%mm2,%%esi\n"  /* In MM3:MM2 we have f coefs (16bits) */
+    " pextrw $1,%%mm2,%%edi\n"  /* now perform MM7:MM6 = *(_bv+ f) */
+    " pinsrw $0,(%2,%%esi,2),%%mm6\n"
+    " pinsrw $1,(%2,%%edi,2),%%mm6\n"
+
+    " pextrw $2,%%mm2,%%esi\n"
+    " pextrw $3,%%mm2,%%edi\n"
+    " pinsrw $2,(%2,%%esi,2),%%mm6\n"
+    " pinsrw $3,(%2,%%edi,2),%%mm6\n"
+
+    " pextrw $0,%%mm3,%%esi\n"
+    " pextrw $1,%%mm3,%%edi\n"
+    " pinsrw $0,(%2,%%esi,2),%%mm7\n"
+    " pinsrw $1,(%2,%%edi,2),%%mm7\n"
+
+    " pextrw $2,%%mm3,%%esi\n"
+    " pextrw $3,%%mm3,%%edi\n"
+    " pinsrw $2,(%2,%%esi,2),%%mm7\n"
+    " pinsrw $3,(%2,%%edi,2),%%mm7\n"   //MM7 MM6   f=*(_bv+(f+4>>3));
+
+    "paddw %%mm6,%%mm4\n"       /* (pix[ystride]+f); */
+    "paddw %%mm7,%%mm5\n"       /* (pix[ystride]+f); */
+    "movq %%mm1,%%mm2\n"
+    "punpcklbw %%mm0,%%mm1\n"
+    "punpckhbw %%mm0,%%mm2\n"   //[ystride*2]
+    "psubw %%mm6,%%mm1\n"       /* (pix[ystride*2]-f); */
+    "psubw %%mm7,%%mm2\n"       /* (pix[ystride*2]-f); */
+    "packuswb %%mm2,%%mm1\n"
+    "packuswb %%mm5,%%mm4\n"
+    "movq %%mm1,(%0,%1,2)\n"    /* pix[ystride*2]= */
+    "movq %%mm4,(%0,%1)\n"      /* pix[ystride]= */
+    "emms\n"
+    :
+    : "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256)
+    : "esi", "edi" , "memory"
+    );
+}
+
+/* install our implementation in the function table */
+void dsp_mmx_dct_decode_init(DspFunctions *funcs)
+{
+  TH_DEBUG("enabling accelerated x86_32 mmx dct decode functions.\n");
+  funcs->FilterVert = FilterVert__mmx;
+  funcs->FilterHoriz = FilterHoriz__mmx;
+}
+
+#endif /* USE_ASM */

Added: trunk/theora/lib/x86_32/idct_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/idct_mmx.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/x86_32/idct_mmx.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -0,0 +1,1453 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
+
+ ********************************************************************/
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+#define ASM asm
+
+/****************************************************************************
+*
+*   Description  :     IDCT with multiple versions based on # of non 0 coeffs
+*
+*****************************************************************************
+*/
+
+// Dequantization + inverse discrete cosine transform.
+
+// Constants used in MMX implementation of dequantization and idct.
+// All the MMX stuff works with 4 16-bit quantities at a time and
+// we create 11 constants of size 4 x 16 bits.
+// The first 4 are used to mask the individual 16-bit words within a group
+// and are used in the address-shuffling part of the dequantization.
+// The last 7 are fixed-point approximations to the cosines of angles
+// occurring in the DCT; each of these contains 4 copies of the same value.
+
+// There is only one (statically initialized) instance of this object
+// wrapped in an allocator object that forces its starting address
+// to be evenly divisible by 32.  Hence the actual object occupies 2.75
+// cache lines on a Pentium processor.
+
+// Offsets in bytes used by the assembler code below
+// must of course agree with the idctConstants constructor.
+
+#define MaskOffset 0        // 4 masks come in order low word to high
+#define CosineOffset 32     // 7 cosines come in order pi/16 * (1 ... 7)
+#define EightOffset 88
+#define IdctAdjustBeforeShift 8
+
+/*
+UINT16 idctcosTbl[ 7] =
+{
+    64277, 60547, 54491, 46341, 36410, 25080, 12785
+};
+
+void fillidctconstants(void)
+{
+    int j = 16;
+    UINT16 * p;
+    do
+    {
+        idctconstants[ --j] = 0;
+    }
+    while( j);
+
+    idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;
+
+    j = 1;
+    do
+    {
+        p = idctconstants + ( (j+3) << 2);
+        p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
+    }
+    while( ++j <= 7);
+
+    idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
+}
+*/
+
+ogg_uint16_t idctconstants[(4+7+1) * 4] = {
+    65535,     0,     0,     0,     0, 65535,     0,     0,
+        0,     0, 65535,     0,     0,     0,     0, 65535,
+    64277, 64277, 64277, 64277, 60547, 60547, 60547, 60547,
+    54491, 54491, 54491, 54491, 46341, 46341, 46341, 46341,
+    36410, 36410, 36410, 36410, 25080, 25080, 25080, 25080,
+    12785, 12785, 12785, 12785,     8,     8,     8,     8,
+};
+
+/* Dequantization + inverse DCT.
+
+   Dequantization multiplies user's 16-bit signed indices (range -512 to +511)
+   by unsigned 16-bit quantization table entries.
+   These table entries are upscaled by 4, max is 30 * 128 * 4 < 2^14.
+   Result is scaled signed DCT coefficients (abs value < 2^15).
+
+   In the data stream, the coefficients are sent in order of increasing
+   total (horizontal + vertical) frequency.  The exact picture is as follows:
+
+    00 01 05 06  16 17 33 34
+    02 04 07 15  20 32 35 52
+    03 10 14 21  31 36 51 53
+    11 13 22 30  37 50 54 65
+
+    12 23 27 40  47 55 64 66
+    24 26 41 46  56 63 67 74
+    25 42 45 57  62 70 73 75
+    43 44 60 61  71 72 76 77
+
+   Here the position in the matrix corresponds to the (horiz,vert)
+   freqency indices and the octal entry in the matrix is the position
+   of the coefficient in the data stream.  Thus the coefficients are sent
+   in sort of a diagonal "snake".
+
+   The dequantization stage "uncurls the snake" and stores the expanded
+   coefficients in more convenient positions.  These are not exactly the
+   natural positions given above but take into account our implementation
+   of the idct, which basically requires two one-dimensional idcts and
+   two transposes.
+
+   We fold the first transpose into the storage of the expanded coefficients.
+   We don't actually do a full transpose because this would require doubling
+   the size of the idct buffer; rather, we just transpose each of the 4x4
+   subblocks.  Using slightly varying addressing schemes in each of the
+   four 4x8 idcts then allows these transforms to be done in place.
+
+   Transposing the 4x4 subblocks in the matrix above gives
+
+    00 02 03 11  16 20 31 37
+    01 04 10 13  17 32 36 50
+    05 07 14 22  33 35 51 54
+    06 15 21 30  34 52 53 65
+
+    12 24 25 43  47 56 62 71
+    23 26 42 44  55 63 70 72
+    27 41 45 60  64 67 73 76
+    40 46 57 61  66 74 75 77
+
+   Finally, we reverse the words in each 4 word group to clarify
+   direction of shifts.
+
+    11 03 02 00  37 31 20 16
+    13 10 04 01  50 36 32 17
+    22 14 07 05  54 51 35 33
+    30 21 15 06  65 53 52 34
+
+    43 25 24 12  71 62 56 47
+    44 42 26 23  72 70 63 55
+    60 45 41 27  76 73 67 64
+    61 57 46 40  77 75 74 66
+
+   This matrix then shows the 16 4x16 destination words in terms of
+   the 16 4x16 input words.
+
+   We implement this algorithm by manipulation of mmx registers,
+   which seems to be the fastest way to proceed.  It is completely
+   hand-written; there does not seem to be enough recurrence to
+   reasonably compartmentalize any of it.  Hence the resulting
+   program is ugly and bloated.  Furthermore, due to the absence of
+   register pressure, it is boring and artless.  I hate it.
+
+   The idct itself is more interesting.  Since the two-dimensional dct
+   basis functions are products of the one-dimesional dct basis functions,
+   we can compute an inverse (or forward) dct via two 1-D transforms,
+   on rows then on columns.  To exploit MMX parallelism, we actually do
+   both operations on columns, interposing a (partial) transpose between
+   the two 1-D transforms, the first transpose being done by the expansion
+   described above.
+
+   The 8-sample one-dimensional DCT is a standard orthogonal expansion using
+   the (unnormalized) basis functions
+
+    b[k]( i) = cos( pi * k * (2i + 1) / 16);
+
+   here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
+   To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
+   should be multiplied by 1/2.
+
+   The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
+   in each direction.  The (unnormalized) basis functions are
+
+    B[k,l]( i, j) = b[k]( i) * b[l]( j);
+
+   this time k and l are the horizontal and vertical frequencies,
+   i and j are the horizontal and vertical spatial coordinates;
+   all indices vary from 0 ... 7 (as above)
+   and there are now 4 cases of normalization.
+
+   Our 1-D idct expansion uses constants C1 ... C7 given by
+
+    (*)  Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)
+
+   and the following 1-D algorithm transforming I0 ... I7  to  R0 ... R7 :
+
+   A = (C1 * I1) + (C7 * I7)        B = (C7 * I1) - (C1 * I7)
+   C = (C3 * I3) + (C5 * I5)        D = (C3 * I5) - (C5 * I3)
+   A. = C4 * (A - C)                B. = C4 * (B - D)
+   C. = A + C                       D. = B + D
+
+   E = C4 * (I0 + I4)               F = C4 * (I0 - I4)
+   G = (C2 * I2) + (C6 * I6)        H = (C6 * I2) - (C2 * I6)
+   E. = E - G
+   G. = E + G
+
+   A.. = F + A.                 B.. = B. - H
+   F.  = F - A.                 H.  = B. + H
+
+   R0 = G. + C. R1 = A.. + H.   R3 = E. + D.    R5 = F. + B..
+   R7 = G. - C. R2 = A.. - H.   R4 = E. - D.    R6 = F. - B..
+
+   It is due to Vetterli and Lightenberg and may be found in the JPEG
+   reference book by Pennebaker and Mitchell.
+
+   Correctness of the algorithm follows from (*) together with the
+   addition formulas for sine and cosine:
+
+    cos( A + B) = cos( A) * cos( B)  -  sin( A) * sin( B)
+    sin( A + B) = sin( A) * cos( B)  +  cos( A) * sin( B)
+
+   Note that this implementation absorbs the difference in normalization
+   between the 0th and higher frequencies, although the results produced
+   are actually twice as big as they should be.  Since we do this for each
+   dimension, the 2-D idct results are 4x the desired results.  Finally,
+   taking into account that the dequantization multiplies by 4 as well,
+   our actual results are 16x too big.  We fix this by shifting the final
+   results right by 4 bits.
+
+   High precision version approximates C1 ... C7 to 16 bits.
+   Since MMX only provides a signed multiply, C1 ... C5 appear to be
+   negative and multiplies involving them must be adjusted to compensate
+   for this.  C6 and C7 do not require this adjustment since
+   they are < 1/2 and are correctly treated as positive numbers.
+
+   Following macro does four 8-sample one-dimensional idcts in parallel.
+   This is actually not such a difficult program to write once you
+   make a couple of observations (I of course was unable to make these
+   observations until I'd half-written a couple of other versions).
+
+    1. Everything is easy once you are done with the multiplies.
+       This is because, given X and Y in registers, one may easily
+       calculate X+Y and X-Y using just those 2 registers.
+
+    2. You always need at least 2 extra registers to calculate products,
+       so storing 2 temporaries is inevitable.  C. and D. seem to be
+       the best candidates.
+
+    3. The products should be calculated in decreasing order of complexity
+       (which translates into register pressure).  Since C1 ... C5 require
+       adjustment (and C6, C7 do not), we begin by calculating C and D.
+*/
+
+/**************************************************************************************
+ *
+ *      Routine:        BeginIDCT
+ *
+ *      Description:    The Macro does IDct on 4 1-D Dcts
+ *
+ *      Input:          None
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   None
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+
+#define MtoSTR(s) #s
+
+#define Dump    "call MMX_dump\n"
+
+#define BeginIDCT "#BeginIDCT\n"    \
+                                    \
+    "   movq    "   I(3)","r2"\n"   \
+                                    \
+    "   movq    "   C(3)","r6"\n"   \
+    "   movq    "   r2","r4"\n"     \
+    "   movq    "   J(5)","r7"\n"   \
+    "   pmulhw  "   r6","r4"\n"     \
+    "   movq    "   C(5)","r1"\n"   \
+    "   pmulhw  "   r7","r6"\n"     \
+    "   movq    "   r1","r5"\n"     \
+    "   pmulhw  "   r2","r1"\n"     \
+    "   movq    "   I(1)","r3"\n"   \
+    "   pmulhw  "   r7","r5"\n"     \
+    "   movq    "   C(1)","r0"\n"   \
+    "   paddw   "   r2","r4"\n"     \
+    "   paddw   "   r7","r6"\n"     \
+    "   paddw   "   r1","r2"\n"     \
+    "   movq    "   J(7)","r1"\n"   \
+    "   paddw   "   r5","r7"\n"     \
+    "   movq    "   r0","r5"\n"     \
+    "   pmulhw  "   r3","r0"\n"     \
+    "   paddsw  "   r7","r4"\n"     \
+    "   pmulhw  "   r1","r5"\n"     \
+    "   movq    "   C(7)","r7"\n"   \
+    "   psubsw  "   r2","r6"\n"     \
+    "   paddw   "   r3","r0"\n"     \
+    "   pmulhw  "   r7","r3"\n"     \
+    "   movq    "   I(2)","r2"\n"   \
+    "   pmulhw  "   r1","r7"\n"     \
+    "   paddw   "   r1","r5"\n"     \
+    "   movq    "   r2","r1"\n"     \
+    "   pmulhw  "   C(2)","r2"\n"   \
+    "   psubsw  "   r5","r3"\n"     \
+    "   movq    "   J(6)","r5"\n"   \
+    "   paddsw  "   r7","r0"\n"     \
+    "   movq    "   r5","r7"\n"     \
+    "   psubsw  "   r4","r0"\n"     \
+    "   pmulhw  "   C(2)","r5"\n"   \
+    "   paddw   "   r1","r2"\n"     \
+    "   pmulhw  "   C(6)","r1"\n"   \
+    "   paddsw  "   r4","r4"\n"     \
+    "   paddsw  "   r0","r4"\n"     \
+    "   psubsw  "   r6","r3"\n"     \
+    "   paddw   "   r7","r5"\n"     \
+    "   paddsw  "   r6","r6"\n"     \
+    "   pmulhw  "   C(6)","r7"\n"   \
+    "   paddsw  "   r3","r6"\n"     \
+    "   movq    "   r4","I(1)"\n"   \
+    "   psubsw  "   r5","r1"\n"     \
+    "   movq    "   C(4)","r4"\n"   \
+    "   movq    "   r3","r5"\n"     \
+    "   pmulhw  "   r4","r3"\n"     \
+    "   paddsw  "   r2","r7"\n"     \
+    "   movq    "   r6","I(2)"\n"   \
+    "   movq    "   r0","r2"\n"     \
+    "   movq    "   I(0)","r6"\n"   \
+    "   pmulhw  "   r4","r0"\n"     \
+    "   paddw   "   r3","r5"\n"     \
+    "\n"                            \
+    "   movq    "   J(4)","r3"\n"   \
+    "   psubsw  "   r1","r5"\n"     \
+    "   paddw   "   r0","r2"\n"     \
+    "   psubsw  "   r3","r6"\n"     \
+    "   movq    "   r6","r0"\n"     \
+    "   pmulhw  "   r4","r6"\n"     \
+    "   paddsw  "   r3","r3"\n"     \
+    "   paddsw  "   r1","r1"\n"     \
+    "   paddsw  "   r0","r3"\n"     \
+    "   paddsw  "   r5","r1"\n"     \
+    "   pmulhw  "   r3","r4"\n"     \
+    "   paddsw  "   r0","r6"\n"     \
+    "   psubsw  "   r2","r6"\n"     \
+    "   paddsw  "   r2","r2"\n"     \
+    "   movq    "   I(1)","r0"\n"   \
+    "   paddsw  "   r6","r2"\n"     \
+    "   paddw   "   r3","r4"\n"     \
+    "   psubsw  "   r1","r2"\n"     \
+    "#end BeginIDCT\n"
+// end BeginIDCT macro (38 cycles).
+
+
+// Two versions of the end of the idct depending on whether we're feeding
+// into a transpose or dividing the final results by 16 and storing them.
+
+/**************************************************************************************
+ *
+ *      Routine:        RowIDCT
+ *
+ *      Description:    The Macro does 1-D IDct on 4 Rows
+ *
+ *      Input:          None
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   None
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+
+// RowIDCT gets ready to transpose.
+
+#define RowIDCT ASM("\n"\
+    "#RowIDCT\n"                                        \
+    BeginIDCT                                           \
+    "\n"                                                \
+    "   movq    "I(2)","r3"\n"  /* r3 = D. */           \
+    "   psubsw  "r7","r4"\n"    /* r4 = E. = E - G */   \
+    "   paddsw  "r1","r1"\n"    /* r1 = H. + H. */      \
+    "   paddsw  "r7","r7"\n"    /* r7 = G + G */        \
+    "   paddsw  "r2","r1"\n"    /* r1 = R1 = A.. + H. */\
+    "   paddsw  "r4","r7"\n"    /* r7 = G. = E + G */   \
+    "   psubsw  "r3","r4"\n"    /* r4 = R4 = E. - D. */ \
+    "   paddsw  "r3","r3"\n"                            \
+    "   psubsw  "r5","r6"\n"    /* r6 = R6 = F. - B.. */\
+    "   paddsw  "r5","r5"\n"                            \
+    "   paddsw  "r4","r3"\n"    /* r3 = R3 = E. + D. */ \
+    "   paddsw  "r6","r5"\n"    /* r5 = R5 = F. + B.. */\
+    "   psubsw  "r0","r7"\n"    /* r7 = R7 = G. - C. */ \
+    "   paddsw  "r0","r0"\n"                            \
+    "   movq    "r1","I(1)"\n"  /* save R1 */           \
+    "   paddsw  "r7","r0"\n"    /* r0 = R0 = G. + C. */ \
+    "#end RowIDCT"										\
+);
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+
+/**************************************************************************************
+ *
+ *      Routine:        ColumnIDCT
+ *
+ *      Description:    The Macro does 1-D IDct on 4 columns
+ *
+ *      Input:          None
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   None
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+// Column IDCT normalizes and stores final results.
+
+#define ColumnIDCT ASM("\n"                                 \
+    "#ColumnIDCT\n"                                         \
+    BeginIDCT                                               \
+    "\n"                                                    \
+    "   paddsw  "Eight","r2"\n"                             \
+    "   paddsw  "r1","r1"\n"        /* r1 = H. + H. */      \
+    "   paddsw  "r2","r1"\n"        /* r1 = R1 = A.. + H. */\
+    "   psraw   ""$4"","r2"\n"      /* r2 = NR2 */          \
+    "   psubsw  "r7","r4"\n"        /* r4 = E. = E - G */   \
+    "   psraw   ""$4"","r1"\n"      /* r1 = NR1 */          \
+    "   movq    "I(2)","r3"\n"  /* r3 = D. */               \
+    "   paddsw  "r7","r7"\n"        /* r7 = G + G */        \
+    "   movq    "r2","I(2)"\n"  /* store NR2 at I2 */       \
+    "   paddsw  "r4","r7"\n"        /* r7 = G. = E + G */   \
+    "   movq    "r1","I(1)"\n"  /* store NR1 at I1 */       \
+    "   psubsw  "r3","r4"\n"        /* r4 = R4 = E. - D. */ \
+    "   paddsw  "Eight","r4"\n"                             \
+    "   paddsw  "r3","r3"\n"        /* r3 = D. + D. */      \
+    "   paddsw  "r4","r3"\n"        /* r3 = R3 = E. + D. */ \
+    "   psraw   ""$4"","r4"\n"      /* r4 = NR4 */          \
+    "   psubsw  "r5","r6"\n"        /* r6 = R6 = F. - B.. */\
+    "   psraw   ""$4"","r3"\n"      /* r3 = NR3 */          \
+    "   paddsw  "Eight","r6"\n"                             \
+    "   paddsw  "r5","r5"\n"        /* r5 = B.. + B.. */    \
+    "   paddsw  "r6","r5"\n"        /* r5 = R5 = F. + B.. */\
+    "   psraw   ""$4"","r6"\n"      /* r6 = NR6 */          \
+    "   movq    "r4","J(4)"\n"  /* store NR4 at J4 */       \
+    "   psraw   ""$4"","r5"\n"      /* r5 = NR5 */          \
+    "   movq    "r3","I(3)"\n"  /* store NR3 at I3 */       \
+    "   psubsw  "r0","r7"\n"        /* r7 = R7 = G. - C. */ \
+    "   paddsw  "Eight","r7"\n"                             \
+    "   paddsw  "r0","r0"\n"        /* r0 = C. + C. */      \
+    "   paddsw  "r7","r0"\n"        /* r0 = R0 = G. + C. */ \
+    "   psraw   ""$4"","r7"\n"      /* r7 = NR7 */          \
+    "   movq    "r6","J(6)"\n"  /* store NR6 at J6 */       \
+    "   psraw   ""$4"","r0"\n"      /* r0 = NR0 */          \
+    "   movq    "r5","J(5)"\n"  /* store NR5 at J5 */       \
+    "   movq    "r7","J(7)"\n"  /* store NR7 at J7 */       \
+    "   movq    "r0","I(0)"\n"  /* store NR0 at I0 */       \
+    "#end ColumnIDCT\n"										\
+);
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+
+/**************************************************************************************
+ *
+ *      Routine:        Transpose
+ *
+ *      Description:    The Macro does two 4x4 transposes in place.
+ *
+ *      Input:          None
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   None
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+
+/* Following macro does two 4x4 transposes in place.
+
+  At entry (we assume):
+
+    r0 = a3 a2 a1 a0
+    I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
+
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
+
+   At exit, we have:
+
+    I(0) = d0 c0 b0 a0
+    I(1) = d1 c1 b1 a1
+    I(2) = d2 c2 b2 a2
+    I(3) = d3 c3 b3 a3
+
+    J(4) = h0 g0 f0 e0
+    J(5) = h1 g1 f1 e1
+    J(6) = h2 g2 f2 e2
+    J(7) = h3 g3 f3 e3
+
+   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
+   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
+
+   Since r1 is free at entry, we calculate the Js first. */
+
+
+#define Transpose ASM("\n#Transpose\n"      \
+                                            \
+    "   movq        "r4","r1"\n"            \
+    "   punpcklwd   "r5","r4"\n"            \
+    "   movq        "r0","I(0)"\n"          \
+    "   punpckhwd   "r5","r1"\n"            \
+    "   movq        "r6","r0"\n"            \
+    "   punpcklwd   "r7","r6"\n"            \
+    "   movq        "r4","r5"\n"            \
+    "   punpckldq   "r6","r4"\n"            \
+    "   punpckhdq   "r6","r5"\n"            \
+    "   movq        "r1","r6"\n"            \
+    "   movq        "r4","J(4)"\n"          \
+    "   punpckhwd   "r7","r0"\n"            \
+    "   movq        "r5","J(5)"\n"          \
+    "   punpckhdq   "r0","r6"\n"            \
+    "   movq        "I(0)","r4"\n"          \
+    "   punpckldq   "r0","r1"\n"            \
+    "   movq        "I(1)","r5"\n"          \
+    "   movq        "r4","r0"\n"            \
+    "   movq        "r6","J(7)"\n"          \
+    "   punpcklwd   "r5","r0"\n"            \
+    "   movq        "r1","J(6)"\n"          \
+    "   punpckhwd   "r5","r4"\n"            \
+    "   movq        "r2","r5"\n"            \
+    "   punpcklwd   "r3","r2"\n"            \
+    "   movq        "r0","r1"\n"            \
+    "   punpckldq   "r2","r0"\n"            \
+    "   punpckhdq   "r2","r1"\n"            \
+    "   movq        "r4","r2"\n"            \
+    "   movq        "r0","I(0)"\n"          \
+    "   punpckhwd   "r3","r5"\n"            \
+    "   movq        "r1","I(1)"\n"          \
+    "   punpckhdq   "r5","r4"\n"            \
+    "   punpckldq   "r5","r2"\n"            \
+                                            \
+    "   movq        "r4","I(3)"\n"          \
+                                            \
+    "   movq        "r2","I(2)"\n"          \
+    "#end Transpose\n"						\
+);
+// end Transpose macro (19 cycles).
+
+/*
+static void MMX_dump()
+{
+    ASM
+    ("\
+        movq    %mm0,(%edi)\n\
+        movq    %mm1,8(%edi)\n\
+        movq    %mm2,16(%edi)\n\
+        movq    %mm3,24(%edi)\n\
+        movq    %mm4,32(%edi)\n\
+        movq    %mm5,40(%edi)\n\
+        movq    %mm6,48(%edi)\n\
+        movq    %mm7,56(%edi)\n\
+        ret"
+    );
+}
+*/
+
+/**************************************************************************************
+ *
+ *      Routine:        MMX_idct
+ *
+ *      Description:    Perform IDCT on a 8x8 block
+ *
+ *      Input:          Pointer to input and output buffer
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   The input coefficients are in ZigZag order
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+void IDctSlow__mmx(  Q_LIST_ENTRY * InputData,
+                ogg_int16_t *QuantMatrix,
+                ogg_int16_t * OutputData ) {
+
+#   define MIDM(M,I)    MtoSTR(M+I*8(%ecx))
+#   define M(I)         MIDM( MaskOffset , I )
+#   define MIDC(M,I)    MtoSTR(M+(I-1)*8(%ecx))
+#   define C(I)         MIDC( CosineOffset , I )
+#   define MIDEight(M)  MtoSTR(M(%ecx))
+#   define Eight        MIDEight(EightOffset)
+
+#   define r0   "%mm0"
+#   define r1   "%mm1"
+#   define r2   "%mm2"
+#   define r3   "%mm3"
+#   define r4   "%mm4"
+#   define r5   "%mm5"
+#   define r6   "%mm6"
+#   define r7   "%mm7"
+
+    __asm__ __volatile__ (
+    /* eax = quantized input */
+    /* esi = quantization table */
+    /* edx = destination (= idct buffer) */
+    /* ecx = idctconstants */
+    ""
+    :
+    :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
+    );
+
+    ASM(
+    "movq   (%eax), "r0"\n"
+    "pmullw (%esi), "r0"\n"     /* r0 = 03 02 01 00 */
+    "movq   16(%eax), "r1"\n"
+    "pmullw 16(%esi), "r1"\n"   /* r1 = 13 12 11 10 */
+    "movq   "M(0)", "r2"\n"     /* r2 = __ __ __ FF */
+    "movq   "r0", "r3"\n"       /* r3 = 03 02 01 00 */
+    "movq   8(%eax), "r4"\n"
+    "psrlq  $16, "r0"\n"        /* r0 = __ 03 02 01 */
+    "pmullw 8(%esi), "r4"\n"    /* r4 = 07 06 05 04 */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 00 */
+    "movq   "r0", "r5"\n"       /* r5 = __ 03 02 01 */
+    "movq   "r1", "r6"\n"       /* r6 = 13 12 11 10 */
+    "pand   "r2", "r5"\n"       /* r5 = __ __ __ 01 */
+    "psllq  $32, "r6"\n"        /* r6 = 11 10 __ __ */
+    "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
+    "pxor   "r5", "r0"\n"       /* r0 = __ 03 02 __ */
+    "pand   "r6", "r7"\n"       /* r7 = 11 __ __ __ */
+    "por    "r3", "r0"\n"       /* r0 = __ 03 02 00 */
+    "pxor   "r7", "r6"\n"       /* r6 = __ 10 __ __ */
+    "por    "r7", "r0"\n"       /* r0 = 11 03 02 00 = R0 */
+    "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
+    "movq   "r4", "r3"\n"       /* r3 = 07 06 05 04 */
+    "movq   "r0", (%edx)\n"     /* write R0 = r0 */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 04 */
+    "movq   32(%eax), "r0"\n"
+    "psllq  $16, "r3"\n"        /* r3 = __ __ 04 __ */
+    "pmullw 32(%esi), "r0"\n"   /* r0 = 23 22 21 20 */
+    "pand   "r1", "r7"\n"       /* r7 = 13 __ __ __ */
+    "por    "r3", "r5"\n"       /* r5 = __ __ 04 01 */
+    "por    "r6", "r7"\n"       /* r7 = 13 10 __ __ */
+    "movq   24(%eax), "r3"\n"
+    "por    "r5", "r7"\n"       /* r7 = 13 10 04 01 = R1 */
+    "pmullw 24(%esi), "r3"\n"   /* r3 = 17 16 15 14 */
+    "psrlq  $16, "r4"\n"        /* r4 = __ 07 06 05 */
+    "movq   "r7", 16(%edx)\n"   /* write R1 = r7 */
+    "movq   "r4", "r5"\n"       /* r5 = __ 07 06 05 */
+    "movq   "r0", "r7"\n"       /* r7 = 23 22 21 20 */
+    "psrlq  $16, "r4"\n"        /* r4 = __ __ 07 06 */
+    "psrlq  $48, "r7"\n"        /* r7 = __ __ __ 23 */
+    "movq   "r2", "r6"\n"       /* r6 = __ __ __ FF */
+    "pand   "r2", "r5"\n"       /* r5 = __ __ __ 05 */
+    "pand   "r4", "r6"\n"       /* r6 = __ __ __ 06 */
+    "movq   "r7", 80(%edx)\n"   /* partial R9 = __ __ __ 23 */
+    "pxor   "r6", "r4"\n"       /* r4 = __ __ 07 __ */
+    "psrlq  $32, "r1"\n"        /* r1 = __ __ 13 12 */
+    "por    "r5", "r4"\n"       /* r4 = __ __ 07 05 */
+    "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
+    "pand   "r2", "r1"\n"       /* r1 = __ __ __ 12 */
+    "movq   48(%eax), "r5"\n"
+    "psllq  $16, "r0"\n"        /* r0 = 22 21 20 __ */
+    "pmullw 48(%esi), "r5"\n"   /* r5 = 33 32 31 30 */
+    "pand   "r0", "r7"\n"       /* r7 = 22 __ __ __ */
+    "movq   "r1", 64(%edx)\n"   /* partial R8 = __ __ __ 12 */
+    "por    "r4", "r7"\n"       /* r7 = 22 __ 07 05 */
+    "movq   "r3", "r4"\n"       /* r4 = 17 16 15 14 */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 14 */
+    "movq   "M(2)", "r1"\n"     /* r1 = __ FF __ __ */
+    "psllq  $32, "r3"\n"        /* r3 = __ 14 __ __ */
+    "por    "r3", "r7"\n"       /* r7 = 22 14 07 05 = R2 */
+    "movq   "r5", "r3"\n"       /* r3 = 33 32 31 30 */
+    "psllq  $48, "r3"\n"        /* r3 = 30 __ __ __ */
+    "pand   "r0", "r1"\n"       /* r1 = __ 21 __ __ */
+    "movq   "r7", 32(%edx)\n"   /* write R2 = r7 */
+    "por    "r3", "r6"\n"       /* r6 = 30 __ __ 06 */
+    "movq   "M(1)", "r7"\n"     /* r7 = __ __ FF __ */
+    "por    "r1", "r6"\n"       /* r6 = 30 21 __ 06 */
+    "movq   56(%eax), "r1"\n"
+    "pand   "r4", "r7"\n"       /* r7 = __ __ 15 __ */
+    "pmullw 56(%esi), "r1"\n"   /* r1 = 37 36 35 34 */
+    "por    "r6", "r7"\n"       /* r7 = 30 21 15 06 = R3 */
+    "pand   "M(1)", "r0"\n"     /* r0 = __ __ 20 __ */
+    "psrlq  $32, "r4"\n"        /* r4 = __ __ 17 16 */
+    "movq   "r7", 48(%edx)\n"   /* write R3 = r7 */
+    "movq   "r4", "r6"\n"       /* r6 = __ __ 17 16 */
+    "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
+    "pand   "r2", "r4"\n"       /* r4 = __ __ __ 16 */
+    "movq   "M(1)", "r3"\n"     /* r3 = __ __ FF __ */
+    "pand   "r1", "r7"\n"       /* r7 = 37 __ __ __ */
+    "pand   "r5", "r3"\n"       /* r3 = __ __ 31 __ */
+    "por    "r4", "r0"\n"       /* r0 = __ __ 20 16 */
+    "psllq  $16, "r3"\n"        /* r3 = __ 31 __ __ */
+    "por    "r0", "r7"\n"       /* r7 = 37 __ 20 16 */
+    "movq   "M(2)", "r4"\n"     /* r4 = __ FF __ __ */
+    "por    "r3", "r7"\n"       /* r7 = 37 31 20 16 = R4 */
+    "movq   80(%eax), "r0"\n"
+    "movq   "r4", "r3"\n"       /* r3 = __ __ FF __ */
+    "pmullw 80(%esi), "r0"\n"   /* r0 = 53 52 51 50 */
+    "pand   "r5", "r4"\n"       /* r4 = __ 32 __ __ */
+    "movq   "r7", 8(%edx)\n"    /* write R4 = r7 */
+    "por    "r4", "r6"\n"       /* r6 = __ 32 17 16 */
+    "movq   "r3", "r4"\n"       /* r4 = __ FF __ __ */
+    "psrlq  $16, "r6"\n"        /* r6 = __ __ 32 17 */
+    "movq   "r0", "r7"\n"       /* r7 = 53 52 51 50 */
+    "pand   "r1", "r4"\n"       /* r4 = __ 36 __ __ */
+    "psllq  $48, "r7"\n"        /* r7 = 50 __ __ __ */
+    "por    "r4", "r6"\n"       /* r6 = __ 36 32 17 */
+    "movq   88(%eax), "r4"\n"
+    "por    "r6", "r7"\n"       /* r7 = 50 36 32 17 = R5 */
+    "pmullw 88(%esi), "r4"\n"   /* r4 = 57 56 55 54 */
+    "psrlq  $16, "r3"\n"        /* r3 = __ __ FF __ */
+    "movq   "r7", 24(%edx)\n"   /* write R5 = r7 */
+    "pand   "r1", "r3"\n"       /* r3 = __ __ 35 __ */
+    "psrlq  $48, "r5"\n"        /* r5 = __ __ __ 33 */
+    "pand   "r2", "r1"\n"       /* r1 = __ __ __ 34 */
+    "movq   104(%eax), "r6"\n"
+    "por    "r3", "r5"\n"       /* r5 = __ __ 35 33 */
+    "pmullw 104(%esi), "r6"\n"  /* r6 = 67 66 65 64 */
+    "psrlq  $16, "r0"\n"        /* r0 = __ 53 52 51 */
+    "movq   "r4", "r7"\n"       /* r7 = 57 56 55 54 */
+    "movq   "r2", "r3"\n"       /* r3 = __ __ __ FF */
+    "psllq  $48, "r7"\n"        /* r7 = 54 __ __ __ */
+    "pand   "r0", "r3"\n"       /* r3 = __ __ __ 51 */
+    "pxor   "r3", "r0"\n"       /* r0 = __ 53 52 __ */
+    "psllq  $32, "r3"\n"        /* r3 = __ 51 __ __ */
+    "por    "r5", "r7"\n"       /* r7 = 54 __ 35 33 */
+    "movq   "r6", "r5"\n"       /* r5 = 67 66 65 64 */
+    "pand   "M(1)", "r6"\n"     /* r6 = __ __ 65 __ */
+    "por    "r3", "r7"\n"       /* r7 = 54 51 35 33 = R6 */
+    "psllq  $32, "r6"\n"        /* r6 = 65 __ __ __ */
+    "por    "r1", "r0"\n"       /* r0 = __ 53 52 34 */
+    "movq   "r7", 40(%edx)\n"   /* write R6 = r7 */
+    "por    "r6", "r0"\n"       /* r0 = 65 53 52 34 = R7 */
+    "movq   120(%eax), "r7"\n"
+    "movq   "r5", "r6"\n"       /* r6 = 67 66 65 64 */
+    "pmullw 120(%esi), "r7"\n"  /* r7 = 77 76 75 74 */
+    "psrlq  $32, "r5"\n"        /* r5 = __ __ 67 66 */
+    "pand   "r2", "r6"\n"       /* r6 = __ __ __ 64 */
+    "movq   "r5", "r1"\n"       /* r1 = __ __ 67 66 */
+    "movq   "r0", 56(%edx)\n"   /* write R7 = r0 */
+    "pand   "r2", "r1"\n"       /* r1 = __ __ __ 66 */
+    "movq   112(%eax), "r0"\n"
+    "movq   "r7", "r3"\n"       /* r3 = 77 76 75 74 */
+    "pmullw 112(%esi), "r0"\n"  /* r0 = 73 72 71 70 */
+    "psllq  $16, "r3"\n"        /* r3 = 76 75 74 __ */
+    "pand   "M(3)", "r7"\n"     /* r7 = 77 __ __ __ */
+    "pxor   "r1", "r5"\n"       /* r5 = __ __ 67 __ */
+    "por    "r5", "r6"\n"       /* r6 = __ __ 67 64 */
+    "movq   "r3", "r5"\n"       /* r5 = 76 75 74 __ */
+    "pand   "M(3)", "r5"\n"     /* r5 = 76 __ __ __ */
+    "por    "r1", "r7"\n"       /* r7 = 77 __ __ 66 */
+    "movq   96(%eax), "r1"\n"
+    "pxor   "r5", "r3"\n"       /* r3 = __ 75 74 __ */
+    "pmullw 96(%esi), "r1"\n"   /* r1 = 63 62 61 60 */
+    "por    "r3", "r7"\n"       /* r7 = 77 75 74 66 = R15 */
+    "por    "r5", "r6"\n"       /* r6 = 76 __ 67 64 */
+    "movq   "r0", "r5"\n"       /* r5 = 73 72 71 70 */
+    "movq   "r7", 120(%edx)\n"  /* store R15 = r7 */
+    "psrlq  $16, "r5"\n"        /* r5 = __ 73 72 71 */
+    "pand   "M(2)", "r5"\n"     /* r5 = __ 73 __ __ */
+    "movq   "r0", "r7"\n"       /* r7 = 73 72 71 70 */
+    "por    "r5", "r6"\n"       /* r6 = 76 73 67 64 = R14 */
+    "pand   "r2", "r0"\n"       /* r0 = __ __ __ 70 */
+    "pxor   "r0", "r7"\n"       /* r7 = 73 72 71 __ */
+    "psllq  $32, "r0"\n"        /* r0 = __ 70 __ __ */
+    "movq   "r6", 104(%edx)\n"  /* write R14 = r6 */
+    "psrlq  $16, "r4"\n"        /* r4 = __ 57 56 55 */
+    "movq   72(%eax), "r5"\n"
+    "psllq  $16, "r7"\n"        /* r7 = 72 71 __ __ */
+    "pmullw 72(%esi), "r5"\n"   /* r5 = 47 46 45 44 */
+    "movq   "r7", "r6"\n"       /* r6 = 72 71 __ __ */
+    "movq   "M(2)", "r3"\n"     /* r3 = __ FF __ __ */
+    "psllq  $16, "r6"\n"        /* r6 = 71 __ __ __ */
+    "pand   "M(3)", "r7"\n"     /* r7 = 72 __ __ __ */
+    "pand   "r1", "r3"\n"       /* r3 = __ 62 __ __ */
+    "por    "r0", "r7"\n"       /* r7 = 72 70 __ __ */
+    "movq   "r1", "r0"\n"       /* r0 = 63 62 61 60 */
+    "pand   "M(3)", "r1"\n"     /* r1 = 63 __ __ __ */
+    "por    "r3", "r6"\n"       /* r6 = 71 62 __ __ */
+    "movq   "r4", "r3"\n"       /* r3 = __ 57 56 55 */
+    "psrlq  $32, "r1"\n"        /* r1 = __ __ 63 __ */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 55 */
+    "por    "r1", "r7"\n"       /* r7 = 72 70 63 __ */
+    "por    "r3", "r7"\n"       /* r7 = 72 70 63 55 = R13 */
+    "movq   "r4", "r3"\n"       /* r3 = __ 57 56 55 */
+    "pand   "M(1)", "r3"\n"     /* r3 = __ __ 56 __ */
+    "movq   "r5", "r1"\n"       /* r1 = 47 46 45 44 */
+    "movq   "r7", 88(%edx)\n"   /* write R13 = r7 */
+    "psrlq  $48, "r5"\n"        /* r5 = __ __ __ 47 */
+    "movq   64(%eax), "r7"\n"
+    "por    "r3", "r6"\n"       /* r6 = 71 62 56 __ */
+    "pmullw 64(%esi), "r7"\n"   /* r7 = 43 42 41 40 */
+    "por    "r5", "r6"\n"       /* r6 = 71 62 56 47 = R12 */
+    "pand   "M(2)", "r4"\n"     /* r4 = __ 57 __ __ */
+    "psllq  $32, "r0"\n"        /* r0 = 61 60 __ __ */
+    "movq   "r6", 72(%edx)\n"   /* write R12 = r6 */
+    "movq   "r0", "r6"\n"       /* r6 = 61 60 __ __ */
+    "pand   "M(3)", "r0"\n"     /* r0 = 61 __ __ __ */
+    "psllq  $16, "r6"\n"        /* r6 = 60 __ __ __ */
+    "movq   40(%eax), "r5"\n"
+    "movq   "r1", "r3"\n"       /* r3 = 47 46 45 44 */
+    "pmullw 40(%esi), "r5"\n"   /* r5 = 27 26 25 24 */
+    "psrlq  $16, "r1"\n"        /* r1 = __ 47 46 45 */
+    "pand   "M(1)", "r1"\n"     /* r1 = __ __ 46 __ */
+    "por    "r4", "r0"\n"       /* r0 = 61 57 __ __ */
+    "pand   "r7", "r2"\n"       /* r2 = __ __ __ 40 */
+    "por    "r1", "r0"\n"       /* r0 = 61 57 46 __ */
+    "por    "r2", "r0"\n"       /* r0 = 61 57 46 40 = R11 */
+    "psllq  $16, "r3"\n"        /* r3 = 46 45 44 __ */
+    "movq   "r3", "r4"\n"       /* r4 = 46 45 44 __ */
+    "movq   "r5", "r2"\n"       /* r2 = 27 26 25 24 */
+    "movq   "r0", 112(%edx)\n"  /* write R11 = r0 */
+    "psrlq  $48, "r2"\n"        /* r2 = __ __ __ 27 */
+    "pand   "M(2)", "r4"\n"     /* r4 = __ 45 __ __ */
+    "por    "r2", "r6"\n"       /* r6 = 60 __ __ 27 */
+    "movq   "M(1)", "r2"\n"     /* r2 = __ __ FF __ */
+    "por    "r4", "r6"\n"       /* r6 = 60 45 __ 27 */
+    "pand   "r7", "r2"\n"       /* r2 = __ __ 41 __ */
+    "psllq  $32, "r3"\n"        /* r3 = 44 __ __ __ */
+    "por    80(%edx), "r3"\n"   /* r3 = 44 __ __ 23 */
+    "por    "r2", "r6"\n"       /* r6 = 60 45 41 27 = R10 */
+    "movq   "M(3)", "r2"\n"     /* r2 = FF __ __ __ */
+    "psllq  $16, "r5"\n"        /* r5 = 26 25 24 __ */
+    "movq   "r6", 96(%edx)\n"   /* store R10 = r6 */
+    "pand   "r5", "r2"\n"       /* r2 = 26 __ __ __ */
+    "movq   "M(2)", "r6"\n"     /* r6 = __ FF __ __ */
+    "pxor   "r2", "r5"\n"       /* r5 = __ 25 24 __ */
+    "pand   "r7", "r6"\n"       /* r6 = __ 42 __ __ */
+    "psrlq  $32, "r2"\n"        /* r2 = __ __ 26 __ */
+    "pand   "M(3)", "r7"\n"     /* r7 = 43 __ __ __ */
+    "por    "r2", "r3"\n"       /* r3 = 44 __ 26 23 */
+    "por    64(%edx), "r7"\n"   /* r7 = 43 __ __ 12 */
+    "por    "r3", "r6"\n"       /* r6 = 44 42 26 23 = R9 */
+    "por    "r5", "r7"\n"       /* r7 = 43 25 24 12 = R8 */
+    "movq   "r6", 80(%edx)\n"   /* store R9 = r6 */
+    "movq   "r7", 64(%edx)\n"   /* store R8 = r7 */
+    );
+    /* 123c  ( / 64 coeffs  < 2c / coeff) */
+#   undef M
+
+/* Done w/dequant + descramble + partial transpose; now do the idct itself. */
+
+#   define I( K)    MtoSTR(K*16(%edx))
+#   define J( K)    MtoSTR(((K - 4)*16)+8(%edx))
+
+    RowIDCT         /* 46 c */
+    Transpose       /* 19 c */
+
+#   undef I
+#   undef J
+#   define I( K)    MtoSTR((K*16)+64(%edx))
+#   define J( K)    MtoSTR(((K-4)*16)+72(%edx))
+
+    RowIDCT         /* 46 c */
+    Transpose       /* 19 c */
+
+#   undef I
+#   undef J
+#   define I( K)    MtoSTR((K * 16)(%edx))
+#   define J( K)    I( K)
+
+    ColumnIDCT      /* 57 c */
+
+#   undef I
+#   undef J
+#   define I( K)    MtoSTR((K*16)+8(%edx))
+#   define J( K)    I( K)
+
+    ColumnIDCT      /* 57 c */
+
+#   undef I
+#   undef J
+    /* 368 cycles  ( / 64 coeff  <  6 c / coeff) */
+
+    ASM("emms\n");
+}
+
+/**************************************************************************************
+ *
+ *      Routine:        MMX_idct10
+ *
+ *      Description:    Perform IDCT on a 8x8 block with at most 10 nonzero coefficients
+ *
+ *      Input:          Pointer to input and output buffer
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   The input coefficients are in transposed ZigZag order
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+/* --------------------------------------------------------------- */
+// This macro does four 4-sample one-dimensional idcts in parallel.  Inputs
+// 4 thru 7 are assumed to be zero.
+#define BeginIDCT_10 "#BeginIDCT_10\n"  \
+    "   movq    "I(3)","r2"\n"          \
+                                        \
+    "   movq    "C(3)","r6"\n"          \
+    "   movq    "r2","r4"\n"            \
+                                        \
+    "   movq    "C(5)","r1"\n"          \
+    "   pmulhw  "r6","r4"\n"            \
+                                        \
+    "   movq    "I(1)","r3"\n"          \
+    "   pmulhw  "r2","r1"\n"            \
+                                        \
+    "   movq    "C(1)","r0"\n"          \
+    "   paddw   "r2","r4"\n"            \
+                                        \
+    "   pxor    "r6","r6"\n"            \
+    "   paddw   "r1","r2"\n"            \
+                                        \
+    "   movq    "I(2)","r5"\n"          \
+    "   pmulhw  "r3","r0"\n"            \
+                                        \
+    "   movq    "r5","r1"\n"            \
+    "   paddw   "r3","r0"\n"            \
+                                        \
+    "   pmulhw  "C(7)","r3"\n"          \
+    "   psubsw  "r2","r6"\n"            \
+                                        \
+    "   pmulhw  "C(2)","r5"\n"          \
+    "   psubsw  "r4","r0"\n"            \
+                                        \
+    "   movq    "I(2)","r7"\n"          \
+    "   paddsw  "r4","r4"\n"            \
+                                        \
+    "   paddw   "r5","r7"\n"            \
+    "   paddsw  "r0","r4"\n"            \
+                                        \
+    "   pmulhw  "C(6)","r1"\n"          \
+    "   psubsw  "r6","r3"\n"            \
+                                        \
+    "   movq    "r4","I(1)"\n"          \
+    "   paddsw  "r6","r6"\n"            \
+                                        \
+    "   movq    "C(4)","r4"\n"          \
+    "   paddsw  "r3","r6"\n"            \
+                                        \
+    "   movq    "r3","r5"\n"            \
+    "   pmulhw  "r4","r3"\n"            \
+                                        \
+    "   movq    "r6","I(2)"\n"          \
+    "   movq    "r0","r2"\n"            \
+                                        \
+    "   movq    "I(0)","r6"\n"          \
+    "   pmulhw  "r4","r0"\n"            \
+                                        \
+    "   paddw   "r3","r5"\n"            \
+    "   paddw   "r0","r2"\n"            \
+                                        \
+    "   psubsw  "r1","r5"\n"            \
+    "   pmulhw  "r4","r6"\n"            \
+                                        \
+    "   paddw   "I(0)","r6"\n"          \
+    "   paddsw  "r1","r1"\n"            \
+                                        \
+    "   movq    "r6","r4"\n"            \
+    "   paddsw  "r5","r1"\n"            \
+                                        \
+    "   psubsw  "r2","r6"\n"            \
+    "   paddsw  "r2","r2"\n"            \
+                                        \
+    "   movq    "I(1)","r0"\n"          \
+    "   paddsw  "r6","r2"\n"            \
+                                        \
+    "   psubsw  "r1","r2"\n"            \
+    "#end BeginIDCT_10\n"
+// end BeginIDCT_10 macro (25 cycles).
+
+#define RowIDCT_10 ASM("\n"                                 \
+    "#RowIDCT_10\n"                                         \
+    BeginIDCT_10                                            \
+    "\n"                                                    \
+    "   movq    "I(2)","r3"\n"  /* r3 = D. */               \
+    "   psubsw  "r7","r4"\n"        /* r4 = E. = E - G */   \
+    "   paddsw  "r1","r1"\n"        /* r1 = H. + H. */      \
+    "   paddsw  "r7","r7"\n"        /* r7 = G + G */        \
+    "   paddsw  "r2","r1"\n"        /* r1 = R1 = A.. + H. */\
+    "   paddsw  "r4","r7"\n"        /* r7 = G. = E + G */   \
+    "   psubsw  "r3","r4"\n"        /* r4 = R4 = E. - D. */ \
+    "   paddsw  "r3","r3"\n"                                \
+    "   psubsw  "r5","r6"\n"        /* r6 = R6 = F. - B.. */\
+    "   paddsw  "r5","r5"\n"                                \
+    "   paddsw  "r4","r3"\n"        /* r3 = R3 = E. + D. */ \
+    "   paddsw  "r6","r5"\n"        /* r5 = R5 = F. + B.. */\
+    "   psubsw  "r0","r7"\n"        /* r7 = R7 = G. - C. */ \
+    "   paddsw  "r0","r0"\n"                                \
+    "   movq    "r1","I(1)"\n"  /* save R1 */               \
+    "   paddsw  "r7","r0"\n"        /* r0 = R0 = G. + C. */ \
+    "#end RowIDCT_10\n"										\
+);
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+// Column IDCT normalizes and stores final results.
+
+#define ColumnIDCT_10 ASM("\n"                          \
+    "#ColumnIDCT_10\n"                                  \
+    BeginIDCT_10                                        \
+    "\n"                                                \
+    "   paddsw  "Eight","r2"\n"                         \
+    "   paddsw  "r1","r1"\n"    /* r1 = H. + H. */      \
+    "   paddsw  "r2","r1"\n"    /* r1 = R1 = A.. + H. */\
+    "   psraw   ""$4"","r2"\n"      /* r2 = NR2 */      \
+    "   psubsw  "r7","r4"\n"    /* r4 = E. = E - G */   \
+    "   psraw   ""$4"","r1"\n"      /* r1 = NR1 */      \
+    "   movq    "I(2)","r3"\n"  /* r3 = D. */           \
+    "   paddsw  "r7","r7"\n"    /* r7 = G + G */        \
+    "   movq    "r2","I(2)"\n"  /* store NR2 at I2 */   \
+    "   paddsw  "r4","r7"\n"    /* r7 = G. = E + G */   \
+    "   movq    "r1","I(1)"\n"  /* store NR1 at I1 */   \
+    "   psubsw  "r3","r4"\n"    /* r4 = R4 = E. - D. */ \
+    "   paddsw  "Eight","r4"\n"                         \
+    "   paddsw  "r3","r3"\n"    /* r3 = D. + D. */      \
+    "   paddsw  "r4","r3"\n"    /* r3 = R3 = E. + D. */ \
+    "   psraw   ""$4"","r4"\n"      /* r4 = NR4 */      \
+    "   psubsw  "r5","r6"\n"    /* r6 = R6 = F. - B.. */\
+    "   psraw   ""$4"","r3"\n"      /* r3 = NR3 */      \
+    "   paddsw  "Eight","r6"\n"                         \
+    "   paddsw  "r5","r5"\n"    /* r5 = B.. + B.. */    \
+    "   paddsw  "r6","r5"\n"    /* r5 = R5 = F. + B.. */\
+    "   psraw   ""$4"","r6"\n"      /* r6 = NR6 */      \
+    "   movq    "r4","J(4)"\n"  /* store NR4 at J4 */   \
+    "   psraw   ""$4"","r5"\n"      /* r5 = NR5 */      \
+    "   movq    "r3","I(3)"\n"  /* store NR3 at I3 */   \
+    "   psubsw  "r0","r7"\n"    /* r7 = R7 = G. - C. */ \
+    "   paddsw  "Eight","r7"\n"                         \
+    "   paddsw  "r0","r0"\n"    /* r0 = C. + C. */      \
+    "   paddsw  "r7","r0"\n"    /* r0 = R0 = G. + C. */ \
+    "   psraw   ""$4"","r7"\n"      /* r7 = NR7 */      \
+    "   movq    "r6","J(6)"\n"  /* store NR6 at J6 */   \
+    "   psraw   ""$4"","r0"\n"      /* r0 = NR0 */      \
+    "   movq    "r5","J(5)"\n"  /* store NR5 at J5 */   \
+                                                        \
+    "   movq    "r7","J(7)"\n"  /* store NR7 at J7 */   \
+                                                        \
+    "   movq    "r0","I(0)"\n"  /* store NR0 at I0 */   \
+    "#end ColumnIDCT_10\n"								\
+);
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+/* --------------------------------------------------------------- */
+
+
+/* --------------------------------------------------------------- */
+/* IDCT 10 */
+void IDct10__mmx( Q_LIST_ENTRY * InputData,
+             ogg_int16_t *QuantMatrix,
+             ogg_int16_t * OutputData ) {
+
+#   define MIDM(M,I)    MtoSTR(M+I*8(%ecx))
+#   define M(I)         MIDM( MaskOffset , I )
+#   define MIDC(M,I)    MtoSTR(M+(I-1)*8(%ecx))
+#   define C(I)         MIDC( CosineOffset , I )
+#   define MIDEight(M)  MtoSTR(M(%ecx))
+#   define Eight        MIDEight(EightOffset)
+
+#   define r0   "%mm0"
+#   define r1   "%mm1"
+#   define r2   "%mm2"
+#   define r3   "%mm3"
+#   define r4   "%mm4"
+#   define r5   "%mm5"
+#   define r6   "%mm6"
+#   define r7   "%mm7"
+
+    __asm__ __volatile__ (
+    /* eax = quantized input */
+    /* esi = quantization table */
+    /* edx = destination (= idct buffer) */
+    /* ecx = idctconstants */
+    ""
+    :
+    :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
+    );
+
+    ASM(
+    "movq   (%eax), "r0"\n"
+    "pmullw (%esi), "r0"\n"     /* r0 = 03 02 01 00 */
+    "movq   16(%eax), "r1"\n"
+    "pmullw 16(%esi), "r1"\n"   /* r1 = 13 12 11 10 */
+    "movq   "M(0)", "r2"\n"     /* r2 = __ __ __ FF */
+    "movq   "r0", "r3"\n"       /* r3 = 03 02 01 00 */
+    "movq   8(%eax), "r4"\n"
+    "psrlq  $16, "r0"\n"        /* r0 = __ 03 02 01 */
+    "pmullw 8(%esi), "r4"\n"    /* r4 = 07 06 05 04 */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 00 */
+    "movq   "r0", "r5"\n"       /* r5 = __ 03 02 01 */
+    "pand   "r2", "r5"\n"       /* r5 = __ __ __ 01 */
+    "psllq  $32, "r1"\n"        /* r1 = 11 10 __ __ */
+    "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
+    "pxor   "r5", "r0"\n"       /* r0 = __ 03 02 __ */
+    "pand   "r1", "r7"\n"       /* r7 = 11 __ __ __ */
+    "por    "r3", "r0"\n"       /* r0 = __ 03 02 00 */
+    "pxor   "r7", "r1"\n"       /* r1 = __ 10 __ __ */
+    "por    "r7", "r0"\n"       /* r0 = 11 03 02 00 = R0 */
+    "movq   "r4", "r3"\n"       /* r3 = 07 06 05 04 */
+    "movq   "r0", (%edx)\n"     /* write R0 = r0 */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 04 */
+    "psllq  $16, "r3"\n"        /* r3 = __ __ 04 __ */
+    "por    "r3", "r5"\n"       /* r5 = __ __ 04 01 */
+    "por    "r5", "r1"\n"       /* r1 = __ 10 04 01 = R1 */
+    "psrlq  $16, "r4"\n"        /* r4 = __ 07 06 05 */
+    "movq   "r1", 16(%edx)\n"   /* write R1 = r1 */
+    "movq   "r4", "r5"\n"       /* r5 = __ 07 06 05 */
+    "psrlq  $16, "r4"\n"        /* r4 = __ __ 07 06 */
+    "movq   "r2", "r6"\n"       /* r6 = __ __ __ FF */
+    "pand   "r2", "r5"\n"       /* r5 = __ __ __ 05 */
+    "pand   "r4", "r6"\n"       /* r6 = __ __ __ 06 */
+    "pxor   "r6", "r4"\n"       /* r4 = __ __ 07 __ */
+    "por    "r5", "r4"\n"       /* r4 = __ __ 07 05 */
+    "movq   "r4", 32(%edx)\n"   /* write R2 = r4 */
+    "movq   "r6", 48(%edx)\n"   /* write R3 = r6 */
+    );
+#   undef M
+
+/* Done w/dequant + descramble + partial transpose; now do the idct itself. */
+
+#   define I( K)    MtoSTR((K*16)(%edx))
+#   define J( K)    MtoSTR(((K - 4) * 16)+8(%edx))
+
+    RowIDCT_10      /* 33 c */
+    Transpose       /* 19 c */
+
+#   undef I
+#   undef J
+//# define I( K)    [edx + (  K      * 16) + 64]
+//# define J( K)    [edx + ( (K - 4) * 16) + 72]
+
+//  RowIDCT         ; 46 c
+//  Transpose       ; 19 c
+
+//# undef I
+//# undef J
+#   define I( K)    MtoSTR((K * 16)(%edx))
+#   define J( K)    I( K)
+
+    ColumnIDCT_10       /* 44 c */
+
+#   undef I
+#   undef J
+#   define I( K)    MtoSTR((K * 16)+8(%edx))
+#   define J( K)    I( K)
+
+    ColumnIDCT_10       /* 44 c */
+
+#   undef I
+#   undef J
+
+    ASM("emms\n");
+}
+
+/**************************************************************************************
+ *
+ *      Routine:        MMX_idct3
+ *
+ *      Description:    Perform IDCT on a 8x8 block with at most 3 nonzero coefficients
+ *
+ *      Input:          Pointer to input and output buffer
+ *
+ *      Output:         None
+ *
+ *      Return:         None
+ *
+ *      Special Note:   Only works for three nonzero coefficients.
+ *
+ *      Error:          None
+ *
+ ***************************************************************************************
+ */
+/***************************************************************************************
+    In IDCT 3, we are dealing with only three Non-Zero coefficients in the 8x8 block.
+    In the case that we work in the fashion RowIDCT -> ColumnIDCT, we only have to
+    do 1-D row idcts on the first two rows, the rest six rows remain zero anyway.
+    After row IDCTs, since every column could have nonzero coefficients, we need do
+    eight 1-D column IDCT. However, for each column, there are at most two nonzero
+    coefficients, coefficient 0 and coefficient 1. Same for the coefficents for the
+    two 1-d row idcts. For this reason, the process of a 1-D IDCT is simplified
+
+    from a full version:
+
+    A = (C1 * I1) + (C7 * I7)       B = (C7 * I1) - (C1 * I7)
+    C = (C3 * I3) + (C5 * I5)       D = (C3 * I5) - (C5 * I3)
+    A. = C4 * (A - C)               B. = C4 * (B - D)
+    C. = A + C                      D. = B + D
+
+    E = C4 * (I0 + I4)              F = C4 * (I0 - I4)
+    G = (C2 * I2) + (C6 * I6)       H = (C6 * I2) - (C2 * I6)
+    E. = E - G
+    G. = E + G
+
+    A.. = F + A.                    B.. = B. - H
+    F.  = F - A.                    H.  = B. + H
+
+    R0 = G. + C.    R1 = A.. + H.   R3 = E. + D.    R5 = F. + B..
+    R7 = G. - C.    R2 = A.. - H.   R4 = E. - D.    R6 = F. - B..
+
+    To:
+
+
+    A = (C1 * I1)                   B = (C7 * I1)
+    C = 0                           D = 0
+    A. = C4 * A                     B. = C4 * B
+    C. = A                          D. = B
+
+    E = C4 * I0                     F = E
+    G = 0                           H = 0
+    E. = E
+    G. = E
+
+    A.. = E + A.                    B.. = B.
+    F.  = E - A.                    H.  = B.
+
+    R0 = E + A      R1 = E + A. + B.    R3 = E + B      R5 = E - A. + B.
+    R7 = E - A      R2 = E + A. - B.    R4 = E - B      R6 = F - A. - B.
+
+******************************************************************************************/
+
+#define RowIDCT_3 ASM("\n"\
+    "#RowIDCT_3\n"\
+    "   movq        "I(1)","r7"\n"  /* r7 = I1                      */  \
+    "   movq        "C(1)","r0"\n"  /* r0 = C1                      */  \
+    "   movq        "C(7)","r3"\n"  /* r3 = C7                      */  \
+    "   pmulhw      "r7","r0"\n"    /* r0 = C1 * I1 - I1            */  \
+    "   pmulhw      "r7","r3"\n"    /* r3 = C7 * I1 = B, D.         */  \
+    "   movq        "I(0)","r6"\n"  /* r6 = I0                      */  \
+    "   movq        "C(4)","r4"\n"  /* r4 = C4                      */  \
+    "   paddw       "r7","r0"\n"    /* r0 = C1 * I1 = A, C.         */  \
+    "   movq        "r6","r1"\n"    /* make a copy of I0            */  \
+    "   pmulhw      "r4","r6"\n"    /* r2 = C4 * I0 - I0            */  \
+    "   movq        "r0","r2"\n"    /* make a copy of A             */  \
+    "   movq        "r3","r5"\n"    /* make a copy of B             */  \
+    "   pmulhw      "r4","r2"\n"    /* r2 = C4 * A - A              */  \
+    "   pmulhw      "r4","r5"\n"    /* r5 = C4 * B - B              */  \
+    "   paddw       "r1","r6"\n"    /* r2 = C4 * I0 = E, F          */  \
+    "   movq        "r6","r4"\n"    /* r4 = E                       */  \
+    "   paddw       "r0","r2"\n"    /* r2 = A.                      */  \
+    "   paddw       "r3","r5"\n"    /* r5 = B.                      */  \
+    "   movq        "r6","r7"\n"    /* r7 = E                       */  \
+    "   movq        "r5","r1"\n"    /* r1 = B.                      */  \
+    /*  r0 = A      */   \
+    /*  r3 = B      */   \
+    /*  r2 = A.     */   \
+    /*  r5 = B.     */   \
+    /*  r6 = E      */   \
+    /*  r4 = E      */   \
+    /*  r7 = E      */   \
+    /*  r1 = B.     */   \
+    "   psubw       "r2","r6"\n"    /* r6 = E - A.                  */  \
+    "   psubw       "r3","r4"\n"    /* r4 = E - B ----R4            */  \
+    "   psubw       "r0","r7"\n"    /* r7 = E - A ----R7            */  \
+    "   paddw       "r2","r2"\n"    /* r2 = A. + A.                 */  \
+    "   paddw       "r3","r3"\n"    /* r3 = B + B                   */  \
+    "   paddw       "r0","r0"\n"    /* r0 = A + A                   */  \
+    "   paddw       "r6","r2"\n"    /* r2 = E + A.                  */  \
+    "   paddw       "r4","r3"\n"    /* r3 = E + B ----R3            */  \
+    "   psubw       "r1","r2"\n"    /* r2 = E + A. - B. ----R2      */  \
+    "   psubw       "r5","r6"\n"    /* r6 = E - A. - B. ----R6      */  \
+    "   paddw       "r1","r1"\n"    /* r1 = B. + B.                 */  \
+    "   paddw       "r5","r5"\n"    /* r5 = B. + B.                 */  \
+    "   paddw       "r7","r0"\n"    /* r0 = E + A ----R0            */  \
+    "   paddw       "r2","r1"\n"    /* r1 = E + A. + B. -----R1     */  \
+    "   movq        "r1","I(1)"\n"  /* save r1                      */  \
+    "   paddw       "r6","r5"\n"    /* r5 = E - A. + B. -----R5     */  \
+    "#end RowIDCT_3\n"\
+);
+//End of RowIDCT_3
+
+#define ColumnIDCT_3 ASM("\n"\
+    "#ColumnIDCT_3\n"\
+    "   movq        "I(1)","r7"\n"  /* r7 = I1                      */  \
+    "   movq        "C(1)","r0"\n"  /* r0 = C1                      */  \
+    "   movq        "C(7)","r3"\n"  /* r3 = C7                      */  \
+    "   pmulhw      "r7","r0"\n"    /* r0 = C1 * I1 - I1            */  \
+    "   pmulhw      "r7","r3"\n"    /* r3 = C7 * I1 = B, D.         */  \
+    "   movq        "I(0)","r6"\n"  /* r6 = I0                      */  \
+    "   movq        "C(4)","r4"\n"  /* r4 = C4                      */  \
+    "   paddw       "r7","r0"\n"    /* r0 = C1 * I1 = A, C.         */  \
+    "   movq        "r6","r1"\n"    /* make a copy of I0            */  \
+    "   pmulhw      "r4","r6"\n"    /* r2 = C4 * I0 - I0            */  \
+    "   movq        "r0","r2"\n"    /* make a copy of A             */  \
+    "   movq        "r3","r5"\n"    /* make a copy of B             */  \
+    "   pmulhw      "r4","r2"\n"    /* r2 = C4 * A - A              */  \
+    "   pmulhw      "r4","r5"\n"    /* r5 = C4 * B - B              */  \
+    "   paddw       "r1","r6"\n"    /* r2 = C4 * I0 = E, F          */  \
+    "   movq        "r6","r4"\n"    /* r4 = E                       */  \
+    "   paddw       "Eight","r6"\n" /* +8 for shift                 */  \
+    "   paddw       "Eight","r4"\n" /* +8 for shift                 */  \
+    "   paddw       "r0","r2"\n"    /* r2 = A.                      */  \
+    "   paddw       "r3","r5"\n"    /* r5 = B.                      */  \
+    "   movq        "r6","r7"\n"    /* r7 = E                       */  \
+    "   movq        "r5","r1"\n"    /* r1 = B.                      */  \
+/*  r0 = A      */   \
+/*  r3 = B      */   \
+/*  r2 = A.     */   \
+/*  r5 = B.     */   \
+/*  r6 = E      */   \
+/*  r4 = E      */   \
+/*  r7 = E      */   \
+/*  r1 = B.     */   \
+    "   psubw       "r2","r6"\n"    /* r6 = E - A.                  */  \
+    "   psubw       "r3","r4"\n"    /* r4 = E - B ----R4            */  \
+    "   psubw       "r0","r7"\n"    /* r7 = E - A ----R7            */  \
+    "   paddw       "r2","r2"\n"    /* r2 = A. + A.                 */  \
+    "   paddw       "r3","r3"\n"    /* r3 = B + B                   */  \
+    "   paddw       "r0","r0"\n"    /* r0 = A + A                   */  \
+    "   paddw       "r6","r2"\n"    /* r2 = E + A.                  */  \
+    "   paddw       "r4","r3"\n"    /* r3 = E + B ----R3            */  \
+    "   psraw        $4,"r4"\n"     /* shift                        */  \
+    "   movq        "r4","J(4)"\n"  /* store R4 at J4               */  \
+    "   psraw       $4,"r3"\n"      /* shift                        */  \
+    "   movq        "r3","I(3)"\n"  /* store R3 at I3               */  \
+    "   psubw       "r1","r2"\n"    /* r2 = E + A. - B. ----R2      */  \
+    "   psubw       "r5","r6"\n"    /* r6 = E - A. - B. ----R6      */  \
+    "   paddw       "r1","r1"\n"    /* r1 = B. + B.                 */  \
+    "   paddw       "r5","r5"\n"    /* r5 = B. + B.                 */  \
+    "   paddw       "r7","r0"\n"    /* r0 = E + A ----R0            */  \
+    "   paddw       "r2","r1"\n"    /* r1 = E + A. + B. -----R1     */  \
+    "   psraw       $4,"r7"\n"      /* shift                        */  \
+    "   psraw       $4,"r2"\n"      /* shift                        */  \
+    "   psraw       $4,"r0"\n"      /* shift                        */  \
+    "   psraw       $4,"r1"\n"      /* shift                        */  \
+    "   movq        "r7","J(7)"\n"  /* store R7 to J7               */  \
+    "   movq        "r0","I(0)"\n"  /* store R0 to I0               */  \
+    "   movq        "r1","I(1)"\n"  /* store R1 to I1               */  \
+    "   movq        "r2","I(2)"\n"  /* store R2 to I2               */  \
+    "   movq        "r1","I(1)"\n"  /* save r1                      */  \
+    "   paddw       "r6","r5"\n"    /* r5 = E - A. + B. -----R5     */  \
+    "   psraw       $4,"r5"\n"      /* shift                        */  \
+    "   movq        "r5","J(5)"\n"  /* store R5 at J5               */  \
+    "   psraw       $4,"r6"\n"      /* shift                        */  \
+    "   movq        "r6","J(6)"\n"  /* store R6 at J6               */  \
+    "#end ColumnIDCT_3\n"\
+);
+//End of ColumnIDCT_3
+
+void IDct3__mmx( Q_LIST_ENTRY * InputData,
+            ogg_int16_t *QuantMatrix,
+            ogg_int16_t * OutputData ) {
+
+#   define MIDM(M,I)    MtoSTR(M+I*8(%ecx))
+#   define M(I)         MIDM( MaskOffset , I )
+#   define MIDC(M,I)    MtoSTR(M+(I-1)*8(%ecx))
+#   define C(I)         MIDC( CosineOffset , I )
+#   define MIDEight(M)  MtoSTR(M(%ecx))
+#   define Eight        MIDEight(EightOffset)
+
+#   define r0   "%mm0"
+#   define r1   "%mm1"
+#   define r2   "%mm2"
+#   define r3   "%mm3"
+#   define r4   "%mm4"
+#   define r5   "%mm5"
+#   define r6   "%mm6"
+#   define r7   "%mm7"
+
+    __asm__ __volatile__ (
+    /* eax = quantized input */
+    /* esi = quantization table */
+    /* edx = destination (= idct buffer) */
+    /* ecx = idctconstants */
+    ""
+    :
+    :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
+    );
+
+    ASM(
+    "movq   (%eax), "r0"\n"     
+    "pmullw (%esi), "r0"\n"     /* r0 = 03 02 01 00 */
+    "movq   "M(0)", "r2"\n"     /* r2 = __ __ __ FF */
+    "movq   "r0", "r3"\n"       /* r3 = 03 02 01 00 */
+    "psrlq  $16, "r0"\n"        /* r0 = __ 03 02 01 */
+    "pand   "r2", "r3"\n"       /* r3 = __ __ __ 00 */
+    "movq   "r0", "r5"\n"       /* r5 = __ 03 02 01 */
+    "pand   "r2", "r5"\n"       /* r5 = __ __ __ 01 */
+    "pxor   "r5", "r0"\n"       /* r0 = __ 03 02 __ */
+    "por    "r3", "r0"\n"       /* r0 = __ 03 02 00 */
+    "movq   "r0", (%edx)\n"     /* write R0 = r0 */
+    "movq   "r5", 16(%edx)\n"   /* write R1 = r5 */
+    );
+#   undef M
+
+/* Done partial transpose; now do the idct itself. */
+
+#   define I( K)    MtoSTR(K*16(%edx))
+#   define J( K)    MtoSTR(((K - 4)*16)+8(%edx))
+
+    RowIDCT_3       /* 33 c */
+    Transpose       /* 19 c */
+
+#   undef I
+#   undef J
+//# define I( K)    [edx + (  K      * 16) + 64]
+//# define J( K)    [edx + ( (K - 4) * 16) + 72]
+
+//  RowIDCT         ; 46 c
+//  Transpose       ; 19 c
+
+//# undef I
+//# undef J
+#   define I( K)    MtoSTR((K * 16)(%edx))
+#   define J( K)    I( K)
+
+    ColumnIDCT_3    /* 44 c */
+
+#   undef I
+#   undef J
+#   define I( K)    MtoSTR((K*16)+8(%edx))
+#   define J( K)    I( K)
+
+    ColumnIDCT_3    /* 44 c */
+
+#   undef I
+#   undef J
+
+    ASM("emms\n");
+}
+
+
+/* install our implementation in the function table */
+void dsp_mmx_idct_init(DspFunctions *funcs)
+{
+  TH_DEBUG("enabling accelerated x86_32 mmx idct functions.\n");
+  funcs->IDctSlow = IDctSlow__mmx;
+  funcs->IDct10 = IDct10__mmx;
+  funcs->IDct3 = IDct3__mmx;
+}
+
+#endif /* USE_ASM */

Added: trunk/theora/lib/x86_64/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/x86_64/dct_decode_mmx.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/x86_64/dct_decode_mmx.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+/* nothing implemented right now */
+void dsp_mmx_dct_decode_init(DspFunctions *funcs)
+{
+}
+
+#endif /* USE_ASM */

Added: trunk/theora/lib/x86_64/idct_mmx.c
===================================================================
--- trunk/theora/lib/x86_64/idct_mmx.c	2007-04-01 13:39:37 UTC (rev 12825)
+++ trunk/theora/lib/x86_64/idct_mmx.c	2007-04-02 21:08:28 UTC (rev 12826)
@@ -0,0 +1,27 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
+
+ ********************************************************************/
+
+#include "codec_internal.h"
+
+#if defined(USE_ASM)
+
+/* nothing implemented right now */
+void dsp_mmx_idct_init(DspFunctions *funcs)
+{
+}
+
+#endif /* USE_ASM */