[xiph-commits] r11675 - branches/theora-playtime/lib/x86_32_vs

illiminable at svn.xiph.org illiminable at svn.xiph.org
Thu Jun 29 15:08:36 PDT 2006


Author: illiminable
Date: 2006-06-29 15:08:27 -0700 (Thu, 29 Jun 2006)
New Revision: 11675

Modified:
   branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
   branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
   branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.h
Log:
* Instrument the mmx versions of dsp
* Decrackulate the perf function, max is a stupid metric
* sub8x8_sse2 faster than mmx : best case 62 vs 70 cycles

Modified: branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c	2006-06-29 21:50:53 UTC (rev 11674)
+++ branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c	2006-06-29 22:08:27 UTC (rev 11675)
@@ -48,7 +48,7 @@
     static __declspec(align(16)) unsigned char temp[128];
     static unsigned char* temp_ptr = temp; 
 
-  PERF_BLOCK_START();
+  //PERF_BLOCK_START();
     __asm {
         align           16
         mov             esi, PixelPtr
@@ -153,7 +153,7 @@
         pop             ebp
     }
     
-	PERF_BLOCK_END("filter horiz sse2", filter_horiz_perf, 10000);
+	//PERF_BLOCK_END("filter horiz sse2", filter_horiz_perf, 10000);
 #endif
 }
 

Modified: branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c	2006-06-29 21:50:53 UTC (rev 11674)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c	2006-06-29 22:08:27 UTC (rev 11675)
@@ -32,9 +32,7 @@
 static const ogg_int64_t V128 = 0x0080008000800080LL;
 
 
-static unsigned __int64 perf_sad8x8_time;
-static unsigned __int64 perf_sad8x8_count;
-static unsigned __int64 perf_sad8x8_min;
+static perf_info sub8x8_mmx_perf;
 
 static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                   ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
@@ -62,6 +60,7 @@
     DctInputPtr += 8;
   }
 #else
+	PERF_BLOCK_START();
     __asm {
         align 16
 
@@ -245,6 +244,8 @@
      
 
     };
+
+	PERF_BLOCK_END("sub8x8 mmx", sub8x8_mmx_perf, 10000);
  
 #endif
 }
@@ -1611,9 +1612,7 @@
   funcs->inter8x8_err = inter8x8_err__mmx;
   funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
 
-  perf_sad8x8_time = 0;
- perf_sad8x8_count = 0;
-perf_sad8x8_min = -1;
+  ClearPerfData(&sub8x8_mmx_perf);
 }
 
 

Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-29 21:50:53 UTC (rev 11674)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-29 22:08:27 UTC (rev 11675)
@@ -26,9 +26,7 @@
 
 
 
-//static unsigned __int64 perf_sad8x8_time;
-//static unsigned __int64 perf_sad8x8_count;
-//static unsigned __int64 perf_sad8x8_min;
+static perf_info sub8x8_sse2_perf;
 
 
 
@@ -63,7 +61,7 @@
     DctInputPtr += 8;
   }
 #else
-
+	PERF_BLOCK_START();
     __asm {
         align 16
 
@@ -194,7 +192,9 @@
 
     };
 
+	PERF_BLOCK_END("sub8x8 sse2", sub8x8_sse2_perf, 10000);
 
+
 #endif
 }
 
@@ -1636,7 +1636,7 @@
 void dsp_sse2_init(DspFunctions *funcs)
 {
   TH_DEBUG("enabling accelerated x86_32 sse2 dsp functions.\n");
-  //funcs->sub8x8 = sub8x8__sse2;
+  funcs->sub8x8 = sub8x8__sse2;
   //funcs->sub8x8_128 = sub8x8_128__sse2;
   //funcs->sub8x8avg2 = sub8x8avg2__sse2;
   //funcs->row_sad8 = row_sad8__sse2;
@@ -1654,8 +1654,10 @@
   //funcs->inter8x8_err = inter8x8_err__sse2;
   //funcs->inter8x8_err_xy2 = inter8x8_err_xy2__sse2;
 
+  ClearPerfData(&sub8x8_sse2_perf);
 
 
+
   
 
 }

Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.c	2006-06-29 21:50:53 UTC (rev 11674)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.c	2006-06-29 22:08:27 UTC (rev 11675)
@@ -24,5 +24,4 @@
 	inoutData->sum = 0;
 	inoutData->count = 0;
 	inoutData->min = (unsigned __int64)-1;
-	inoutData->max = 0;
 }

Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-29 21:50:53 UTC (rev 11674)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-29 22:08:27 UTC (rev 11675)
@@ -11,8 +11,8 @@
     unsigned __int64 sum;
     unsigned __int64 count;
     unsigned __int64 min;
-	unsigned __int64 max;
 
+
 } perf_info;
 
 
@@ -27,12 +27,11 @@
 #define PERF_BLOCK_END(s, perf, z)                                                               \
         perf_temp = (GetCPUTime() - perf_start_time[--depth]);                                      \
         (perf.min) = ((perf.min) > perf_temp) ? perf_temp : (perf.min);                                                  \
-		(perf.max) = ((perf.max) > perf_temp) ? (perf.max) : perf_temp;                                  \
         perf.sum += perf_temp;                                                                             \
         (perf.count)++;                                                                                      \
   if (((perf.count) % (z)) == 0)                                                                             \
   {                                                                                                 \
-    printf(s " - %lld from %lld iterations -- @%lld cycles -- min(%lld) -- max(%lld)\n", perf.sum, perf.count, (perf.sum) / (perf.count), perf.min, perf.max);    \
+    printf(s " - %lld from %lld iterations -- @%lld cycles -- min(%lld)\n", perf.sum, perf.count, (perf.sum) / (perf.count), perf.min);    \
   }                
 
 #else



More information about the commits mailing list