[xiph-commits] r11560 - in branches/theora-playtime: lib
lib/x86_32_vs win32/VS2005/encoder_example win32/VS2005/libtheora
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Sun Jun 11 10:57:25 PDT 2006
Author: illiminable
Date: 2006-06-11 10:56:24 -0700 (Sun, 11 Jun 2006)
New Revision: 11560
Added:
branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
Modified:
branches/theora-playtime/lib/codec_internal.h
branches/theora-playtime/lib/dct_decode.c
branches/theora-playtime/lib/dct_encode.c
branches/theora-playtime/lib/dsp.c
branches/theora-playtime/lib/dsp.h
branches/theora-playtime/lib/encoder_toplevel.c
branches/theora-playtime/lib/idct.c
branches/theora-playtime/lib/quant.c
branches/theora-playtime/lib/reconstruct.c
branches/theora-playtime/lib/scan.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.h
branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Force alignment of arrays in pb_instance, and alignment of all structs containing pb_instances
* Add a macro, so you can have asm enabled, but sse2 forcibly disabled, i.e. just mmx if it's available
* sse2 implementation of quantize (still slightly incorrect, and not that much faster)
* Alignments of dct_codes
Modified: branches/theora-playtime/lib/codec_internal.h
===================================================================
--- branches/theora-playtime/lib/codec_internal.h 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/codec_internal.h 2006-06-11 17:56:24 UTC (rev 11560)
@@ -453,9 +453,9 @@
ogg_int16_t *DequantBuffer;
- ogg_int32_t fp_quant_InterUV_coeffs[64];
- ogg_int32_t fp_quant_InterUV_round[64];
- ogg_int32_t fp_ZeroBinSize_InterUV[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_InterUV_coeffs[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_InterUV_round[64];
+ __declspec(align(16)) ogg_int32_t fp_ZeroBinSize_InterUV[64];
ogg_int16_t *TmpReconBuffer;
ogg_int16_t *TmpDataBuffer;
@@ -467,12 +467,12 @@
/* Dequantiser and rounding tables */
ogg_uint32_t QThreshTable[Q_TABLE_SIZE];
Q_LIST_ENTRY DcScaleFactorTable[Q_TABLE_SIZE];
- Q_LIST_ENTRY Y_coeffs[64];
- Q_LIST_ENTRY U_coeffs[64];
- Q_LIST_ENTRY V_coeffs[64];
- Q_LIST_ENTRY InterY_coeffs[64];
- Q_LIST_ENTRY InterU_coeffs[64];
- Q_LIST_ENTRY InterV_coeffs[64];
+ __declspec(align(16)) Q_LIST_ENTRY Y_coeffs[64];
+ __declspec(align(16)) Q_LIST_ENTRY U_coeffs[64];
+ __declspec(align(16)) Q_LIST_ENTRY V_coeffs[64];
+ __declspec(align(16)) Q_LIST_ENTRY InterY_coeffs[64];
+ __declspec(align(16)) Q_LIST_ENTRY InterU_coeffs[64];
+ __declspec(align(16)) Q_LIST_ENTRY InterV_coeffs[64];
Q_LIST_ENTRY *dequant_Y_coeffs;
Q_LIST_ENTRY *dequant_U_coeffs;
Q_LIST_ENTRY *dequant_V_coeffs;
@@ -480,10 +480,10 @@
Q_LIST_ENTRY *dequant_InterU_coeffs;
Q_LIST_ENTRY *dequant_InterV_coeffs;
Q_LIST_ENTRY *dequant_coeffs; /* currently active quantizer */
- unsigned int zigzag_index[64];
- ogg_int32_t quant_Y_coeffs[64];
- ogg_int32_t quant_UV_coeffs[64];
- ogg_int32_t fp_quant_Y_coeffs[64]; /* used in reiniting quantizers */
+ __declspec(align(16)) unsigned int zigzag_index[64];
+ __declspec(align(16)) ogg_int32_t quant_Y_coeffs[64];
+ __declspec(align(16)) ogg_int32_t quant_UV_coeffs[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_Y_coeffs[64]; /* used in reiniting quantizers */
HUFF_ENTRY *HuffRoot_VP3x[NUM_HUFF_TABLES];
ogg_uint32_t *HuffCodeArray_VP3x[NUM_HUFF_TABLES];
@@ -491,14 +491,14 @@
const unsigned char *ExtraBitLengths_VP3x;
/* Quantiser and rounding tables */
- ogg_int32_t fp_quant_UV_coeffs[64];
- ogg_int32_t fp_quant_Inter_coeffs[64];
- ogg_int32_t fp_quant_Y_round[64];
- ogg_int32_t fp_quant_UV_round[64];
- ogg_int32_t fp_quant_Inter_round[64];
- ogg_int32_t fp_ZeroBinSize_Y[64];
- ogg_int32_t fp_ZeroBinSize_UV[64];
- ogg_int32_t fp_ZeroBinSize_Inter[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_UV_coeffs[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_Inter_coeffs[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_Y_round[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_UV_round[64];
+ __declspec(align(16)) ogg_int32_t fp_quant_Inter_round[64];
+ __declspec(align(16)) ogg_int32_t fp_ZeroBinSize_Y[64];
+ __declspec(align(16)) ogg_int32_t fp_ZeroBinSize_UV[64];
+ __declspec(align(16)) ogg_int32_t fp_ZeroBinSize_Inter[64];
ogg_int32_t *fquant_coeffs;
ogg_int32_t *fquant_round;
ogg_int32_t *fquant_ZbSize;
@@ -688,8 +688,12 @@
/* instances (used for reconstructing buffers and to hold tokens etc.) */
PP_INSTANCE pp; /* preprocessor */
- PB_INSTANCE pb; /* playback */
+ /* ZEN::: Since there are things in PB_INSTANCE that are 16 byte aligned, we
+ have to make sure the base address is also 16 byte aligned or the alignment
+ won't work */
+ __declspec(align(16))PB_INSTANCE pb; /* playback */
+
/* ogg bitpacker for use in packet coding, other API state */
oggpack_buffer *oggbuffer;
#ifdef LIBOGG2 /* Remember, this is just until we drop libogg1 */
Modified: branches/theora-playtime/lib/dct_decode.c
===================================================================
--- branches/theora-playtime/lib/dct_decode.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dct_decode.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -1242,9 +1242,10 @@
//if (cpu_flags & CPU_X86_MMX) {
// dsp_mmx_idct_init(funcs);
//}
-
+#ifndef USE_NO_SSE2
if (cpu_flags & CPU_X86_SSE2) {
dsp_sse2_dct_decode_init(funcs);
}
#endif
+#endif
}
Modified: branches/theora-playtime/lib/dct_encode.c
===================================================================
--- branches/theora-playtime/lib/dct_encode.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dct_encode.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -450,7 +450,7 @@
dsp_fdct_short(cpi->dsp, cpi->DCTDataBuffer, cpi->DCT_codes );
/* Quantize that transform data. */
- quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
+ dsp_quant_quantize (cpi->dsp, &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
if ( (cpi->pb.CodingMode == CODE_INTER_NO_MV) &&
( AllZeroDctData(cpi->pb.QFragData[FragIndex]) ) ) {
Modified: branches/theora-playtime/lib/dsp.c
===================================================================
--- branches/theora-playtime/lib/dsp.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dsp.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -412,6 +412,7 @@
dsp_idct_init (funcs, cpuflags);
dsp_dct_decode_init(funcs, cpuflags);
dsp_scan_init(funcs, cpuflags);
+ dsp_quant_init(funcs, cpuflags);
#if defined(USE_ASM)
if (cpuflags & CPU_X86_MMX) {
dsp_mmx_init(funcs);
@@ -423,12 +424,12 @@
dsp_mmxext_init(funcs);
}
# endif
-
+#ifndef USE_NO_SSE2
if (cpuflags & CPU_X86_SSE2) {
dsp_sse2_init(funcs);
}
+#endif
-
#endif
}
Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/dsp.h 2006-06-11 17:56:24 UTC (rev 11560)
@@ -22,7 +22,10 @@
struct PP_INSTANCE;
+struct PB_INSTANCE;
+//typedef ogg_int16_t Q_LIST_ENTRY;
+
typedef unsigned long int ogg_uint64_t;
typedef struct
@@ -96,12 +99,12 @@
ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData );
- void (*IDctSlow)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
+ void (*IDctSlow)( ogg_int16_t /*Q_LIST_ENTRY*/ * InputData,
ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData );
void (*dequant_slow)( ogg_int16_t * dequant_coeffs,
- ogg_int16_t * quantized_list,
+ ogg_int16_t /*Q_LIST_ENTRY*/ * quantized_list,
ogg_int32_t * DCT_block);
/* dct_decode */
@@ -114,18 +117,24 @@
ogg_int32_t *BoundingValuePtr);
- void (*RowDiffScan)( struct PP_INSTANCE *ppi,
- unsigned char * YuvPtr1,
- unsigned char * YuvPtr2,
- ogg_int16_t * YUVDiffsPtr,
- unsigned char * bits_map_ptr,
- signed char * SgcPtr,
- signed char * DispFragPtr,
- unsigned char * FDiffPixels,
- ogg_int32_t * RowDiffsPtr,
+ /* Scan */
+ void (*RowDiffScan)( struct PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr,
+ signed char * DispFragPtr,
+ unsigned char * FDiffPixels,
+ ogg_int32_t * RowDiffsPtr,
unsigned char * ChLocalsPtr, int EdgeRow );
+ /* Quant */
+ void (*quantize)( struct PB_INSTANCE *pbi,
+ ogg_int16_t * DCT_block,
+ ogg_int16_t /*Q_LIST_ENTRY*/ * quantized_list);
+
} DspFunctions;
extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
@@ -133,6 +142,7 @@
extern void dsp_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
extern void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
extern void dsp_scan_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
+extern void dsp_quant_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
void dsp_init(DspFunctions *funcs);
@@ -148,6 +158,7 @@
extern void dsp_sse2_idct_init(DspFunctions *funcs);
extern void dsp_sse2_dct_decode_init(DspFunctions *funcs);
extern void dsp_sse2_scan_init(DspFunctions *funcs);
+extern void dsp_sse2_quant_init(DspFunctions *funcs);
#endif
@@ -216,5 +227,8 @@
#define dsp_scan_row_diff_scan(funcs, ptr1, ptr2, ptr3, ptr4, ptr5, ptr6, ptr7, ptr8, ptr9, ptr10, a1) \
(funcs.RowDiffScan(ptr1, ptr2, ptr3, ptr4, ptr5, ptr6, ptr7, ptr8, ptr9, ptr10, a1))
+#define dsp_quant_quantize(funcs, ptr1, ptr2, ptr3) \
+ (funcs.quantize(ptr1, ptr2, ptr3))
+
#endif /* DSP_H */
Modified: branches/theora-playtime/lib/encoder_toplevel.c
===================================================================
--- branches/theora-playtime/lib/encoder_toplevel.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/encoder_toplevel.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -47,7 +47,7 @@
if(cpi->MVList)
_ogg_free(cpi->MVList);
if(cpi->DCT_codes )
- _ogg_free( cpi->DCT_codes );
+ _theora_16_byte_aligned_free( cpi->DCT_codes );
if(cpi->DCTDataBuffer )
_theora_16_byte_aligned_free( cpi->DCTDataBuffer);
if(cpi->quantized_list)
@@ -137,7 +137,7 @@
_ogg_malloc(cpi->pb.UnitFragments*
sizeof(cpi->MVList));
cpi->DCT_codes =
- _ogg_malloc(64*
+ _theora_16_byte_aligned_malloc(64*
sizeof(*cpi->DCT_codes));
cpi->DCTDataBuffer =
_theora_16_byte_aligned_malloc(64*
@@ -777,7 +777,12 @@
memset(th, 0, sizeof(*th));
/*Currently only the 4:2:0 format is supported.*/
if(c->pixelformat!=OC_PF_420)return OC_IMPL;
- th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
+
+ /* th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi)); */
+ /* ZEN::: Need 16 byte alignment and calloc doesn't cut it */
+ th->internal_encode=cpi=_theora_16_byte_aligned_malloc(sizeof(*cpi));
+ memset(cpi, 0, sizeof(*cpi));
+ /* Equivalent of calloc */
dsp_static_init (&cpi->dsp);
memcpy (&cpi->pb.dsp, &cpi->dsp, sizeof(DspFunctions));
@@ -1216,6 +1221,7 @@
oggpackB_writeclear(cpi->oggbuffer);
_ogg_free(cpi->oggbuffer);
- _ogg_free(cpi);
+ /*_ogg_free(cpi); */
+ _theora_16_byte_aligned_free(cpi);
}
}
Modified: branches/theora-playtime/lib/idct.c
===================================================================
--- branches/theora-playtime/lib/idct.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/idct.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -572,9 +572,10 @@
//if (cpu_flags & CPU_X86_MMX) {
// dsp_mmx_idct_init(funcs);
//}
-
+#ifndef USE_NO_SSE2
if (cpu_flags & CPU_X86_SSE2) {
dsp_sse2_idct_init(funcs);
}
#endif
+#endif
}
\ No newline at end of file
Modified: branches/theora-playtime/lib/quant.c
===================================================================
--- branches/theora-playtime/lib/quant.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/quant.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -17,6 +17,8 @@
#include <stdlib.h>
#include <string.h>
+#include "dsp.h"
+#include "cpu.h"
#include "codec_internal.h"
#include "quant_lookup.h"
@@ -560,7 +562,7 @@
pbi->fquant_ZbSize = pbi->fp_ZeroBinSize_InterUV;
}
-void quantize( PB_INSTANCE *pbi,
+void quantize__c( PB_INSTANCE *pbi,
ogg_int16_t * DCT_block,
Q_LIST_ENTRY * quantized_list){
ogg_uint32_t i; /* Row index */
@@ -871,3 +873,22 @@
init_quantizer ( cpi, qscale, (unsigned char) pbi->FrameQIndex );
init_dequantizer ( pbi, qscale, (unsigned char) pbi->FrameQIndex );
}
+
+void dsp_quant_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+ /* Select the quantize implementation stored in funcs->quantize.
+    The portable C routine is installed unconditionally first, so the
+    pointer is always valid; the USE_ASM section below may then replace
+    it depending on the bits set in cpu_flags. */
+ funcs->quantize = quantize__c;
+#if defined(USE_ASM)
+ /* No MMX quantizer is installed here, so CPU_X86_MMX is not tested.
+    Only the SSE2 override is wired up, and defining USE_NO_SSE2
+    compiles it out while leaving the rest of USE_ASM enabled. */
+#ifndef USE_NO_SSE2
+ if (cpu_flags & CPU_X86_SSE2) {
+ dsp_sse2_quant_init(funcs);
+ }
+#endif
+#endif
+}
\ No newline at end of file
Modified: branches/theora-playtime/lib/reconstruct.c
===================================================================
--- branches/theora-playtime/lib/reconstruct.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/reconstruct.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -108,9 +108,10 @@
if (cpu_flags & CPU_X86_MMX) {
dsp_mmx_recon_init(funcs);
}
-
+#ifndef USE_NO_SSE2
if (cpu_flags & CPU_X86_SSE2) {
dsp_sse2_recon_init(funcs);
}
#endif
+#endif
}
Modified: branches/theora-playtime/lib/scan.c
===================================================================
--- branches/theora-playtime/lib/scan.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/scan.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -2313,9 +2313,10 @@
//if (cpu_flags & CPU_X86_MMX) {
// dsp_mmx_scan_init(funcs);
//}
-
+#ifndef USE_NO_SSE2
if (cpu_flags & CPU_X86_SSE2) {
dsp_sse2_scan_init(funcs);
}
#endif
+#endif
}
Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-11 17:56:24 UTC (rev 11560)
@@ -16,7 +16,7 @@
*/
extern unsigned __int64 GetCPUTime();
-#define PERF_DATA_ON
+//#define PERF_DATA_ON
#ifdef PERF_DATA_ON
Added: branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/quant_sse2.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/x86_32_vs/quant_sse2.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -0,0 +1,453 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2005 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: quant.c 11442 2006-05-27 17:28:08Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include "codec_internal.h"
+#include "quant_lookup.h"
+
+#include "perf_helper.h"
+
+static unsigned __int64 perf_quant_time;
+static unsigned __int64 perf_quant_min;
+static unsigned __int64 perf_quant_count;
+
+
+void quantize__sse2( PB_INSTANCE *pbi,
+ ogg_int16_t * DCT_block,
+ Q_LIST_ENTRY * quantized_list){
+ /* SSE2 quantizer: 64 words of DCT_block -> quantized_list in zigzag order; the #if 0 branch is the scalar reference. */
+#if 0
+ ogg_uint32_t i; /* Row index */
+ Q_LIST_ENTRY val; /* Quantised value. */
+
+ ogg_int32_t * FquantRoundPtr = pbi->fquant_round;
+ ogg_int32_t * FquantCoeffsPtr = pbi->fquant_coeffs;
+ ogg_int32_t * FquantZBinSizePtr = pbi->fquant_ZbSize;
+ ogg_int16_t * DCT_blockPtr = DCT_block;
+ ogg_uint32_t * ZigZagPtr = (ogg_uint32_t *)pbi->zigzag_index;
+ ogg_int32_t temp;
+
+ PERF_BLOCK_START();
+ /* Set the quantized_list to default to 0 */
+ memset( quantized_list, 0, 64 * sizeof(Q_LIST_ENTRY) );
+
+ /* Note that we add half divisor to effect rounding on positive number */
+ for( i = 0; i < VFRAGPIXELS; i++) {
+ /* Column 0 */
+ if ( DCT_blockPtr[0] >= FquantZBinSizePtr[0] ) {
+ temp = FquantCoeffsPtr[0] * ( DCT_blockPtr[0] + FquantRoundPtr[0] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[0]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[0] <= -FquantZBinSizePtr[0] ) {
+ temp = FquantCoeffsPtr[0] *
+ ( DCT_blockPtr[0] - FquantRoundPtr[0] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[0]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 1 */
+ if ( DCT_blockPtr[1] >= FquantZBinSizePtr[1] ) {
+ temp = FquantCoeffsPtr[1] *
+ ( DCT_blockPtr[1] + FquantRoundPtr[1] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[1]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[1] <= -FquantZBinSizePtr[1] ) {
+ temp = FquantCoeffsPtr[1] *
+ ( DCT_blockPtr[1] - FquantRoundPtr[1] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[1]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 2 */
+ if ( DCT_blockPtr[2] >= FquantZBinSizePtr[2] ) {
+ temp = FquantCoeffsPtr[2] *
+ ( DCT_blockPtr[2] + FquantRoundPtr[2] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[2]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[2] <= -FquantZBinSizePtr[2] ) {
+ temp = FquantCoeffsPtr[2] *
+ ( DCT_blockPtr[2] - FquantRoundPtr[2] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[2]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 3 */
+ if ( DCT_blockPtr[3] >= FquantZBinSizePtr[3] ) {
+ temp = FquantCoeffsPtr[3] *
+ ( DCT_blockPtr[3] + FquantRoundPtr[3] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[3]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[3] <= -FquantZBinSizePtr[3] ) {
+ temp = FquantCoeffsPtr[3] *
+ ( DCT_blockPtr[3] - FquantRoundPtr[3] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[3]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 4 */
+ if ( DCT_blockPtr[4] >= FquantZBinSizePtr[4] ) {
+ temp = FquantCoeffsPtr[4] *
+ ( DCT_blockPtr[4] + FquantRoundPtr[4] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[4]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[4] <= -FquantZBinSizePtr[4] ) {
+ temp = FquantCoeffsPtr[4] *
+ ( DCT_blockPtr[4] - FquantRoundPtr[4] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[4]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 5 */
+ if ( DCT_blockPtr[5] >= FquantZBinSizePtr[5] ) {
+ temp = FquantCoeffsPtr[5] *
+ ( DCT_blockPtr[5] + FquantRoundPtr[5] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[5]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[5] <= -FquantZBinSizePtr[5] ) {
+ temp = FquantCoeffsPtr[5] *
+ ( DCT_blockPtr[5] - FquantRoundPtr[5] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[5]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 6 */
+ if ( DCT_blockPtr[6] >= FquantZBinSizePtr[6] ) {
+ temp = FquantCoeffsPtr[6] *
+ ( DCT_blockPtr[6] + FquantRoundPtr[6] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[6]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[6] <= -FquantZBinSizePtr[6] ) {
+ temp = FquantCoeffsPtr[6] *
+ ( DCT_blockPtr[6] - FquantRoundPtr[6] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[6]] = ( val < -511 ) ? -511 : val;
+ }
+
+ /* Column 7 */
+ if ( DCT_blockPtr[7] >= FquantZBinSizePtr[7] ) {
+ temp = FquantCoeffsPtr[7] *
+ ( DCT_blockPtr[7] + FquantRoundPtr[7] ) ;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[7]] = ( val > 511 ) ? 511 : val;
+ } else if ( DCT_blockPtr[7] <= -FquantZBinSizePtr[7] ) {
+ temp = FquantCoeffsPtr[7] *
+ ( DCT_blockPtr[7] - FquantRoundPtr[7] ) + MIN16;
+ val = (Q_LIST_ENTRY) (temp>>16);
+ quantized_list[ZigZagPtr[7]] = ( val < -511 ) ? -511 : val;
+ }
+
+ FquantRoundPtr += 8;
+ FquantCoeffsPtr += 8;
+ FquantZBinSizePtr += 8;
+ DCT_blockPtr += 8;
+ ZigZagPtr += 8;
+ }
+
+ PERF_BLOCK_END("quantize C", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
+
+#else
+static __declspec(align(16)) unsigned short Some511s[8] = { 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF, 0x01FF };
+static unsigned short* Some511sPtr = Some511s;
+
+static __declspec(align(16)) unsigned char temp[128];
+static unsigned char* temp_ptr = temp;
+ /* NOTE(review): temp is a function-static scratch buffer shared by every call, so this
+    routine is presumably not safe to run from several threads at once - confirm with callers. */
+ ogg_int32_t * FquantRoundPtr = pbi->fquant_round; /* Must be 16-byte aligned: read with movdqa */
+ ogg_int32_t * FquantCoeffsPtr = pbi->fquant_coeffs; /* Must be 16-byte aligned: read with movdqa */
+ ogg_int32_t * FquantZBinSizePtr = pbi->fquant_ZbSize; /* Must be 16-byte aligned: read with movdqa */
+ ogg_int16_t * DCT_blockPtr = DCT_block;
+ ogg_uint32_t * ZigZagPtr = (ogg_uint32_t *)pbi->zigzag_index;
+
+
+ PERF_BLOCK_START();
+
+
+ __asm {
+ align 16
+
+ mov edi, temp_ptr
+ mov esi, DCT_blockPtr
+ mov eax, FquantRoundPtr
+ mov edx, FquantCoeffsPtr
+ mov ecx, Some511sPtr
+
+ movdqa xmm7, [ecx]
+ pcmpeqw xmm6, xmm6 /* All 1's */
+ mov ecx, FquantZBinSizePtr
+
+ push ebx
+ mov ebx, 8/* Loop counter */
+
+
+
+ /* Set to 0, might be better to do twice as many movq's than to unaligned write? */
+
+ /* 128 bytes worth of 0's */
+ //movdqu [edi], xmm0
+ //movdqu [edi + 16], xmm0
+ //movdqu [edi + 32], xmm0
+ //movdqu [edi + 48], xmm0
+ //movdqu [edi + 64], xmm0
+ //movdqu [edi + 80], xmm0
+ //movdqu [edi + 96], xmm0
+ //movdqu [edi + 112], xmm0
+
+ read_loop_start:
+ pxor xmm0, xmm0
+
+ /* Read all 8x16 bits of the dct block */
+ movdqa xmm1, [esi]
+
+
+ /* Load 8x32bits of the rounding values */
+ movdqa xmm3, [eax]
+ movdqa xmm4, [eax + 16]
+
+
+ /* Shrink them back to 16 bits */
+ packssdw xmm3, xmm4
+
+
+ /* Load 8x32bits of the coeff values */
+ movdqa xmm4, [edx]
+ movdqa xmm5, [edx + 16]
+
+ /* Shrink the coeffs back to 16 bits */
+ packssdw xmm4, xmm5
+
+
+ /* Add the rounding to the dct in one register (word-wise: both hold 8 packed 16-bit values) */
+ movdqa xmm2, xmm1
+ paddw xmm1, xmm3
+
+ /* Subtract in another (word-wise, so no borrow leaks into the neighbouring word) */
+ psubw xmm2, xmm3
+
+ /* Multiply both sum and diff by the coeffs, keeping only the high word
+ (since in the C code we shift right by 16 bits ) */
+ pmulhw xmm1, xmm4
+ pmulhw xmm2, xmm4 /* TODO::: In the subtraction, have to do the round by adding 65535 */
+
+
+ /* Now need to do the gt checks and mask in the appropriate result */
+ /* Check the summed results to see if any are over 511 */
+
+ /* Duplicate the summed values */
+ movdqa xmm5, xmm1
+
+ /* Compare each word to 511, any that are >511 will have their word set to all 1s */
+ pcmpgtw xmm5, xmm7
+
+ /* CHECK INSERTED CODE, to save reloading xmm7, assumes xmm3 was not holding a value at this point */
+ movdqa xmm3, xmm7
+
+ /* Use the mask created to make a register with 511 in every place the original sum was >511 and 0 elsewhere */
+ pand xmm3, xmm5
+
+ /* Flip the bits in the mask */
+ pxor xmm5, xmm6
+
+ /* Now register has all the values that were less than or eq to 511 intact, but every where else is 0 */
+ pand xmm1, xmm5
+
+ /* Now combine all the vals lt or eq to 511, from the original sums, with the register that has 511
+ in all the places where the value was >511. This effectively performs an upper bound clipping.
+ So every value that was >511 is now set to 511 */
+ por xmm1, xmm3
+
+ /* Now do similar for the differences, to clip any value less than -511 */
+ /* Duplicate the differenced values */
+ movdqa xmm5, xmm2
+
+
+ /* Subtract the 511s from 0 to get -511's */
+
+ psubw xmm0, xmm7
+
+ /* See if -511 is greater than the value. If it is that word is all 1's. This is effectively
+ a check to see if the value is less or equal to -511. Since the operation is the same for the
+ equal case, it doesn't matter that we check for less or equal rather than just less than */
+
+
+ movdqa xmm3, xmm0 /* xmm3 is now a duplicate of the -511's */
+ pcmpgtw xmm0, xmm5
+
+ /* Create a mask on the -511s */
+ pand xmm3, xmm0
+
+ /* Flip the bits in the mask */
+ pxor xmm0, xmm6
+
+ /* Mask the values in the difference register */
+ pand xmm2, xmm0
+
+ /* Combine them together */
+ por xmm2, xmm3
+
+
+
+ /* By here, xmm1 contains the clipped 8 values of the sum, and xmm2 the clipped 8 values of the difference */
+
+
+ pxor xmm0, xmm0
+ //mov eax, FquantZBinSizePtr
+
+ /* Load 8x32bits of the fquantzbin values */
+ movdqa xmm3, [ecx]
+ movdqa xmm4, [ecx + 16]
+
+
+ /* Shrink them back to 16 bits */
+ packssdw xmm3, xmm4
+
+ /* Load the DCT Block values 8x16 bits again */
+ movdqa xmm4, [esi]
+
+ /* Find -Fquantzbin by subtracting it from 0 */
+ psubw xmm0, xmm3
+
+ /* Check if fquantzbin is greater than the dct_block value. If it's not, then use the summed register value */
+ pcmpgtw xmm3, xmm4
+
+
+ /* Flip the mask */
+ pxor xmm3, xmm6
+
+ /* Copy the mask to save for later */
+ movdqa xmm5, xmm3
+
+ /* And the summed value register */
+ pand xmm1, xmm3
+ /* NOTE(review): the test below is a strict (-zbin > dct), whereas the scalar reference in the
+    #if 0 branch uses dct <= -zbin; the two differ when dct == -zbin. */
+ /* Check if -fquantzbin is greater than dct_block. If it is, use the difference register value */
+ pcmpgtw xmm0, xmm4
+
+ /* Or the mask with the other mask, so we have the combination, of all those using the sum and all those
+ using the difference, everything else will be set to zero later */
+ por xmm5, xmm0
+
+ /* And the difference register to mask the appropriate values */
+ pand xmm2, xmm0
+
+ /* Merge together the selected summed and differenced values */
+ por xmm1, xmm2
+
+ /* Zero out everything that wasn't selected by the sum mask or the difference mask */
+ pand xmm1, xmm5
+
+
+
+ /* Write these values out to the temp_space, after all 8 loops, 64 x 16 bit values are written,
+ later we can apply the zigzag write all at once */
+ movdqa [edi], xmm1
+
+
+ /* Increment the pointer */
+ add edi, 16 /* Temp output by 16 bytes */
+ add esi, 16 /* DCT_Block by 16 bytes */
+ add edx, 32 /* Fquantcoeffs by 32 bytes */
+ add eax, 32 /* fquant round ptr by 32 bytes */
+ add ecx, 32 /* fquant zbin by 32 bytes */
+
+ /* Update the loop variable */
+ sub ebx, 1
+ jnz read_loop_start
+
+
+
+ /* Now read through the temp output space and write using the zigzag pointer values */
+
+
+ mov edx, quantized_list
+ mov esi, ZigZagPtr
+ mov ebx, 8
+
+ /* Put the temp output back to the start of the block */
+ sub edi, 128
+
+ write_loop_start:
+ mov ax, WORD PTR [edi]
+ mov ecx, [esi]
+ mov WORD PTR [edx + ecx*2], ax
+
+ mov ax, WORD PTR [edi+2]
+ mov ecx, [esi+4]
+ mov WORD PTR [edx + ecx*2], ax
+
+ mov ax, WORD PTR [edi+4]
+ mov ecx, [esi+8]
+ mov WORD PTR [edx + ecx*2], ax
+
+ mov ax, WORD PTR [edi+6]
+ mov ecx, [esi+12]
+ mov WORD PTR [edx + ecx*2], ax
+
+
+ mov ax, WORD PTR [edi+8]
+ mov ecx, [esi+16]
+ mov WORD PTR [edx + ecx*2], ax
+
+ mov ax, WORD PTR [edi+10]
+ mov ecx, [esi+20]
+ mov WORD PTR [edx + ecx*2], ax
+
+ mov ax, WORD PTR [edi+12]
+ mov ecx, [esi+24]
+ mov WORD PTR [edx + ecx*2], ax
+
+ mov ax, WORD PTR [edi+14]
+ mov ecx, [esi+28]
+ mov WORD PTR [edx + ecx*2], ax
+
+ /* Advance all the pointer */
+ add esi, 32
+ add edi, 16
+
+ /* Check the loop counter */
+ sub ebx, 1
+ jnz write_loop_start
+
+
+ /* Restore ebx */
+ pop ebx
+
+ }
+
+ PERF_BLOCK_END("quantize sse2", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
+
+#endif
+}
+
+
+void dsp_sse2_quant_init(DspFunctions *funcs)
+{
+ /* Point funcs->quantize at the SSE2 quantizer and reset the three
+    perf_quant_* timing counters of this file. When USE_NO_SSE2 is
+    defined the body below is compiled out, so the call does nothing
+    and funcs->quantize keeps whatever was installed before.
+    This function does not check CPU flags itself. */
+#ifndef USE_NO_SSE2
+ TH_DEBUG("enabling accelerated x86_32 sse2 quant functions.\n");
+ perf_quant_time = 0;
+ perf_quant_min = -1; /* unsigned: wraps to the largest 64-bit value */
+ perf_quant_count = 0;
+ funcs->quantize = quantize__sse2;
+#endif
+}
\ No newline at end of file
Modified: branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/scan_sse2.c 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/lib/x86_32_vs/scan_sse2.c 2006-06-11 17:56:24 UTC (rev 11560)
@@ -929,8 +929,5 @@
{
TH_DEBUG("enabling accelerated x86_32 sse2 scan functions.\n");
funcs->RowDiffScan = RowDiffScan__sse2;
- //funcs->copy8x8 = copy8x8__sse2;
- //funcs->recon_intra8x8 = recon_intra8x8__sse2;
- //funcs->recon_inter8x8 = recon_inter8x8__sse2;
- //funcs->recon_inter8x8_half = recon_inter8x8_half__sse2;
+
}
\ No newline at end of file
Modified: branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj 2006-06-11 17:56:24 UTC (rev 11560)
@@ -119,13 +119,14 @@
InlineFunctionExpansion="2"
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
+ WholeProgramOptimization="false"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\..\libvorbis\include"
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
RuntimeLibrary="0"
UsePrecompiledHeader="0"
WarningLevel="4"
Detect64BitPortabilityProblems="true"
- DebugInformationFormat="0"
+ DebugInformationFormat="3"
/>
<Tool
Name="VCManagedResourceCompilerTool"
@@ -139,7 +140,7 @@
<Tool
Name="VCLinkerTool"
LinkIncremental="1"
- GenerateDebugInformation="false"
+ GenerateDebugInformation="true"
SubSystem="1"
OptimizeReferences="2"
EnableCOMDATFolding="2"
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-11 04:26:48 UTC (rev 11559)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-11 17:56:24 UTC (rev 11560)
@@ -48,7 +48,7 @@
RuntimeLibrary="1"
EnableEnhancedInstructionSet="0"
UsePrecompiledHeader="0"
- WarningLevel="3"
+ WarningLevel="4"
Detect64BitPortabilityProblems="true"
DebugInformationFormat="3"
DisableSpecificWarnings="4996"
@@ -129,6 +129,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
OmitFramePointers="true"
+ WholeProgramOptimization="false"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
StringPooling="true"
@@ -157,7 +158,7 @@
OutputFile="$(OutDir)/libtheora.dll"
LinkIncremental="1"
ModuleDefinitionFile="..\..\libtheora.def"
- GenerateDebugInformation="false"
+ GenerateDebugInformation="true"
SubSystem="2"
OptimizeReferences="2"
EnableCOMDATFolding="2"
@@ -490,6 +491,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\quant_sse2.c"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\x86_32_vs\recon_mmx.c"
>
</File>
More information about the commits
mailing list