[xiph-commits] r11516 - in branches/theora-playtime: lib
lib/x86_32_vs win32/VS2005/libtheora
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Sun Jun 4 06:30:09 PDT 2006
Author: illiminable
Date: 2006-06-04 06:29:50 -0700 (Sun, 04 Jun 2006)
New Revision: 11516
Added:
branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
Modified:
branches/theora-playtime/lib/dsp.h
branches/theora-playtime/lib/encoder_toplevel.c
branches/theora-playtime/lib/idct.c
branches/theora-playtime/lib/pb.c
branches/theora-playtime/lib/quant_lookup.h
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Force alignment of memory used by idct.c
* Align the dezigzag table
* Implement dequant_slow__sse2
Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/dsp.h 2006-06-04 13:29:50 UTC (rev 11516)
@@ -83,23 +83,23 @@
/* iDCT Functions */
- void (*IDct1)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData, ogg_int16_t *QuantMatrix,
+ void (*IDct1)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData, ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData );
- void (*dequant_slow10)( ogg_int16_t * dequant_coeffs,
- ogg_int16_t * quantized_list,
+ void (*dequant_slow10)( ogg_int16_t * dequant_coeffs,
+ ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block);
- void (*IDct10)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
- ogg_int16_t *QuantMatrix,
+ void (*IDct10)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
+ ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData );
- void (*IDctSlow)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
- ogg_int16_t *QuantMatrix,
+ void (*IDctSlow)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
+ ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData );
- void (*dequant_slow)( ogg_int16_t * dequant_coeffs,
- ogg_int16_t * quantized_list,
+ void (*dequant_slow)( ogg_int16_t * dequant_coeffs,
+ ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block);
Modified: branches/theora-playtime/lib/encoder_toplevel.c
===================================================================
--- branches/theora-playtime/lib/encoder_toplevel.c 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/encoder_toplevel.c 2006-06-04 13:29:50 UTC (rev 11516)
@@ -51,7 +51,7 @@
if(cpi->DCTDataBuffer )
_theora_16_byte_aligned_free( cpi->DCTDataBuffer);
if(cpi->quantized_list)
- _ogg_free( cpi->quantized_list);
+ _theora_16_byte_aligned_free( cpi->quantized_list);
if(cpi->OriginalDC)
_ogg_free( cpi->OriginalDC);
if(cpi->PartiallyCodedFlags)
@@ -143,7 +143,7 @@
_theora_16_byte_aligned_malloc(64*
sizeof(*cpi->DCTDataBuffer));
cpi->quantized_list =
- _ogg_malloc(64*
+ _theora_16_byte_aligned_malloc(64*
sizeof(*cpi->quantized_list));
cpi->PartiallyCodedFlags =
_ogg_malloc(cpi->pb.MacroBlocks*
Modified: branches/theora-playtime/lib/idct.c
===================================================================
--- branches/theora-playtime/lib/idct.c 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/idct.c 2006-06-04 13:29:50 UTC (rev 11516)
@@ -16,6 +16,8 @@
********************************************************************/
#include <string.h>
+#include "dsp.h"
+#include "cpu.h"
#include "codec_internal.h"
#include "quant_lookup.h"
@@ -571,8 +573,8 @@
// dsp_mmx_idct_init(funcs);
//}
- //if (cpu_flags & CPU_X86_SSE2) {
- // dsp_sse2_idct_init(funcs);
- //}
+ if (cpu_flags & CPU_X86_SSE2) {
+ dsp_sse2_idct_init(funcs);
+ }
#endif
}
\ No newline at end of file
Modified: branches/theora-playtime/lib/pb.c
===================================================================
--- branches/theora-playtime/lib/pb.c 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/pb.c 2006-06-04 13:29:50 UTC (rev 11516)
@@ -24,23 +24,23 @@
if(pbi->ReconDataBuffer)
_theora_16_byte_aligned_free(pbi->ReconDataBuffer);
if(pbi->DequantBuffer)
- _ogg_free(pbi->DequantBuffer);
+ _theora_16_byte_aligned_free(pbi->DequantBuffer);
if(pbi->TmpDataBuffer)
- _ogg_free(pbi->TmpDataBuffer);
+ _theora_16_byte_aligned_free(pbi->TmpDataBuffer);
if(pbi->TmpReconBuffer)
- _ogg_free(pbi->TmpReconBuffer);
+ _theora_16_byte_aligned_free(pbi->TmpReconBuffer);
if(pbi->dequant_Y_coeffs)
- _ogg_free(pbi->dequant_Y_coeffs);
+ _theora_16_byte_aligned_free(pbi->dequant_Y_coeffs);
if(pbi->dequant_U_coeffs)
- _ogg_free(pbi->dequant_U_coeffs);
+ _theora_16_byte_aligned_free(pbi->dequant_U_coeffs);
if(pbi->dequant_V_coeffs)
- _ogg_free(pbi->dequant_V_coeffs);
+ _theora_16_byte_aligned_free(pbi->dequant_V_coeffs);
if(pbi->dequant_InterY_coeffs)
- _ogg_free(pbi->dequant_InterY_coeffs);
+ _theora_16_byte_aligned_free(pbi->dequant_InterY_coeffs);
if(pbi->dequant_InterU_coeffs)
- _ogg_free(pbi->dequant_InterU_coeffs);
+ _theora_16_byte_aligned_free(pbi->dequant_InterU_coeffs);
if(pbi->dequant_InterV_coeffs)
- _ogg_free(pbi->dequant_InterV_coeffs);
+ _theora_16_byte_aligned_free(pbi->dequant_InterV_coeffs);
pbi->ReconDataBuffer=0;
@@ -66,31 +66,31 @@
_theora_16_byte_aligned_malloc(64*sizeof(*pbi->ReconDataBuffer));
pbi->DequantBuffer =
- _ogg_malloc(64 * sizeof(*pbi->DequantBuffer));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->DequantBuffer));
pbi->TmpDataBuffer =
- _ogg_malloc(64 * sizeof(*pbi->TmpDataBuffer));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->TmpDataBuffer));
pbi->TmpReconBuffer =
- _ogg_malloc(64 * sizeof(*pbi->TmpReconBuffer));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->TmpReconBuffer));
pbi->dequant_Y_coeffs =
- _ogg_malloc(64 * sizeof(*pbi->dequant_Y_coeffs));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_Y_coeffs));
pbi->dequant_U_coeffs =
- _ogg_malloc(64 * sizeof(*pbi->dequant_U_coeffs));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_U_coeffs));
pbi->dequant_V_coeffs =
- _ogg_malloc(64 * sizeof(*pbi->dequant_V_coeffs));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_V_coeffs));
pbi->dequant_InterY_coeffs =
- _ogg_malloc(64 * sizeof(*pbi->dequant_InterY_coeffs));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_InterY_coeffs));
pbi->dequant_InterU_coeffs =
- _ogg_malloc(64 * sizeof(*pbi->dequant_InterU_coeffs));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_InterU_coeffs));
pbi->dequant_InterV_coeffs =
- _ogg_malloc(64 * sizeof(*pbi->dequant_InterV_coeffs));
+ _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_InterV_coeffs));
}
Modified: branches/theora-playtime/lib/quant_lookup.h
===================================================================
--- branches/theora-playtime/lib/quant_lookup.h 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/quant_lookup.h 2006-06-04 13:29:50 UTC (rev 11516)
@@ -25,7 +25,21 @@
#define IDCT_SCALE_FACTOR 2 /* Shift left bits to improve IDCT precision */
#define OLD_SCHEME 1
+#if defined(USE_ASM) && defined(WIN32)
/* lookup table for DCT coefficient zig-zag ordering */
+static const __declspec(align(16)) ogg_uint32_t dezigzag_index[64] = {
+ 0, 1, 8, 16, 9, 2, 3, 10,
+ 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+#else
+/* lookup table for DCT coefficient zig-zag ordering */
static const ogg_uint32_t dezigzag_index[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
@@ -36,3 +50,4 @@
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
};
+#endif
Added: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-04 13:29:50 UTC (rev 11516)
@@ -0,0 +1,721 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function: SSE2 implementation of the Theora iDCT
+ last mod: $Id: idct_sse2.c 11513 2006-06-04 09:46:34Z illiminable $
+
+ ********************************************************************/
+
+#include <string.h>
+#include "codec_internal.h"
+#include "quant_lookup.h"
+
+#define IdctAdjustBeforeShift 8
+
+/* cos(n*pi/16) or sin((8-n)*pi/16) */
+#define xC1S7 64277
+#define xC2S6 60547
+#define xC3S5 54491
+#define xC4S4 46341
+#define xC5S3 36410
+#define xC6S2 25080
+#define xC7S1 12785
+
+
+
+static void dequant_slow__sse2( ogg_int16_t * dequant_coeffs,
+ ogg_int16_t * quantized_list,
+ ogg_int32_t * DCT_block)
+{
+#if 0
+ int i;
+ for(i=0;i<64;i++)
+ DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
+#else
+ /* SSE2 form of the scalar loop above. temp_block is static, so all calls share it. */
+ static __declspec(align(16)) ogg_int32_t temp_block[64];
+ static ogg_int32_t* temp_block_ptr = temp_block;
+ static ogg_int32_t* zigzag_ptr = dezigzag_index;
+
+ /* quantized_list is not assumed to be aligned, so it is read with movdqu.
+ dequant_coeffs and temp_block are accessed with movdqa, which needs
+ 16-byte-aligned addresses. */
+ __asm {
+ align 16
+
+ mov edi, DCT_block /* int32 */
+ mov edx, zigzag_ptr /* int32 */
+ mov esi, quantized_list /* int16 */
+ mov ebx, dequant_coeffs /* int16 */
+ mov eax, temp_block_ptr
+
+
+ /*
+ The repeated blocks of 16 iterations are identical except
+ for the offsets in the writes at the end and the reads at start
+ */
+
+ /* 16 Iterations at a time */
+ /* Read 16x16 bits of quantized_list and dequant_coeffs */
+ movdqu xmm1, [esi]
+ movdqu xmm5, [esi + 16]
+
+ movdqa xmm2, [ebx]
+ movdqa xmm6, [ebx + 16]
+
+ /* Make a copy of xmm1 and xmm5 */
+ movdqa xmm7, xmm1
+ movdqa xmm0, xmm5
+
+ /* Multiply: pmullw keeps the low 16 bits, pmulhw the signed high 16 bits */
+ pmullw xmm1, xmm2
+ pmulhw xmm2, xmm7
+
+ pmullw xmm5, xmm6
+ pmulhw xmm6, xmm0
+
+ /* Interleave the multiplication results */
+ movdqa xmm0, xmm1
+ punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm6
+ punpckhwd xmm2, xmm6
+
+ /* Write the 16x32 bits of output to temp space */
+ movdqa [eax], xmm1
+ movdqa [eax + 16], xmm0
+ movdqa [eax + 32], xmm5
+ movdqa [eax + 48], xmm2
+
+
+
+ /* 16 Iterations at a time */
+ /* Read 16x16 bits of quantized_list and dequant_coeffs */
+ movdqu xmm1, [esi + 32]
+ movdqu xmm5, [esi + 48]
+
+ movdqa xmm2, [ebx + 32]
+ movdqa xmm6, [ebx + 48]
+
+ /* Make a copy of xmm1 and xmm5 */
+ movdqa xmm7, xmm1
+ movdqa xmm0, xmm5
+
+ /* Multiply */
+ pmullw xmm1, xmm2
+ pmulhw xmm2, xmm7
+
+ pmullw xmm5, xmm6
+ pmulhw xmm6, xmm0
+
+ /* Interleave the multiplication results */
+ movdqa xmm0, xmm1
+ punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm6
+ punpckhwd xmm2, xmm6
+
+ /* Write the 16x32 bits of output to temp space */
+ movdqa [eax + 64], xmm1
+ movdqa [eax + 80], xmm0
+ movdqa [eax + 96], xmm5
+ movdqa [eax + 112], xmm2
+
+ /* 16 Iterations at a time */
+ /* Read 16x16 bits of quantized_list and dequant_coeffs */
+ movdqu xmm1, [esi + 64]
+ movdqu xmm5, [esi + 80]
+
+ movdqa xmm2, [ebx + 64]
+ movdqa xmm6, [ebx + 80]
+
+ /* Make a copy of xmm1 and xmm5 */
+ movdqa xmm7, xmm1
+ movdqa xmm0, xmm5
+
+ /* Multiply */
+ pmullw xmm1, xmm2
+ pmulhw xmm2, xmm7
+
+ pmullw xmm5, xmm6
+ pmulhw xmm6, xmm0
+
+ /* Interleave the multiplication results */
+ movdqa xmm0, xmm1
+ punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm6
+ punpckhwd xmm2, xmm6
+
+ /* Write the 16x32 bits of output to temp space */
+ movdqa [eax + 128], xmm1
+ movdqa [eax + 144], xmm0
+ movdqa [eax + 160], xmm5
+ movdqa [eax + 176], xmm2
+
+
+
+ /* 16 Iterations at a time */
+ /* Read 16x16 bits of quantized_list and dequant_coeffs */
+ movdqu xmm1, [esi + 96]
+ movdqu xmm5, [esi + 112]
+
+ movdqa xmm2, [ebx + 96]
+ movdqa xmm6, [ebx + 112]
+
+ /* Make a copy of xmm1 and xmm5 */
+ movdqa xmm7, xmm1
+ movdqa xmm0, xmm5
+
+ /* Multiply */
+ pmullw xmm1, xmm2
+ pmulhw xmm2, xmm7
+
+ pmullw xmm5, xmm6
+ pmulhw xmm6, xmm0
+
+ /* Interleave the multiplication results */
+ movdqa xmm0, xmm1
+ punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm6
+ punpckhwd xmm2, xmm6
+
+ /* Write the 16x32 bits of output to temp space */
+ movdqa [eax + 192], xmm1
+ movdqa [eax + 208], xmm0
+ movdqa [eax + 224], xmm5
+ movdqa [eax + 240], xmm2
+
+ /* Now follow the pattern to write - can't use simd */
+ mov ebx, 4 /* ebx is reused as the pass counter: 4 passes of 16 entries */
+ loop_start:
+ mov ecx , [edx]
+ mov esi , [eax]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 4]
+ mov esi , [eax + 4]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 8]
+ mov esi , [eax + 8]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 12]
+ mov esi , [eax + 12]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 16]
+ mov esi , [eax + 16]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 20]
+ mov esi , [eax + 20]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 24]
+ mov esi , [eax + 24]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 28]
+ mov esi , [eax + 28]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 32]
+ mov esi , [eax + 32]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 36]
+ mov esi , [eax + 36]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 40]
+ mov esi , [eax + 40]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 44]
+ mov esi , [eax + 44]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 48]
+ mov esi , [eax + 48]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 52]
+ mov esi , [eax + 52]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 56]
+ mov esi , [eax + 56]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 60]
+ mov esi , [eax + 60]
+ mov [edi + ecx*4] , esi
+
+ add eax, 64
+ add edx, 64
+
+ sub ebx, 1
+ jnz loop_start
+
+
+ };
+#endif
+}
+
+
+
+void IDctSlow__sse2( Q_LIST_ENTRY * InputData,
+ ogg_int16_t *QuantMatrix,
+ ogg_int16_t * OutputData ) {
+ __declspec(align(16)) ogg_int32_t IntermediateData[64];
+ ogg_int32_t * ip = IntermediateData;
+ ogg_int16_t * op = OutputData;
+ /* Full 8x8 inverse DCT: dequantise, transform rows in place, then columns into OutputData. */
+ ogg_int32_t _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
+ ogg_int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+ ogg_int32_t t1, t2;
+
+ int loop;
+ /* Dequantise and de-zigzag all 64 coefficients into the 32-bit work buffer. */
+ dequant_slow__sse2( QuantMatrix, InputData, IntermediateData);
+
+ /* Inverse DCT on the rows now */
+ for ( loop = 0; loop < 8; loop++){
+ /* Check for non-zero values; an all-zero row is left untouched */
+ if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
+ t1 = (xC1S7 * ip[1]);
+ t2 = (xC7S1 * ip[7]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _A = t1 + t2;
+
+ t1 = (xC7S1 * ip[1]);
+ t2 = (xC1S7 * ip[7]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _B = t1 - t2;
+
+ t1 = (xC3S5 * ip[3]);
+ t2 = (xC5S3 * ip[5]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _C = t1 + t2;
+
+ t1 = (xC3S5 * ip[5]);
+ t2 = (xC5S3 * ip[3]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _D = t1 - t2;
+
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+ t1 >>= 16;
+ _Ad = t1;
+
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+ t1 >>= 16;
+ _Bd = t1;
+
+
+ _Cd = _A + _C;
+ _Dd = _B + _D;
+
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0] + ip[4]));
+ t1 >>= 16;
+ _E = t1;
+
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0] - ip[4]));
+ t1 >>= 16;
+ _F = t1;
+
+ t1 = (xC2S6 * ip[2]);
+ t2 = (xC6S2 * ip[6]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _G = t1 + t2;
+
+ t1 = (xC6S2 * ip[2]);
+ t2 = (xC2S6 * ip[6]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _H = t1 - t2;
+
+
+ _Ed = _E - _G;
+ _Gd = _E + _G;
+
+ _Add = _F + _Ad;
+ _Bdd = _Bd - _H;
+
+ _Fd = _F - _Ad;
+ _Hd = _Bd + _H;
+
+ /* Final sequence of operations over-write original inputs. */
+ ip[0] = (ogg_int16_t)((_Gd + _Cd ) >> 0);
+ ip[7] = (ogg_int16_t)((_Gd - _Cd ) >> 0);
+
+ ip[1] = (ogg_int16_t)((_Add + _Hd ) >> 0);
+ ip[2] = (ogg_int16_t)((_Add - _Hd ) >> 0);
+
+ ip[3] = (ogg_int16_t)((_Ed + _Dd ) >> 0);
+ ip[4] = (ogg_int16_t)((_Ed - _Dd ) >> 0);
+
+ ip[5] = (ogg_int16_t)((_Fd + _Bdd ) >> 0);
+ ip[6] = (ogg_int16_t)((_Fd - _Bdd ) >> 0);
+
+ }
+
+ ip += 8; /* next row */
+ }
+
+ ip = IntermediateData;
+ /* Second pass: inverse DCT on the columns, reading the work buffer and writing OutputData */
+ for ( loop = 0; loop < 8; loop++){
+ /* Check for non-zero values (bitwise or faster than ||) */
+ if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+ ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+
+ t1 = (xC1S7 * ip[1*8]);
+ t2 = (xC7S1 * ip[7*8]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _A = t1 + t2;
+
+ t1 = (xC7S1 * ip[1*8]);
+ t2 = (xC1S7 * ip[7*8]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _B = t1 - t2;
+
+ t1 = (xC3S5 * ip[3*8]);
+ t2 = (xC5S3 * ip[5*8]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _C = t1 + t2;
+
+ t1 = (xC3S5 * ip[5*8]);
+ t2 = (xC5S3 * ip[3*8]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _D = t1 - t2;
+
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+ t1 >>= 16;
+ _Ad = t1;
+
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+ t1 >>= 16;
+ _Bd = t1;
+
+
+ _Cd = _A + _C;
+ _Dd = _B + _D;
+
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] + ip[4*8]));
+ t1 >>= 16;
+ _E = t1;
+
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] - ip[4*8]));
+ t1 >>= 16;
+ _F = t1;
+
+ t1 = (xC2S6 * ip[2*8]);
+ t2 = (xC6S2 * ip[6*8]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _G = t1 + t2;
+
+ t1 = (xC6S2 * ip[2*8]);
+ t2 = (xC2S6 * ip[6*8]);
+ t1 >>= 16;
+ t2 >>= 16;
+ _H = t1 - t2;
+
+ _Ed = _E - _G;
+ _Gd = _E + _G;
+
+ _Add = _F + _Ad;
+ _Bdd = _Bd - _H;
+
+ _Fd = _F - _Ad;
+ _Hd = _Bd + _H;
+ /* Add IdctAdjustBeforeShift (8, half of 16) so the >> 4 below rounds */
+ _Gd += IdctAdjustBeforeShift;
+ _Add += IdctAdjustBeforeShift;
+ _Ed += IdctAdjustBeforeShift;
+ _Fd += IdctAdjustBeforeShift;
+
+ /* Final sequence of operations writes the down-shifted results to the output. */
+ op[0*8] = (ogg_int16_t)((_Gd + _Cd ) >> 4);
+ op[7*8] = (ogg_int16_t)((_Gd - _Cd ) >> 4);
+
+ op[1*8] = (ogg_int16_t)((_Add + _Hd ) >> 4);
+ op[2*8] = (ogg_int16_t)((_Add - _Hd ) >> 4);
+
+ op[3*8] = (ogg_int16_t)((_Ed + _Dd ) >> 4);
+ op[4*8] = (ogg_int16_t)((_Ed - _Dd ) >> 4);
+
+ op[5*8] = (ogg_int16_t)((_Fd + _Bdd ) >> 4);
+ op[6*8] = (ogg_int16_t)((_Fd - _Bdd ) >> 4);
+ }else{
+ op[0*8] = 0;
+ op[7*8] = 0;
+ op[1*8] = 0;
+ op[2*8] = 0;
+ op[3*8] = 0;
+ op[4*8] = 0;
+ op[5*8] = 0;
+ op[6*8] = 0;
+ }
+
+ ip++; /* next column */
+ op++;
+ }
+}
+
+/************************
+ x x x x 0 0 0 0
+ x x x 0 0 0 0 0
+ x x 0 0 0 0 0 0
+ x 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+*************************/
+
+static void dequant_slow10__sse2( ogg_int16_t * dequant_coeffs,
+                                  ogg_int16_t * quantized_list,
+                                  ogg_int32_t * DCT_block){
+  int n = 0;
+  /* Zero the first 128 bytes, then scatter ten products via dezigzag_index. */
+  memset(DCT_block, 0, 128);
+  for(; n < 10; ++n)
+    DCT_block[dezigzag_index[n]] = quantized_list[n] * dequant_coeffs[n];
+}
+
+void IDct10__sse2( Q_LIST_ENTRY * InputData,
+ ogg_int16_t *QuantMatrix,
+ ogg_int16_t * OutputData ){
+ ogg_int32_t IntermediateData[64];
+ ogg_int32_t * ip = IntermediateData;
+ ogg_int16_t * op = OutputData;
+ /* Inverse DCT for blocks where only the first 10 zig-zag coefficients are dequantised. */
+ ogg_int32_t _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
+ ogg_int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+ ogg_int32_t t1, t2;
+
+ int loop;
+ /* Only the first 128 bytes (rows 0-3) of IntermediateData are initialised by this call. */
+ dequant_slow10__sse2( QuantMatrix, InputData, IntermediateData);
+
+ /* Inverse DCT on the rows now (the 10 coefficients all lie in rows 0-3) */
+ for ( loop = 0; loop < 4; loop++){
+ /* Check for non-zero values */
+ if ( ip[0] | ip[1] | ip[2] | ip[3] ){
+ t1 = (xC1S7 * ip[1]);
+ t1 >>= 16;
+ _A = t1;
+
+ t1 = (xC7S1 * ip[1]);
+ t1 >>= 16;
+ _B = t1 ;
+
+ t1 = (xC3S5 * ip[3]);
+ t1 >>= 16;
+ _C = t1;
+
+ t2 = (xC5S3 * ip[3]);
+ t2 >>= 16;
+ _D = -t2;
+
+
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+ t1 >>= 16;
+ _Ad = t1;
+
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+ t1 >>= 16;
+ _Bd = t1;
+
+
+ _Cd = _A + _C;
+ _Dd = _B + _D;
+
+ t1 = (xC4S4 * ip[0] );
+ t1 >>= 16;
+ _E = t1;
+
+ _F = t1;
+
+ t1 = (xC2S6 * ip[2]);
+ t1 >>= 16;
+ _G = t1;
+
+ t1 = (xC6S2 * ip[2]);
+ t1 >>= 16;
+ _H = t1 ;
+
+
+ _Ed = _E - _G;
+ _Gd = _E + _G;
+
+ _Add = _F + _Ad;
+ _Bdd = _Bd - _H;
+
+ _Fd = _F - _Ad;
+ _Hd = _Bd + _H;
+
+ /* Final sequence of operations over-write original inputs. */
+ ip[0] = (ogg_int16_t)((_Gd + _Cd ) >> 0);
+ ip[7] = (ogg_int16_t)((_Gd - _Cd ) >> 0);
+
+ ip[1] = (ogg_int16_t)((_Add + _Hd ) >> 0);
+ ip[2] = (ogg_int16_t)((_Add - _Hd ) >> 0);
+
+ ip[3] = (ogg_int16_t)((_Ed + _Dd ) >> 0);
+ ip[4] = (ogg_int16_t)((_Ed - _Dd ) >> 0);
+
+ ip[5] = (ogg_int16_t)((_Fd + _Bdd ) >> 0);
+ ip[6] = (ogg_int16_t)((_Fd - _Bdd ) >> 0);
+
+ }
+
+ ip += 8; /* next row */
+ }
+
+ ip = IntermediateData;
+ /* Second pass: all 8 columns, each reading only rows 0-3 of the work buffer */
+ for ( loop = 0; loop < 8; loop++) {
+ /* Check for non-zero values (bitwise or faster than ||) */
+ if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] ) {
+
+ t1 = (xC1S7 * ip[1*8]);
+ t1 >>= 16;
+ _A = t1 ;
+
+ t1 = (xC7S1 * ip[1*8]);
+ t1 >>= 16;
+ _B = t1 ;
+
+ t1 = (xC3S5 * ip[3*8]);
+ t1 >>= 16;
+ _C = t1 ;
+
+ t2 = (xC5S3 * ip[3*8]);
+ t2 >>= 16;
+ _D = - t2;
+
+
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+ t1 >>= 16;
+ _Ad = t1;
+
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+ t1 >>= 16;
+ _Bd = t1;
+
+
+ _Cd = _A + _C;
+ _Dd = _B + _D;
+
+ t1 = (xC4S4 * ip[0*8]);
+ t1 >>= 16;
+ _E = t1;
+ _F = t1;
+
+ t1 = (xC2S6 * ip[2*8]);
+ t1 >>= 16;
+ _G = t1;
+
+ t1 = (xC6S2 * ip[2*8]);
+ t1 >>= 16;
+ _H = t1;
+
+
+ _Ed = _E - _G;
+ _Gd = _E + _G;
+
+ _Add = _F + _Ad;
+ _Bdd = _Bd - _H;
+
+ _Fd = _F - _Ad;
+ _Hd = _Bd + _H;
+ /* Add IdctAdjustBeforeShift (8, half of 16) so the >> 4 below rounds */
+ _Gd += IdctAdjustBeforeShift;
+ _Add += IdctAdjustBeforeShift;
+ _Ed += IdctAdjustBeforeShift;
+ _Fd += IdctAdjustBeforeShift;
+
+ /* Final sequence of operations writes the down-shifted results to the output. */
+ op[0*8] = (ogg_int16_t)((_Gd + _Cd ) >> 4);
+ op[7*8] = (ogg_int16_t)((_Gd - _Cd ) >> 4);
+
+ op[1*8] = (ogg_int16_t)((_Add + _Hd ) >> 4);
+ op[2*8] = (ogg_int16_t)((_Add - _Hd ) >> 4);
+
+ op[3*8] = (ogg_int16_t)((_Ed + _Dd ) >> 4);
+ op[4*8] = (ogg_int16_t)((_Ed - _Dd ) >> 4);
+
+ op[5*8] = (ogg_int16_t)((_Fd + _Bdd ) >> 4);
+ op[6*8] = (ogg_int16_t)((_Fd - _Bdd ) >> 4);
+ }else{
+ op[0*8] = 0;
+ op[7*8] = 0;
+ op[1*8] = 0;
+ op[2*8] = 0;
+ op[3*8] = 0;
+ op[4*8] = 0;
+ op[5*8] = 0;
+ op[6*8] = 0;
+ }
+
+ ip++; /* next column */
+ op++;
+ }
+}
+
+/***************************
+ x 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+ 0 0 0 0 0 0 0 0
+**************************/
+
+void IDct1__sse2( Q_LIST_ENTRY * InputData,
+                  ogg_int16_t *QuantMatrix,
+                  ogg_int16_t * OutputData ){
+  /* DC-only block: multiply the first coefficient by its quantiser, add a
+     bias of 15, shift right by 5, and store that value in all 64 outputs. */
+  ogg_int32_t biased = (ogg_int32_t)(InputData[0]*QuantMatrix[0]+15);
+  ogg_int16_t fill = (ogg_int16_t)(biased>>5);
+  ogg_int16_t *out = OutputData;
+  ogg_int16_t *end = OutputData + 64;
+
+  while(out != end)
+    *out++ = fill;
+}
+
+
+void dsp_sse2_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+  /* Install the SSE2 iDCT entry points, in dsp.h declaration order.
+     NOTE(review): idct.c calls this with `funcs` only -- reconcile the prototype. */
+  (void)cpu_flags; /* not consulted here */
+  funcs->IDct1 = IDct1__sse2;
+  funcs->dequant_slow10 = dequant_slow10__sse2;
+  funcs->IDct10 = IDct10__sse2;
+  funcs->IDctSlow = IDctSlow__sse2;
+  funcs->dequant_slow = dequant_slow__sse2;
+}
\ No newline at end of file
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-04 13:29:50 UTC (rev 11516)
@@ -447,6 +447,14 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\idct_mmx.c"
+ >
+ </File>
+ <File
+ RelativePath="..\..\..\lib\x86_32_vs\idct_sse2.c"
+ >
+ </File>
+ <File
RelativePath="..\..\libtheora.def"
>
</File>
More information about the commits
mailing list