[xiph-commits] r11551 - in branches/theora-playtime: examples lib
lib/x86_32_vs win32/VS2005/libtheora
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Fri Jun 9 07:26:03 PDT 2006
Author: illiminable
Date: 2006-06-09 07:25:50 -0700 (Fri, 09 Jun 2006)
New Revision: 11551
Added:
branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
Modified:
branches/theora-playtime/examples/encoder_example.c
branches/theora-playtime/lib/dsp.c
branches/theora-playtime/lib/dsp.h
branches/theora-playtime/lib/scan.c
branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Add initialisation of the scan functions to dspfunctions
* Revert scan.c back to its original state
* Add a scan_sse2.c for the sse2 parts of scan
Modified: branches/theora-playtime/examples/encoder_example.c
===================================================================
--- branches/theora-playtime/examples/encoder_example.c 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/examples/encoder_example.c 2006-06-09 14:25:50 UTC (rev 11551)
@@ -21,8 +21,9 @@
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64
+#define FIXED_SERIAL_NO
/* Define to give performance data win32 only*/
-//#define THEORA_PERF_DATA
+#define THEORA_PERF_DATA
#ifdef THEORA_PERF_DATA
#include <windows.h>
#endif
@@ -318,9 +319,9 @@
int spinner=0;
char *spinascii="|/-\\";
void spinnit(void){
- spinner++;
- if(spinner==4)spinner=0;
- fprintf(stderr,"\r%c",spinascii[spinner]);
+ //spinner++;
+ //if(spinner==4)spinner=0;
+ //fprintf(stderr,"\r%c",spinascii[spinner]);
}
int fetch_and_process_audio(FILE *audio,ogg_page *audiopage,
@@ -663,8 +664,13 @@
{
/* need two inequal serial numbers */
int serial1, serial2;
+#ifdef FIXED_SERIAL_NO
+ serial1 = 1;
+ serial2 = 2;
+#else
serial1 = rand();
serial2 = rand();
+#endif
if (serial1 == serial2) serial2++;
ogg_stream_init(&to,serial1);
ogg_stream_init(&vo,serial2);
@@ -859,9 +865,9 @@
else
akbps=rint(audio_bytesout*8./timebase*.001);
- fprintf(stderr,
- "\r %d:%02d:%02d.%02d audio: %dkbps video: %dkbps ",
- hours,minutes,seconds,hundredths,akbps,vkbps);
+ //fprintf(stderr,
+ // "\r %d:%02d:%02d.%02d audio: %dkbps video: %dkbps ",
+ // hours,minutes,seconds,hundredths,akbps,vkbps);
}
}
Modified: branches/theora-playtime/lib/dsp.c
===================================================================
--- branches/theora-playtime/lib/dsp.c 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/lib/dsp.c 2006-06-09 14:25:50 UTC (rev 11551)
@@ -411,6 +411,7 @@
dsp_dct_init (funcs, cpuflags);
dsp_idct_init (funcs, cpuflags);
dsp_dct_decode_init(funcs, cpuflags);
+ dsp_scan_init(funcs, cpuflags);
#if defined(USE_ASM)
if (cpuflags & CPU_X86_MMX) {
dsp_mmx_init(funcs);
Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/lib/dsp.h 2006-06-09 14:25:50 UTC (rev 11551)
@@ -19,8 +19,10 @@
#define DSP_H
#include <theora/theora.h>
-/*ZEN::: Added for QLIST_ENTRY */
-//#include "codec_internal.h"
+
+
+struct PP_INSTANCE;
+
typedef unsigned long int ogg_uint64_t;
typedef struct
@@ -112,6 +114,16 @@
ogg_int32_t *BoundingValuePtr);
+ void (*RowDiffScan)( struct PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr,
+ signed char * DispFragPtr,
+ unsigned char * FDiffPixels,
+ ogg_int32_t * RowDiffsPtr,
+ unsigned char * ChLocalsPtr, int EdgeRow );
} DspFunctions;
@@ -120,6 +132,7 @@
extern void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
extern void dsp_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
extern void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
+extern void dsp_scan_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
void dsp_init(DspFunctions *funcs);
@@ -134,6 +147,7 @@
extern void dsp_sse2_recon_init(DspFunctions *funcs);
extern void dsp_sse2_idct_init(DspFunctions *funcs);
extern void dsp_sse2_dct_decode_init(DspFunctions *funcs);
+extern void dsp_sse2_scan_init(DspFunctions *funcs);
#endif
@@ -199,7 +213,8 @@
#define dsp_dct_decode_filter_vert(funcs, ptr1, a1, ptr2) \
(funcs.FilterVert (ptr1, a1, ptr2))
+#define dsp_scan_row_diff_scan(funcs, ptr1, ptr2, ptr3, ptr4, ptr5, ptr6, ptr7, ptr8, ptr9, ptr10, a1) \
+ (funcs.RowDiffScan(ptr1, ptr2, ptr3, ptr4, ptr5, ptr6, ptr7, ptr8, ptr9, ptr10, a1))
-
#endif /* DSP_H */
Modified: branches/theora-playtime/lib/scan.c
===================================================================
--- branches/theora-playtime/lib/scan.c 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/lib/scan.c 2006-06-09 14:25:50 UTC (rev 11551)
@@ -18,17 +18,11 @@
#include <stdlib.h>
#include <math.h>
#include <string.h>
-#include "codec_internal.h"
+
#include "dsp.h"
-#include "perf_helper.h"
+#include "cpu.h"
+#include "codec_internal.h"
-
-
-static unsigned __int64 perf_rds_datmf_time = 0;
-static unsigned __int64 perf_rds_datmf_count = 0;
-static unsigned __int64 perf_rds_datmf_min = -1;
-//#include "perf_helper.h"
-
#define MAX_SEARCH_LINE_LEN 7
#define SET8_0(ptr) \
@@ -688,621 +682,11 @@
}
-
-static void ApplyPakLowPass_Vectorised( PP_INSTANCE *ppi,
- unsigned char * SrcPtr,
- unsigned short * OutputPtr)
-{
-
-#if 0
-
- int i;
- for (i = 0; i < 8; i++)
- {
- unsigned char * SrcPtr1 = SrcPtr - 1;
- unsigned char * SrcPtr0 = SrcPtr1 - ppi->PlaneStride; /* Note the
- use of
- stride not
- width. */
- unsigned char * SrcPtr2 = SrcPtr1 + ppi->PlaneStride;
-
- //OutputPtr[i] = ( ( (ogg_uint32_t)SrcPtr[i-1-s] +
- // (ogg_uint32_t)SrcPtr[i-s] +
- // (ogg_uint32_t)SrcPtr[i-s+1] +
- // (ogg_uint32_t)SrcPtr[i-1] +
- // (ogg_uint32_t)SrcPtr[i+1] +
- // (ogg_uint32_t)SrcPtr[i+s-1] +
- // (ogg_uint32_t)SrcPtr[i+s] +
- // (ogg_uint32_t)SrcPtr[i+s+1] ) >> 3 );
-
- OutputPtr[i] = (unsigned char)( ( (ogg_uint32_t)SrcPtr0[0 + i] +
- (ogg_uint32_t)SrcPtr0[1 + i] +
- (ogg_uint32_t)SrcPtr0[2 + i] +
- (ogg_uint32_t)SrcPtr1[0 + i] +
- (ogg_uint32_t)SrcPtr1[2 + i] +
- (ogg_uint32_t)SrcPtr2[0 + i] +
- (ogg_uint32_t)SrcPtr2[1 + i] +
- (ogg_uint32_t)SrcPtr2[2 + i] ) >> 3 );
- }
-
-
-#else
-
- /*
- .... .... .... .... XXXX XXXX .... .... .... ....
-
-
-
-
- .... .... .... ...1 23.. ..ab c... .... .... ....
- .... .... .... ...4 X5.. ..dY e... .... .... ....
- .... .... .... ...6 78.. ..fg h... .... .... ....
-
-
- //Different numbering below
- //Showing per row for the top and bottom rows
-
- 1234567abc
-
- desired,
- 1+2+3 = A
- 2+3+4 = B
- 3+4+5 = C
- 4+5+6 = D
- 5+6+7 = E
- 6+7+a = F
- 7+a+b = G
- a+b+c = H
-
- 1 2 3 4 5 6 7 a | b c
- + _ 1 2 3 4 5 6 7 | a b c
- -------------------------------------------------------
- 1 1+2 2+3 3+4 4+5 5+6 6+7 7+a |a+b b+c c
-
- + 2 3 4 5 6 7 a | b c _
- -------------------------------------------------------
- 1+2 A B C D E F | G H
-
-
-
- //Showing per row for the middle row
-
- 1234567abc
-
- desired,
- 1+3 = A
- 2+4 = B
- 3+5 = C
- 4+6 = D
- 5+7 = E
- 6+a = F
- 7+b = G
- a+c = H
-
-
- 1 2 3 4 5 6 7 a | b c
- + _ _ 1 2 3 4 5 6 | 7 a b c
- -------------------------------------------------------
- A B C D E F G H
-
-
- */
-
- static __declspec(align(16)) unsigned long Low6WordsMask[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 };
- static unsigned char* Low6WordsMaskPtr = (unsigned char*)Low6WordsMask;
- long stride = ppi->PlaneStride;
- unsigned char* SrcPtrTopLeft = SrcPtr - stride - 1;
-
-
- __asm {
- align 16
-
- mov esi, SrcPtrTopLeft
- mov eax, Low6WordsMaskPtr
- mov ecx, stride
- mov edi, OutputPtr
-
- movdqa xmm7, [eax]
- pxor xmm0, xmm0
- pcmpeqw xmm6, xmm6 /* All 1's */
-
- /* Create the inverse mask -- xmm6 = ~xmm7 */
- pxor xmm6, xmm7
-
- /***************************************/
- /* TOP ROW OF THE 8 SURROUNDING PIXELS */
- /***************************************/
-
- /* There are 10 bytes, read the first 8 into the first register, after the shifting
- there will be 6 usable results. For the second register start at plus 2
- so it also has 8 but 6 of them overlap, this stops us reading past the block
- we are supposed to be looking at, and since we operate on the whole register
- anyway, it actually doesn't matter if theres 8 or only 2 inside */
-
- movq xmm1, QWORD PTR [esi]
- movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
-
-
-
- /* Expand to 16 bits */
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
- /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
- psrldq xmm1, 2
- psrldq xmm2, 2
- paddw xmm1, xmm3
- paddw xmm2, xmm4
-
- /* Shift right by 1 lot of 16 to get the intermediate triple sums */
- pslldq xmm3, 2
- pslldq xmm4, 2
- paddw xmm1, xmm3
- paddw xmm2, xmm4
-
- /* Now have 6 lots of triple sums in words 1-6 (0, and 7 have junk)
- in the first regsiter. and in bytes 5 and 6 of the second register
- there is the final 2 triple sums */
-
-
- /* Merge the 8 results into 1 register */
- /* Shift words 1-6 to positions 0-5 */
- psrldq xmm1, 2
- /* Shift words 5 and 6 to positions 6 and 7 - since
- we don't care about any of the other positions in this regsiter
- use the qword 64 bitwise shift which is twice as fast as the
- dq 128 bitwise one */
- psllq xmm2, 16
-
- /* Clear the high 32 bits in the first register */
- pand xmm1, xmm7
-
- /* Clear the low 6 bytes of the second register */
- pand xmm2, xmm6
-
- /* First register now contains all 8 triple sums ie. the sum of the top 3 pixels
- in each of the eight 3x3 adjacent blocks */
- por xmm1, xmm2
-
-
-
- /***************************************/
- /* BOTTOM ROW OF THE 8 SURROUNDING PIXELS */
- /***************************************/
-
- /* Jump down 2 lines */
- lea esi, [esi + ecx*2]
-
- /* There are 10 bytes, read the first 8 into the first register, after the shifting
- there will be 6 usable results. For the second register start at plus 2
- so it also has 8 but 6 of them overlap, this stops us reading past the block
- we are supposed to be looking at, and since we operate on the whole register
- anyway, it actually doesn't matter if theres 8 or only 2 inside */
-
- movq xmm5, QWORD PTR [esi]
- movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
-
-
- /* Expand to 16 bits */
- punpcklbw xmm5, xmm0
- punpcklbw xmm2, xmm0
- movdqa xmm3, xmm5
- movdqa xmm4, xmm2
-
- /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
- psrldq xmm5, 2
- psrldq xmm2, 2
- paddw xmm5, xmm3
- paddw xmm2, xmm4
-
- /* Shift right by 1 lot of 16 to get the intermediate triple sums */
- pslldq xmm3, 2
- pslldq xmm4, 2
- paddw xmm5, xmm3
- paddw xmm2, xmm4
-
- /* Now have 6 lots of triple sums in words 1-6 (0, and 7 have junk)
- in the first regsiter. and in bytes 5 and 6 of the second register
- there is the final 2 triple sums */
-
-
- /* Merge the 8 results into 1 register */
- /* Shift words 1-6 to positions 0-5 */
- psrldq xmm5, 2
- /* Shift words 5 and 6 to positions 6 and 7 - since
- we don't care about any of the other positions in this regsiter
- use the dword 32 bitwise shift which is twice as fast as the
- dq 128 bitwise one */
- psllq xmm2, 16
-
- /* Clear the high 32 bits in the first register */
- pand xmm5, xmm7
-
- /* Clear the low 6 bytes of the second register */
- pand xmm2, xmm6
-
- /* First register now contains all 8 triple sums */
- por xmm5, xmm2
-
-
- /* xmm1 contains the top rows, and xmm5 the bottom rows
- now sum the top rows into the bottom rows.
- */
- paddw xmm5, xmm1
-
-
-
- /***************************************/
- /* MIDDLE ROW OF THE 8 SURROUNDING PIXELS */
- /***************************************/
-
- /* Go back one row to the middle row */
- sub esi, ecx
-
- /* In this row, the middle pixel of each consecutive 3 is not to be summed */
-
-
- movq xmm1, QWORD PTR [esi]
- movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
- //movdqa xmm7, [eax]
-
-
- /* Expand to 16 bits */
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
- /* Shift all 8 items right by 2 lot of 16 bits to get the intermediate sums */
- psrldq xmm1, 4
- psrldq xmm2, 4
- paddw xmm1, xmm3
- paddw xmm2, xmm4
-
- /* Merge the 8 results into 1 register */
- /* First register has words 0-5 filled with sums */
-
- /* Shift words 4 and 5 to positions 6 and 7 - since
- we don't care about any of the other positions in this regsiter
- use the qword 64 bitwise shift which is twice as fast as the
- dq 128 bitwise one */
- psllq xmm2, 32
-
- /* Clear the high 32 bits in the first register */
- pand xmm1, xmm7
-
- /* Clear the low 6 bytes of the second register */
- pand xmm2, xmm6
-
- /* First register now contains the sum of the left and right pixel
- for each of the eight 3x3 adjacent blocks */
- por xmm1, xmm2
-
-
- /* ---------------------- */
-
- /* Final 8 sums */
- paddw xmm1, xmm5
-
- /* Divide by 8 */
- psrlw xmm1, 3
-
- /* Write it into temp[0..16] */
- movdqa [edi], xmm1
- }
-
-#endif
-}
-
-/* This is a new function factor out of rowdiffscan, maybe needs a better name */
-static ogg_int32_t RowDiffScan_DiffAndThresholding(PP_INSTANCE *ppi,
+static void RowDiffScan__c( PP_INSTANCE *ppi,
unsigned char * YuvPtr1,
unsigned char * YuvPtr2,
ogg_int16_t * YUVDiffsPtr,
unsigned char * bits_map_ptr,
- signed char * SgcPtr)
-{
- ogg_int16_t Diff; /* Temp local workspace. */
- ogg_int32_t j;
- ogg_int32_t FragChangedPixels = 0;
-
- for ( j = 0; j < HFRAGPIXELS; j++ ){
- /* Take a local copy of the measured difference. */
- Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
-
- /* Store the actual difference value */
- YUVDiffsPtr[j] = Diff;
-
- /* Test against the Level thresholds and record the results */
- SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
-
- /* Test against the SRF thresholds */
- bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
- FragChangedPixels += ppi->SrfThreshTable[Diff+255];
- }
-
- return FragChangedPixels;
-
-}
-
-/* This is a new function factor out of rowdiffscan, maybe needs a better name */
-static ogg_int32_t RowDiffScan_DiffAndThresholdingFirstFrag(PP_INSTANCE *ppi,
- unsigned char * YuvPtr1,
- unsigned char * YuvPtr2,
- ogg_int16_t * YUVDiffsPtr,
- unsigned char * bits_map_ptr,
- signed char * SgcPtr)
-{
-
- ogg_int16_t Diff; /* Temp local workspace. */
- ogg_int32_t j;
- ogg_int32_t FragChangedPixels = 0;
-
- for ( j = 0; j < HFRAGPIXELS; j++ ){
- /* Take a local copy of the measured difference. */
- Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
-
- /* Store the actual difference value */
- YUVDiffsPtr[j] = Diff;
-
- /* Test against the Level thresholds and record the results */
- SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
-
- if (j>0 && ppi->SrfPakThreshTable[Diff+255] )
- Diff = (int)ApplyPakLowPass( ppi, &YuvPtr1[j] ) -
- (int)ApplyPakLowPass( ppi, &YuvPtr2[j] );
-
- /* Test against the SRF thresholds */
- bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
- FragChangedPixels += ppi->SrfThreshTable[Diff+255];
- }
- return FragChangedPixels;
-
-}
-
-
-
-/* This is a new function factor out of rowdiffscan, maybe needs a better name */
-static ogg_int32_t RowDiffScan_DiffAndThresholdingLastFrag(PP_INSTANCE *ppi,
- unsigned char * YuvPtr1,
- unsigned char * YuvPtr2,
- ogg_int16_t * YUVDiffsPtr,
- unsigned char * bits_map_ptr,
- signed char * SgcPtr)
-{
- ogg_int16_t Diff; /* Temp local workspace. */
- ogg_int32_t j;
- ogg_int32_t FragChangedPixels = 0;
-
- for ( j = 0; j < HFRAGPIXELS; j++ ){
- /* Take a local copy of the measured difference. */
- Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
-
- /* Store the actual difference value */
- YUVDiffsPtr[j] = Diff;
-
- /* Test against the Level thresholds and record the results */
- SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
-
- if (j<7 && ppi->SrfPakThreshTable[Diff+255] )
- Diff = (int)ApplyPakLowPass( ppi, &YuvPtr1[j] ) -
- (int)ApplyPakLowPass( ppi, &YuvPtr2[j] );
-
-
- /* Test against the SRF thresholds */
- bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
- FragChangedPixels += ppi->SrfThreshTable[Diff+255];
- }
- return FragChangedPixels;
-
-}
-
-
-
-
-
-
-/* This is a new function factor out of rowdiffscan, maybe needs a better name */
-static ogg_int32_t RowDiffScan_DiffAndThresholdingMiddleFrag(PP_INSTANCE *ppi,
- unsigned char * YuvPtr1,
- unsigned char * YuvPtr2,
- ogg_int16_t * YUVDiffsPtr,
- unsigned char * bits_map_ptr,
- signed char * SgcPtr)
-{
-#if 0
-
- /* 10% of all encode exectution is in this function, most
- heavily used function in alpha 6 */
-
- ogg_int16_t Diff; /* Temp local workspace. */
- ogg_int32_t j;
- ogg_int32_t FragChangedPixels = 0;
-
-
- for ( j = 0; j < HFRAGPIXELS; j++ ){
- /* Take a local copy of the measured difference. */
- Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
-
- /* Store the actual difference value */
- YUVDiffsPtr[j] = Diff;
-
- /* Test against the Level thresholds and record the results */
- SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
-
- if (ppi->SrfPakThreshTable[Diff+255] )
- Diff = (int)ApplyPakLowPass( ppi, &YuvPtr1[j] ) -
- (int)ApplyPakLowPass( ppi, &YuvPtr2[j] );
-
-
- /* Test against the SRF thresholds */
- bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
- FragChangedPixels += ppi->SrfThreshTable[Diff+255];
- }
- return FragChangedPixels;
- //PERF_BLOCK_END("RowDiffScan_DiffAndThresholdingMiddleFrag", perf_rds_datmf_time, perf_rds_datmf_time,perf_rds_datmf_time, 10000);
-
-#else
-
- static __declspec(align(16)) unsigned long Some255s[4] = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
- static __declspec(align(16)) unsigned char temp[48];
- static unsigned short* temp_ptr = (unsigned short*)temp;
-
- static unsigned char* some_255s_ptr = (unsigned char*)Some255s;
- unsigned char* local_sgc_thresh_table = ppi->SgcThreshTable;
- unsigned char* local_srf_thresh_table = ppi->SrfThreshTable;
- unsigned char* local_srf_pak_thresh_table = ppi->SrfPakThreshTable;
-
-
- unsigned char thresh_val;
- int i, FragChangedPixels = 0;
-
-
- __asm {
- align 16
- mov esi, YuvPtr1
- mov edx, YuvPtr2
- mov edi, YUVDiffsPtr /* Not aligned */
- mov eax, some_255s_ptr;
- mov ecx, temp_ptr
-
- movdqa xmm7, [eax]
- pxor xmm0, xmm0
-
- /* Load yuvptr1[0..7] into low 8 bytes */
- movq xmm1, QWORD PTR [esi]
- /* Load yuvptr2[0..7] into low 8 bytes */
- movq xmm2, QWORD PTR [edx]
-
- /* Unpack to 16 bits */
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- /* Subtract the YUV Ptr values */
- psubw xmm1, xmm2 /*should it be subsw?? */
-
- /* Write out to YUVDiffs */
- movdqu [edi], xmm1
-
- /* Add 255 to them all */
- paddw xmm1, xmm7
-
- /* Write them to the temp area */
- movdqa [ecx], xmm1
-
-
-
- }
-
- ApplyPakLowPass_Vectorised(ppi, YuvPtr1, temp_ptr + 8); /* Bytes 16-31 */
- ApplyPakLowPass_Vectorised(ppi, YuvPtr2, temp_ptr + 16); /* Bytes 32 - 47 */
-
- __asm {
- align 16
-
- mov esi, temp_ptr
- mov ecx, some_255s_ptr
-
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + 32]
-
- movdqa xmm6, [ecx]
-
- /* New diffs after PakLowPass */
- psubw xmm1, xmm2
-
- /* Add 255 to the diffs */
- paddw xmm1, xmm6
-
- /* Write back out to temp */
- movdqa [esi +16], xmm1
-
- /* Now need to process with normal registers ops */
-
-
-
- /* At this point
- temp_ptr[0..15] = 8 lots of Early loop diffs + 255
- temp_ptr[16..31] = 8 lots of late loop diffs + 255
- temp_ptr[32..47] = who cares */
-
- }
-
-
- /* Apply the pak threash_table and write into temp[32..47] */
- temp_ptr[16] = local_srf_pak_thresh_table[temp_ptr[0]];
- temp_ptr[17] = local_srf_pak_thresh_table[temp_ptr[1]];
- temp_ptr[18] = local_srf_pak_thresh_table[temp_ptr[2]];
- temp_ptr[19] = local_srf_pak_thresh_table[temp_ptr[3]];
- temp_ptr[20] = local_srf_pak_thresh_table[temp_ptr[4]];
- temp_ptr[21] = local_srf_pak_thresh_table[temp_ptr[5]];
- temp_ptr[22] = local_srf_pak_thresh_table[temp_ptr[6]];
- temp_ptr[23] = local_srf_pak_thresh_table[temp_ptr[7]];
-
- __asm {
- align 16
-
- //mov edx, YUVDiffsPtr
- mov esi, temp_ptr
-
- /* Read back the old diffs+255 */
- movdqu xmm4, [esi]
-
- /* Read back the new diffs+255 */
- movdqa xmm3, [esi + 16]
-
- /* Read back the pak_threshed values used in the if statement */
- movdqa xmm6, [esi + 32]
-
- pxor xmm0, xmm0
- pcmpeqw xmm7, xmm7 /* All 1's */
-
- /* Compare the pak_thresh values to 0, any word which was 0, will now be set to all 1's in xmm0
- the if basically said, if it's zero, leave it alone, otherwise, replace it
- with the new diff */
- pcmpeqw xmm0, xmm6
-
- /* On the old diffs, keep all the words where the pak_thresh is zero */
- pand xmm4, xmm0
-
- /* Flip the bits so that the places that were 0 are now all zeros */
- pxor xmm0, xmm7
-
- /* This zero's out all the words in the new diffs which were 0 in the pak_thresh */
- pand xmm3, xmm0
-
- /* Merge the old and new diffs */
- por xmm3, xmm4
-
- /* Write back out to temp */
- movdqa [esi + 32], xmm3
- }
-
- for (i = 0; i < 8; i++)
- {
-
- thresh_val = local_srf_thresh_table[temp_ptr[16 + i]];
- SgcPtr[0] += local_sgc_thresh_table[temp_ptr[i]];
- bits_map_ptr[i] = thresh_val;
- FragChangedPixels += thresh_val;
-
- }
-
- return FragChangedPixels;
-
-
-
-#endif
-}
-
-
-static void RowDiffScan( PP_INSTANCE *ppi,
- unsigned char * YuvPtr1,
- unsigned char * YuvPtr2,
- ogg_int16_t * YUVDiffsPtr,
- unsigned char * bits_map_ptr,
signed char * SgcPtr,
signed char * DispFragPtr,
unsigned char * FDiffPixels,
@@ -1326,13 +710,20 @@
/* Clear down entries in changed locals array */
SET8_0(ChLocalsPtr);
- FragChangedPixels += RowDiffScan_DiffAndThresholding( ppi,
- YuvPtr1,
- YuvPtr2,
- YUVDiffsPtr,
- bits_map_ptr,
- SgcPtr);
-
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+
+ /* Test against the SRF thresholds */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
@@ -1377,14 +768,24 @@
/* Clear down entries in changed locals array */
SET8_0(ChLocalsPtr);
- FragChangedPixels += RowDiffScan_DiffAndThresholdingFirstFrag(
- ppi,
- YuvPtr1,
- YuvPtr2,
- YUVDiffsPtr,
- bits_map_ptr,
- SgcPtr);
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+
+ if (j>0 && ppi->SrfPakThreshTable[Diff+255] )
+ Diff = (int)ApplyPakLowPass( ppi, &YuvPtr1[j] ) -
+ (int)ApplyPakLowPass( ppi, &YuvPtr2[j] );
+
+ /* Test against the SRF thresholds */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
@@ -1426,15 +827,25 @@
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
SET8_0(ChLocalsPtr);
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
- FragChangedPixels += RowDiffScan_DiffAndThresholdingMiddleFrag(
- ppi,
- YuvPtr1,
- YuvPtr2,
- YUVDiffsPtr,
- bits_map_ptr,
- SgcPtr);
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+
+ if (ppi->SrfPakThreshTable[Diff+255] )
+ Diff = (int)ApplyPakLowPass( ppi, &YuvPtr1[j] ) -
+ (int)ApplyPakLowPass( ppi, &YuvPtr2[j] );
+
+
+ /* Test against the SRF thresholds */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
@@ -1476,15 +887,25 @@
/* Clear down entries in changed locals array */
SET8_0(ChLocalsPtr);
- FragChangedPixels += RowDiffScan_DiffAndThresholdingLastFrag(
- ppi,
- YuvPtr1,
- YuvPtr2,
- YUVDiffsPtr,
- bits_map_ptr,
- SgcPtr);
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+
+ if (j<7 && ppi->SrfPakThreshTable[Diff+255] )
+ Diff = (int)ApplyPakLowPass( ppi, &YuvPtr1[j] ) -
+ (int)ApplyPakLowPass( ppi, &YuvPtr2[j] );
+
+
+ /* Test against the SRF thresholds */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
}else{
/* If we are breaking out here mark all pixels as changed.*/
if ( *DispFragPtr > BLOCK_NOT_CODED ) {
@@ -2695,7 +2116,7 @@
if ( UpdatedOrCandidateBlocks ){
/* Scan the row for interesting differences */
/* Also clear the array that will be used for changed locals map */
- RowDiffScan( ppi, RawPlanePtr0, RawPlanePtr1,
+ dsp_scan_row_diff_scan( ppi->dsp, ppi, RawPlanePtr0, RawPlanePtr1,
YUVDiffsPtr, PixelsChangedPtr0,
&ppi->SameGreyDirPixels[FragIndex],
DispFragPtr0, &ppi->FragDiffPixels[FragIndex],
@@ -2880,3 +2301,21 @@
return ppi->OutputBlocksUpdated;
}
+
+void dsp_scan_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+ funcs->RowDiffScan = RowDiffScan__c;
+ //funcs->copy8x8 = copy8x8__c;
+ //funcs->recon_intra8x8 = recon_intra8x8__c;
+ //funcs->recon_inter8x8 = recon_inter8x8__c;
+ //funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+#if defined(USE_ASM)
+ //if (cpu_flags & CPU_X86_MMX) {
+ // dsp_mmx_scan_init(funcs);
+ //}
+
+ if (cpu_flags & CPU_X86_SSE2) {
+ dsp_sse2_scan_init(funcs);
+ }
+#endif
+}
Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-09 14:25:50 UTC (rev 11551)
@@ -173,6 +173,8 @@
+
+
void IDctSlow__sse2( Q_LIST_ENTRY * InputData,
ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData ) {
@@ -395,6 +397,7 @@
static void dequant_slow10__sse2( ogg_int16_t * dequant_coeffs,
ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block){
+
#if 0
int i;
PERF_BLOCK_START();
Added: branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/scan_sse2.c 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/lib/x86_32_vs/scan_sse2.c 2006-06-09 14:25:50 UTC (rev 11551)
@@ -0,0 +1,936 @@
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: scan.c 11548 2006-06-09 09:37:51Z illiminable $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include "codec_internal.h"
+#include "dsp.h"
+#include "perf_helper.h"
+
+
+#define MAX_SEARCH_LINE_LEN 7
+
+#define SET8_0(ptr) ( /* store 0x00 in the 8 bytes at ptr; one expression, so safe under an unbraced if/else */ \
+ ((ogg_uint32_t *)(ptr))[0] = 0x00000000, \
+ ((ogg_uint32_t *)(ptr))[1] = 0x00000000 )
+#define SET8_1(ptr) ( /* store 0x01 in the 8 bytes at ptr; one expression, so safe under an unbraced if/else */ \
+ ((ogg_uint32_t *)(ptr))[0] = 0x01010101, \
+ ((ogg_uint32_t *)(ptr))[1] = 0x01010101 )
+#define SET8_8(ptr) ( /* store 0x08 in the 8 bytes at ptr; one expression, so safe under an unbraced if/else */ \
+ ((ogg_uint32_t *)(ptr))[0] = 0x08080808, \
+ ((ogg_uint32_t *)(ptr))[1] = 0x08080808 )
+
+static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
+ 0, 0, 0, 0, 2, 4, 12, 24
+};
+ /* NOTE(review): LineLengthScores, BodyNeighbourScore and DiffDevisor are not referenced anywhere else in this file. */
+static ogg_uint32_t BodyNeighbourScore = 8;
+static double DiffDevisor = 0.0625;
+#define HISTORY_BLOCK_FACTOR 2
+#define MIN_STEP_THRESH 6
+#define SCORE_MULT_LOW 0.5
+#define SCORE_MULT_HIGH 4
+
+#define UP 0
+#define DOWN 1
+#define LEFT 2
+#define RIGHT 3
+
+#define INTERNAL_BLOCK_HEIGHT 8
+#define INTERNAL_BLOCK_WIDTH 8
+ /* Values compared against / stored into *DispFragPtr by RowDiffScan__sse2; this file only uses BLOCK_NOT_CODED, BLOCK_CODED_LOW and CANDIDATE_BLOCK. */
+#define BLOCK_NOT_CODED 0
+#define BLOCK_CODED_BAR 3
+#define BLOCK_CODED_SGC 4 /* same numeric value as BLOCK_CODED_LOW */
+#define BLOCK_CODED_LOW 4
+#define BLOCK_CODED 5
+
+#define CANDIDATE_BLOCK_LOW -2
+#define CANDIDATE_BLOCK -1
+
+#define FIRST_ROW 0
+#define NOT_EDGE_ROW 1
+#define LAST_ROW 2
+
+#define YDIFF_CB_ROWS (INTERNAL_BLOCK_HEIGHT * 3)
+#define CHLOCALS_CB_ROWS (INTERNAL_BLOCK_HEIGHT * 3)
+#define PMAP_CB_ROWS (INTERNAL_BLOCK_HEIGHT * 3)
+
+ /* Accumulators handed to PERF_BLOCK_END at the end of RowDiffScan__sse2. */
+static unsigned __int64 perf_rds_datmf_time = 0;
+static unsigned __int64 perf_rds_datmf_count = 0;
+static unsigned __int64 perf_rds_datmf_min = -1; /* -1 converts to the largest unsigned 64-bit value */
+
+/* Scalar 3x3 low pass: returns the mean of the 8 pixels surrounding *SrcPtr (the centre pixel itself is excluded). Plain C; kept only until all the branches have been vectorised. */
+static unsigned char ApplyPakLowPass__sse2( PP_INSTANCE *ppi,
+ unsigned char * SrcPtr ){
+ unsigned char * SrcPtr1 = SrcPtr - 1; /* left neighbour on the same row */
+ unsigned char * SrcPtr0 = SrcPtr1 - ppi->PlaneStride; /* Row above.
+ Note the use
+ of stride,
+ not width. */
+ unsigned char * SrcPtr2 = SrcPtr1 + ppi->PlaneStride; /* row below */
+
+ return (unsigned char)( ( (ogg_uint32_t)SrcPtr0[0] +
+ (ogg_uint32_t)SrcPtr0[1] +
+ (ogg_uint32_t)SrcPtr0[2] +
+ (ogg_uint32_t)SrcPtr1[0] +
+ (ogg_uint32_t)SrcPtr1[2] +
+ (ogg_uint32_t)SrcPtr2[0] +
+ (ogg_uint32_t)SrcPtr2[1] +
+ (ogg_uint32_t)SrcPtr2[2] ) >> 3 ); /* eight terms, so >> 3 is the mean */
+
+}
+
+static void ApplyPakLowPass_Vectorised__sse2( PP_INSTANCE *ppi,
+ unsigned char * SrcPtr,
+ unsigned short * OutputPtr)
+{
+ /* Low-passes 8 adjacent pixels at once: for each of SrcPtr[0..7] the 8 surrounding pixels (centre excluded) are summed and divided by 8, and the results are stored as eight 16-bit words at OutputPtr. The final store is movdqa, so OutputPtr must be 16-byte aligned. The #if 0 branch is the scalar reference version. */
+#if 0
+
+ int i;
+ for (i = 0; i < 8; i++)
+ {
+ unsigned char * SrcPtr1 = SrcPtr - 1;
+ unsigned char * SrcPtr0 = SrcPtr1 - ppi->PlaneStride; /* Note the
+ use of
+ stride not
+ width. */
+ unsigned char * SrcPtr2 = SrcPtr1 + ppi->PlaneStride;
+
+ //OutputPtr[i] = ( ( (ogg_uint32_t)SrcPtr[i-1-s] +
+ // (ogg_uint32_t)SrcPtr[i-s] +
+ // (ogg_uint32_t)SrcPtr[i-s+1] +
+ // (ogg_uint32_t)SrcPtr[i-1] +
+ // (ogg_uint32_t)SrcPtr[i+1] +
+ // (ogg_uint32_t)SrcPtr[i+s-1] +
+ // (ogg_uint32_t)SrcPtr[i+s] +
+ // (ogg_uint32_t)SrcPtr[i+s+1] ) >> 3 );
+
+ OutputPtr[i] = (unsigned char)( ( (ogg_uint32_t)SrcPtr0[0 + i] +
+ (ogg_uint32_t)SrcPtr0[1 + i] +
+ (ogg_uint32_t)SrcPtr0[2 + i] +
+ (ogg_uint32_t)SrcPtr1[0 + i] +
+ (ogg_uint32_t)SrcPtr1[2 + i] +
+ (ogg_uint32_t)SrcPtr2[0 + i] +
+ (ogg_uint32_t)SrcPtr2[1 + i] +
+ (ogg_uint32_t)SrcPtr2[2 + i] ) >> 3 );
+ }
+
+
+#else
+
+ /*
+ .... .... .... .... XXXX XXXX .... .... .... ....
+
+
+
+
+ .... .... .... ...1 23.. ..ab c... .... .... ....
+ .... .... .... ...4 X5.. ..dY e... .... .... ....
+ .... .... .... ...6 78.. ..fg h... .... .... ....
+
+
+ //Different numbering below
+ //Showing per row for the top and bottom rows
+
+ 1234567abc
+
+ desired,
+ 1+2+3 = A
+ 2+3+4 = B
+ 3+4+5 = C
+ 4+5+6 = D
+ 5+6+7 = E
+ 6+7+a = F
+ 7+a+b = G
+ a+b+c = H
+
+ 1 2 3 4 5 6 7 a | b c
+ + _ 1 2 3 4 5 6 7 | a b c
+ -------------------------------------------------------
+ 1 1+2 2+3 3+4 4+5 5+6 6+7 7+a |a+b b+c c
+
+ + 2 3 4 5 6 7 a | b c _
+ -------------------------------------------------------
+ 1+2 A B C D E F | G H
+
+
+
+ //Showing per row for the middle row
+
+ 1234567abc
+
+ desired,
+ 1+3 = A
+ 2+4 = B
+ 3+5 = C
+ 4+6 = D
+ 5+7 = E
+ 6+a = F
+ 7+b = G
+ a+c = H
+
+
+ 1 2 3 4 5 6 7 a | b c
+ + _ _ 1 2 3 4 5 6 | 7 a b c
+ -------------------------------------------------------
+ A B C D E F G H
+
+
+ */
+
+ static __declspec(align(16)) unsigned long Low6WordsMask[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 };
+ static unsigned char* Low6WordsMaskPtr = (unsigned char*)Low6WordsMask;
+ long stride = ppi->PlaneStride;
+ unsigned char* SrcPtrTopLeft = SrcPtr - stride - 1;
+
+
+ __asm {
+ align 16
+
+ mov esi, SrcPtrTopLeft
+ mov eax, Low6WordsMaskPtr
+ mov ecx, stride
+ mov edi, OutputPtr
+
+ movdqa xmm7, [eax]
+ pxor xmm0, xmm0
+ pcmpeqw xmm6, xmm6 /* All 1's */
+
+ /* Create the inverse mask -- xmm6 = ~xmm7 (keeps only words 6 and 7) */
+ pxor xmm6, xmm7
+
+ /***************************************/
+ /* TOP ROW OF THE 8 SURROUNDING PIXELS */
+ /***************************************/
+
+ /* There are 10 bytes, read the first 8 into the first register, after the shifting
+ there will be 6 usable results. For the second register start at plus 2
+ so it also has 8 but 6 of them overlap, this stops us reading past the block
+ we are supposed to be looking at, and since we operate on the whole register
+ anyway, it actually doesn't matter if there are 8 or only 2 inside */
+
+ movq xmm1, QWORD PTR [esi]
+ movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
+
+
+
+ /* Expand to 16 bits */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
+ psrldq xmm1, 2
+ psrldq xmm2, 2
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+
+ /* Shift the saved copies left by 1 lot of 16 bits to get the triple sums */
+ pslldq xmm3, 2
+ pslldq xmm4, 2
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+
+ /* Now have 6 lots of triple sums in words 1-6 (0, and 7 have junk)
+ in the first register, and in words 5 and 6 of the second register
+ there are the final 2 triple sums */
+
+
+ /* Merge the 8 results into 1 register */
+ /* Shift words 1-6 to positions 0-5 */
+ psrldq xmm1, 2
+ /* Shift words 5 and 6 to positions 6 and 7 - since
+ we don't care about any of the other positions in this register
+ use the qword 64 bitwise shift which is twice as fast as the
+ dq 128 bitwise one */
+ psllq xmm2, 16
+
+ /* Clear the high 32 bits (words 6 and 7) in the first register */
+ pand xmm1, xmm7
+
+ /* Clear the low 6 words (12 bytes) of the second register */
+ pand xmm2, xmm6
+
+ /* First register now contains all 8 triple sums ie. the sum of the top 3 pixels
+ in each of the eight 3x3 adjacent blocks */
+ por xmm1, xmm2
+
+
+
+ /***************************************/
+ /* BOTTOM ROW OF THE 8 SURROUNDING PIXELS */
+ /***************************************/
+
+ /* Jump down 2 lines */
+ lea esi, [esi + ecx*2]
+
+ /* There are 10 bytes, read the first 8 into the first register, after the shifting
+ there will be 6 usable results. For the second register start at plus 2
+ so it also has 8 but 6 of them overlap, this stops us reading past the block
+ we are supposed to be looking at, and since we operate on the whole register
+ anyway, it actually doesn't matter if there are 8 or only 2 inside */
+
+ movq xmm5, QWORD PTR [esi]
+ movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
+
+
+ /* Expand to 16 bits */
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm2, xmm0
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm2
+
+ /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
+ psrldq xmm5, 2
+ psrldq xmm2, 2
+ paddw xmm5, xmm3
+ paddw xmm2, xmm4
+
+ /* Shift the saved copies left by 1 lot of 16 bits to get the triple sums */
+ pslldq xmm3, 2
+ pslldq xmm4, 2
+ paddw xmm5, xmm3
+ paddw xmm2, xmm4
+
+ /* Now have 6 lots of triple sums in words 1-6 (0, and 7 have junk)
+ in the first register, and in words 5 and 6 of the second register
+ there are the final 2 triple sums */
+
+
+ /* Merge the 8 results into 1 register */
+ /* Shift words 1-6 to positions 0-5 */
+ psrldq xmm5, 2
+ /* Shift words 5 and 6 to positions 6 and 7 - since
+ we don't care about any of the other positions in this register
+ use the qword 64 bitwise shift which is twice as fast as the
+ dq 128 bitwise one */
+ psllq xmm2, 16
+
+ /* Clear the high 32 bits (words 6 and 7) in the first register */
+ pand xmm5, xmm7
+
+ /* Clear the low 6 words (12 bytes) of the second register */
+ pand xmm2, xmm6
+
+ /* First register now contains all 8 triple sums */
+ por xmm5, xmm2
+
+
+ /* xmm1 contains the top rows, and xmm5 the bottom rows
+ now sum the top rows into the bottom rows.
+ */
+ paddw xmm5, xmm1
+
+
+
+ /***************************************/
+ /* MIDDLE ROW OF THE 8 SURROUNDING PIXELS */
+ /***************************************/
+
+ /* Go back one row to the middle row */
+ sub esi, ecx
+
+ /* In this row, the middle pixel of each consecutive 3 is not to be summed */
+
+
+ movq xmm1, QWORD PTR [esi]
+ movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
+ //movdqa xmm7, [eax]
+
+
+ /* Expand to 16 bits */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ /* Shift all 8 items right by 2 lots of 16 bits to get the left+right sums */
+ psrldq xmm1, 4
+ psrldq xmm2, 4
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+
+ /* Merge the 8 results into 1 register */
+ /* First register has words 0-5 filled with sums */
+
+ /* Shift words 4 and 5 to positions 6 and 7 - since
+ we don't care about any of the other positions in this register
+ use the qword 64 bitwise shift which is twice as fast as the
+ dq 128 bitwise one */
+ psllq xmm2, 32
+
+ /* Clear the high 32 bits (words 6 and 7) in the first register */
+ pand xmm1, xmm7
+
+ /* Clear the low 6 words (12 bytes) of the second register */
+ pand xmm2, xmm6
+
+ /* First register now contains the sum of the left and right pixel
+ for each of the eight 3x3 adjacent blocks */
+ por xmm1, xmm2
+
+
+ /* ---------------------- */
+
+ /* Final 8 sums */
+ paddw xmm1, xmm5
+
+ /* Divide by 8 */
+ psrlw xmm1, 3
+
+ /* Write the 8 word results to OutputPtr (aligned store) */
+ movdqa [edi], xmm1
+ }
+
+#endif
+}
+
+/* Factored out of RowDiffScan (maybe needs a better name). Plain diff-and-threshold pass, with no low pass, over HFRAGPIXELS pixels: writes YuvPtr1[j]-YuvPtr2[j] to YUVDiffsPtr, the SRF threshold result to bits_map_ptr, adds the SGC threshold results into SgcPtr[0], and returns the sum of the SRF results. Plain C despite the __sse2 suffix. */
+static ogg_int32_t RowDiffScan_DiffAndThresholding__sse2(PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr)
+{
+ ogg_int16_t Diff; /* Temp local workspace. */
+ ogg_int32_t j;
+ ogg_int32_t FragChangedPixels = 0;
+
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference (range -255..255). */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+
+ /* Test against the Level thresholds and record the results; the +255 turns the signed difference into a non-negative table index */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+
+ /* Test against the SRF thresholds */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
+
+ return FragChangedPixels;
+
+}
+
+/* Factored out of RowDiffScan (maybe needs a better name). Diff-and-threshold pass used for the first fragment of a row: like the plain version, but where SrfPakThreshTable is non-zero for a difference the SRF test uses the difference of the low-passed pixels instead. Returns the sum of the SRF threshold results. Plain C despite the __sse2 suffix. */
+static ogg_int32_t RowDiffScan_DiffAndThresholdingFirstFrag__sse2(PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr)
+{
+
+ ogg_int16_t Diff; /* Temp local workspace. */
+ ogg_int32_t j;
+ ogg_int32_t FragChangedPixels = 0;
+
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+ /* The low pass reads the pixel at SrcPtr-1, so it is skipped for j==0 -- presumably because that column lies before the start of the row. */
+ if (j>0 && ppi->SrfPakThreshTable[Diff+255] )
+ Diff = (int)ApplyPakLowPass__sse2( ppi, &YuvPtr1[j] ) -
+ (int)ApplyPakLowPass__sse2( ppi, &YuvPtr2[j] );
+
+ /* Test against the SRF thresholds (using the low-passed difference when it was computed) */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
+ return FragChangedPixels;
+
+}
+
+
+
+/* Factored out of RowDiffScan (maybe needs a better name). Diff-and-threshold pass used for the last fragment of a row: like the plain version, but where SrfPakThreshTable is non-zero for a difference the SRF test uses the difference of the low-passed pixels instead. Returns the sum of the SRF threshold results. Plain C despite the __sse2 suffix. */
+static ogg_int32_t RowDiffScan_DiffAndThresholdingLastFrag__sse2(PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr)
+{
+ ogg_int16_t Diff; /* Temp local workspace. */
+ ogg_int32_t j;
+ ogg_int32_t FragChangedPixels = 0;
+
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+ /* The low pass reads the pixel at SrcPtr+1, so it is skipped for j==7 -- presumably HFRAGPIXELS-1, i.e. the last column of the row; TODO confirm. */
+ if (j<7 && ppi->SrfPakThreshTable[Diff+255] )
+ Diff = (int)ApplyPakLowPass__sse2( ppi, &YuvPtr1[j] ) -
+ (int)ApplyPakLowPass__sse2( ppi, &YuvPtr2[j] );
+
+
+ /* Test against the SRF thresholds (using the low-passed difference when it was computed) */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
+ return FragChangedPixels;
+
+}
+
+
+
+
+
+
+/* Factored out of RowDiffScan (maybe needs a better name). Diff-and-threshold pass for fragments that are neither first nor last in a row, processing 8 pixels per call with SSE2. The #if 0 branch is the scalar reference. Returns the sum of the SRF threshold results. */
+static __inline ogg_int32_t RowDiffScan_DiffAndThresholdingMiddleFrag__sse2(PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr)
+{
+#if 0
+
+ /* 10% of all encode execution is in this function, most
+ heavily used function in alpha 6 */
+
+ ogg_int16_t Diff; /* Temp local workspace. */
+ ogg_int32_t j;
+ ogg_int32_t FragChangedPixels = 0;
+
+
+ for ( j = 0; j < HFRAGPIXELS; j++ ){
+ /* Take a local copy of the measured difference. */
+ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+
+ /* Store the actual difference value */
+ YUVDiffsPtr[j] = Diff;
+
+ /* Test against the Level thresholds and record the results */
+ SgcPtr[0] += ppi->SgcThreshTable[Diff+255];
+
+ if (ppi->SrfPakThreshTable[Diff+255] )
+ Diff = (int)ApplyPakLowPass__sse2( ppi, &YuvPtr1[j] ) -
+ (int)ApplyPakLowPass__sse2( ppi, &YuvPtr2[j] );
+
+
+ /* Test against the SRF thresholds */
+ bits_map_ptr[j] = ppi->SrfThreshTable[Diff+255];
+ FragChangedPixels += ppi->SrfThreshTable[Diff+255];
+ }
+ return FragChangedPixels;
+ //PERF_BLOCK_END("RowDiffScan_DiffAndThresholdingMiddleFrag", perf_rds_datmf_time, perf_rds_datmf_time,perf_rds_datmf_time, 10000);
+
+#else
+ /* temp is a 48-byte scratch area used as three rows of eight 16-bit words (byte offsets 0, 16 and 32). NOTE(review): it is function-static, so this routine is not re-entrant and must not run on two threads at once. */
+ static __declspec(align(16)) unsigned long Some255s[4] = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
+ static __declspec(align(16)) unsigned char temp[48];
+ static unsigned short* temp_ptr = (unsigned short*)temp;
+
+ static unsigned char* some_255s_ptr = (unsigned char*)Some255s;
+ unsigned char* local_sgc_thresh_table = ppi->SgcThreshTable;
+ unsigned char* local_srf_thresh_table = ppi->SrfThreshTable;
+ unsigned char* local_srf_pak_thresh_table = ppi->SrfPakThreshTable;
+
+
+ unsigned char thresh_val;
+ int i, FragChangedPixels = 0;
+
+
+ __asm {
+ align 16
+ mov esi, YuvPtr1
+ mov edx, YuvPtr2
+ mov edi, YUVDiffsPtr /* Not aligned */
+ mov eax, some_255s_ptr;
+ mov ecx, temp_ptr
+
+ movdqa xmm7, [eax]
+ pxor xmm0, xmm0
+
+ /* Load yuvptr1[0..7] into low 8 bytes */
+ movq xmm1, QWORD PTR [esi]
+ /* Load yuvptr2[0..7] into low 8 bytes */
+ movq xmm2, QWORD PTR [edx]
+
+ /* Unpack to 16 bits */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ /* Subtract the YUV Ptr values */
+ psubw xmm1, xmm2 /* operands are zero-extended bytes, so results stay within -255..255; no saturation needed */
+
+ /* Write out to YUVDiffs */
+ movdqu [edi], xmm1
+
+ /* Add 255 to them all */
+ paddw xmm1, xmm7
+
+ /* Write them to the temp area */
+ movdqa [ecx], xmm1
+
+
+
+ }
+
+ ApplyPakLowPass_Vectorised__sse2(ppi, YuvPtr1, temp_ptr + 8); /* Bytes 16-31 */
+ ApplyPakLowPass_Vectorised__sse2(ppi, YuvPtr2, temp_ptr + 16); /* Bytes 32 - 47 */
+
+ __asm {
+ align 16
+
+ mov esi, temp_ptr
+ mov ecx, some_255s_ptr
+
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + 32]
+
+ movdqa xmm6, [ecx]
+
+ /* New diffs after PakLowPass */
+ psubw xmm1, xmm2
+
+ /* Add 255 to the diffs */
+ paddw xmm1, xmm6
+
+ /* Write back out to temp */
+ movdqa [esi +16], xmm1
+
+ /* Now need to process with normal registers ops */
+
+
+
+ /* At this point (byte offsets into temp, not temp_ptr indices)
+ temp bytes [0..15] = 8 lots of Early loop diffs + 255
+ temp bytes [16..31] = 8 lots of late loop diffs + 255
+ temp bytes [32..47] = who cares */
+
+ }
+
+
+ /* Apply the pak thresh_table and write into temp bytes [32..47], i.e. temp_ptr[16..23] */
+ temp_ptr[16] = local_srf_pak_thresh_table[temp_ptr[0]];
+ temp_ptr[17] = local_srf_pak_thresh_table[temp_ptr[1]];
+ temp_ptr[18] = local_srf_pak_thresh_table[temp_ptr[2]];
+ temp_ptr[19] = local_srf_pak_thresh_table[temp_ptr[3]];
+ temp_ptr[20] = local_srf_pak_thresh_table[temp_ptr[4]];
+ temp_ptr[21] = local_srf_pak_thresh_table[temp_ptr[5]];
+ temp_ptr[22] = local_srf_pak_thresh_table[temp_ptr[6]];
+ temp_ptr[23] = local_srf_pak_thresh_table[temp_ptr[7]];
+
+ __asm {
+ align 16
+
+ //mov edx, YUVDiffsPtr
+ mov esi, temp_ptr
+
+ /* Read back the old diffs+255 */
+ movdqu xmm4, [esi]
+
+ /* Read back the new diffs+255 */
+ movdqa xmm3, [esi + 16]
+
+ /* Read back the pak_threshed values used in the if statement */
+ movdqa xmm6, [esi + 32]
+
+ pxor xmm0, xmm0
+ pcmpeqw xmm7, xmm7 /* All 1's */
+
+ /* Compare the pak_thresh values to 0, any word which was 0, will now be set to all 1's in xmm0
+ the if basically said, if it's zero, leave it alone, otherwise, replace it
+ with the new diff */
+ pcmpeqw xmm0, xmm6
+
+ /* On the old diffs, keep all the words where the pak_thresh is zero */
+ pand xmm4, xmm0
+
+ /* Flip the bits so that the places that were 0 are now all zeros */
+ pxor xmm0, xmm7
+
+ /* This zeroes out all the words in the new diffs which were 0 in the pak_thresh */
+ pand xmm3, xmm0
+
+ /* Merge the old and new diffs */
+ por xmm3, xmm4
+
+ /* Write the selected diffs+255 back out to temp bytes [32..47] */
+ movdqa [esi + 32], xmm3
+ }
+
+ for (i = 0; i < 8; i++)
+ {
+ /* SRF test uses the selected (possibly low-passed) diff in temp_ptr[16+i]; the SGC test always uses the raw diff in temp_ptr[i]. */
+ thresh_val = local_srf_thresh_table[temp_ptr[16 + i]];
+ SgcPtr[0] += local_sgc_thresh_table[temp_ptr[i]];
+ bits_map_ptr[i] = thresh_val;
+ FragChangedPixels += thresh_val;
+
+ }
+
+ return FragChangedPixels;
+
+
+
+#endif
+}
+
+
+static void RowDiffScan__sse2( PP_INSTANCE *ppi,
+ unsigned char * YuvPtr1,
+ unsigned char * YuvPtr2,
+ ogg_int16_t * YUVDiffsPtr,
+ unsigned char * bits_map_ptr,
+ signed char * SgcPtr,
+ signed char * DispFragPtr,
+ unsigned char * FDiffPixels,
+ ogg_int32_t * RowDiffsPtr,
+ unsigned char * ChLocalsPtr, int EdgeRow ){
+ /* Scans one pixel row, HFRAGPIXELS pixels (one fragment) at a time. Fragments marked CANDIDATE_BLOCK get a diff-and-threshold pass; the changed-pixel count is added to *RowDiffsPtr and to the fragment's FDiffPixels entry, and a fragment with 7 or more changed pixels is marked BLOCK_CODED_LOW. */
+ ogg_int32_t i,j;
+ ogg_int32_t FragChangedPixels;
+
+ ogg_int16_t Diff; /* NOTE(review): Diff and j appear to be unused in this function. */
+ PERF_BLOCK_START();
+ /* Cannot use kernel if at edge or if PAK disabled */
+ if ( (!ppi->PAKEnabled) || EdgeRow ){
+ for ( i = 0; i < ppi->PlaneWidth; i += HFRAGPIXELS ){
+ /* Reset count of pixels changed for the current fragment. */
+ FragChangedPixels = 0;
+
+ /* Test for break out conditions to save time. */
+ if (*DispFragPtr == CANDIDATE_BLOCK){
+
+ /* Clear down entries in changed locals array */
+ SET8_0(ChLocalsPtr);
+
+ FragChangedPixels += RowDiffScan_DiffAndThresholding__sse2( ppi,
+ YuvPtr1,
+ YuvPtr2,
+ YUVDiffsPtr,
+ bits_map_ptr,
+ SgcPtr);
+
+ }else{
+ /* If we are breaking out here mark all pixels as changed. */
+ if ( *DispFragPtr > BLOCK_NOT_CODED ){
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
+ }else{
+ SET8_0(ChLocalsPtr);
+ }
+ }
+
+ *RowDiffsPtr += FragChangedPixels;
+ *FDiffPixels += (unsigned char)FragChangedPixels;
+
+ YuvPtr1 += HFRAGPIXELS;
+ YuvPtr2 += HFRAGPIXELS;
+ bits_map_ptr += HFRAGPIXELS;
+ ChLocalsPtr += HFRAGPIXELS;
+ YUVDiffsPtr += HFRAGPIXELS;
+ SgcPtr ++;
+ FDiffPixels ++;
+
+ /* If we have a lot of changed pixels for this fragment on this
+ row then the fragment is almost sure to be picked (e.g. through
+ the line search) so we can mark it as selected and then ignore
+ it. */
+ if (FragChangedPixels >= 7){
+ *DispFragPtr = BLOCK_CODED_LOW;
+ }
+ DispFragPtr++;
+ }
+ }else{
+
+ /*************************************************************/
+ /* First fragment of row !! */
+
+ i = 0;
+ /* Reset count of pixels changed for the current fragment. */
+ FragChangedPixels = 0;
+
+ /* Test for break out conditions to save time. */
+ if (*DispFragPtr == CANDIDATE_BLOCK){
+ /* Clear down entries in changed locals array */
+ SET8_0(ChLocalsPtr);
+
+ FragChangedPixels += RowDiffScan_DiffAndThresholdingFirstFrag__sse2(
+ ppi,
+ YuvPtr1,
+ YuvPtr2,
+ YUVDiffsPtr,
+ bits_map_ptr,
+ SgcPtr);
+
+ }else{
+ /* If we are breaking out here mark all pixels as changed. */
+ if ( *DispFragPtr > BLOCK_NOT_CODED ){
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
+ }else{
+ SET8_0(ChLocalsPtr);
+ }
+ }
+
+ *RowDiffsPtr += FragChangedPixels;
+ *FDiffPixels += (unsigned char)FragChangedPixels;
+
+ YuvPtr1 += HFRAGPIXELS;
+ YuvPtr2 += HFRAGPIXELS;
+ bits_map_ptr += HFRAGPIXELS;
+ ChLocalsPtr += HFRAGPIXELS;
+ YUVDiffsPtr += HFRAGPIXELS;
+ SgcPtr ++;
+ FDiffPixels ++;
+
+ /* If we have a lot of changed pixels for this fragment on this
+ row then the fragment is almost sure to be picked
+ (e.g. through the line search) so we can mark it as selected
+ and then ignore it. */
+ if (FragChangedPixels >= 7){
+ *DispFragPtr = BLOCK_CODED_LOW;
+ }
+ DispFragPtr++;
+ /*************************************************************/
+ /* Fragment in between!! */
+
+ for ( i = HFRAGPIXELS ; i < ppi->PlaneWidth-HFRAGPIXELS;
+ i += HFRAGPIXELS ){
+ /* Reset count of pixels changed for the current fragment. */
+ FragChangedPixels = 0;
+
+ /* Test for break out conditions to save time. */
+ if (*DispFragPtr == CANDIDATE_BLOCK){
+ /* Clear down entries in changed locals array */
+ SET8_0(ChLocalsPtr);
+
+ FragChangedPixels += RowDiffScan_DiffAndThresholdingMiddleFrag__sse2(
+ ppi,
+ YuvPtr1,
+ YuvPtr2,
+ YUVDiffsPtr,
+ bits_map_ptr,
+ SgcPtr);
+
+ }else{
+ /* If we are breaking out here mark all pixels as changed. */
+ if ( *DispFragPtr > BLOCK_NOT_CODED ){
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
+ }else{
+ SET8_0(ChLocalsPtr);
+ }
+ }
+
+ *RowDiffsPtr += FragChangedPixels;
+ *FDiffPixels += (unsigned char)FragChangedPixels;
+
+ YuvPtr1 += HFRAGPIXELS;
+ YuvPtr2 += HFRAGPIXELS;
+ bits_map_ptr += HFRAGPIXELS;
+ ChLocalsPtr += HFRAGPIXELS;
+ YUVDiffsPtr += HFRAGPIXELS;
+ SgcPtr ++;
+ FDiffPixels ++;
+
+ /* If we have a lot of changed pixels for this fragment on this
+ row then the fragment is almost sure to be picked
+ (e.g. through the line search) so we can mark it as selected
+ and then ignore it. */
+ if (FragChangedPixels >= 7){
+ *DispFragPtr = BLOCK_CODED_LOW;
+ }
+ DispFragPtr++;
+ }
+ /*************************************************************/
+ /* Last fragment of row !! */
+
+ /* Reset count of pixels changed for the current fragment. */
+ FragChangedPixels = 0;
+
+ /* Test for break out conditions to save time. */
+ if (*DispFragPtr == CANDIDATE_BLOCK){
+ /* Clear down entries in changed locals array */
+ SET8_0(ChLocalsPtr);
+
+ FragChangedPixels += RowDiffScan_DiffAndThresholdingLastFrag__sse2(
+ ppi,
+ YuvPtr1,
+ YuvPtr2,
+ YUVDiffsPtr,
+ bits_map_ptr,
+ SgcPtr);
+
+
+ }else{
+ /* If we are breaking out here mark all pixels as changed.*/
+ if ( *DispFragPtr > BLOCK_NOT_CODED ) {
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
+ }else{
+ SET8_0(ChLocalsPtr);
+ }
+ }
+ /* Add this fragment's changed-pixel count to the row total and to
+ the fragment's own total. The source, map and diff pointers are
+ not advanced here because this is the last fragment handled on
+ the row. */
+ *RowDiffsPtr += FragChangedPixels;
+ *FDiffPixels += (unsigned char)FragChangedPixels;
+
+ /* If we have a lot of changed pixels for this fragment on this
+ row then the fragment is almost sure to be picked (e.g. through
+ the line search) so we can mark it as selected and then ignore
+ it. */
+ if (FragChangedPixels >= 7){
+ *DispFragPtr = BLOCK_CODED_LOW;
+ }
+ DispFragPtr++;
+
+ }
+
+ PERF_BLOCK_END("RowDiffScan ", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
+}
+
+
+void dsp_sse2_scan_init(DspFunctions *funcs)
+{
+ TH_DEBUG("enabling accelerated x86_32 sse2 scan functions.\n");
+ funcs->RowDiffScan = RowDiffScan__sse2;
+ // Points the RowDiffScan entry of the DspFunctions table at the SSE2
+ // implementation defined in this file. It is the only entry set here.
+ // The caller (dsp_scan_init) only invokes this when cpu_flags has
+ // CPU_X86_SSE2 set; no CPU check is done in this routine itself.
+}
\ No newline at end of file
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-09 12:15:37 UTC (rev 11550)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-09 14:25:50 UTC (rev 11551)
@@ -136,6 +136,7 @@
RuntimeLibrary="0"
BufferSecurityCheck="false"
UsePrecompiledHeader="0"
+ AssemblerOutput="4"
WarningLevel="4"
Detect64BitPortabilityProblems="true"
DebugInformationFormat="3"
@@ -505,6 +506,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\scan_sse2.c"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\toplevel.c"
>
</File>
More information about the commits
mailing list