Index: examples/encoder_example.c =================================================================== --- examples/encoder_example.c (revision 14720) +++ examples/encoder_example.c (working copy) @@ -33,10 +33,14 @@ #endif #include +#if !defined(_WIN32) +#include #include +#else +#include "getopt.h" +#endif #include #include -#include #include #include #include "theora/theoraenc.h" Index: lib/dec/x86_vc/mmxfrag.c =================================================================== --- lib/dec/x86_vc/mmxfrag.c (revision 14720) +++ lib/dec/x86_vc/mmxfrag.c (working copy) @@ -211,220 +211,4 @@ _asm { emms } } -#endif - -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: - - ********************************************************************/ -#include "../../internal.h" - -/* ------------------------------------------------------------------------ - MMX reconstruction fragment routines for Visual Studio. - Tested with VS2005. Should compile for VS2003 and VC6 as well. - - Initial implementation 2007 by Nils Pipenbrinck. - ---------------------------------------------------------------------*/ - -#if defined(USE_ASM) - -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride, - const ogg_int16_t *_residue){ - /* --------------------------------------------------------------------- - This function does the inter reconstruction step with 8 iterations - unrolled. The iteration for each instruction is noted by the #id in the - comments (in case you want to reconstruct it) - --------------------------------------------------------------------- */ - _asm{ - mov edi, [_residue] /* load residue ptr */ - mov eax, 0x00800080 /* generate constant */ - mov ebx, [_dst_ystride] /* load dst-stride */ - mov edx, [_dst] /* load dest pointer */ - - /* unrolled loop begins here */ - - movd mm0, eax /* load constant */ - movq mm1, [edi+ 8*0] /* #1 load low residue */ - movq mm2, [edi+ 8*1] /* #1 load high residue */ - punpckldq mm0, mm0 /* build constant */ - movq mm3, [edi+ 8*2] /* #2 load low residue */ - movq mm4, [edi+ 8*3] /* #2 load high residue */ - movq mm5, [edi+ 8*4] /* #3 load low residue */ - movq mm6, [edi+ 8*5] /* #3 load high residue */ - paddsw mm1, mm0 /* #1 bias low residue */ - paddsw mm2, mm0 /* #1 bias high residue */ - packuswb mm1, mm2 /* #1 pack to byte */ - paddsw mm3, mm0 /* #2 bias low residue */ - paddsw mm4, mm0 /* #2 bias high residue */ - packuswb mm3, mm4 /* #2 pack to byte */ - paddsw mm5, mm0 /* #3 bias low residue */ - paddsw mm6, mm0 /* #3 bias high residue */ - packuswb mm5, mm6 /* #3 pack to byte */ - movq [edx], mm1 /* #1 write row */ - movq [edx + ebx], mm3 /* #2 write row */ - movq [edx + ebx*2], mm5 /* #3 write row */ - movq mm1, [edi+ 8*6] /* #4 load low residue */ - lea ecx, [ebx + ebx*2] /* make dst_ystride * 3 */ - movq mm2, [edi+ 8*7] /* #4 load high residue */ - movq mm3, [edi+ 8*8] /* #5 load low residue */ - lea esi, [ebx*4 + ebx] /* make dst_ystride * 5 */ - movq mm4, [edi+ 8*9] /* #5 load high residue */ - movq mm5, [edi+ 8*10] /* #6 load low residue */ - lea eax, [ecx*2 + ebx] /* make dst_ystride * 7 */ - movq mm6, [edi+ 8*11] /* #6 load high residue */ - paddsw mm1, mm0 /* #4 bias low residue */ - paddsw mm2, mm0 /* #4 bias high residue */ - packuswb mm1, mm2 /* #4 pack to byte */ - paddsw mm3, mm0 /* #5 bias low residue */ - paddsw mm4, mm0 /* #5 bias high residue */ - packuswb mm3, mm4 /* #5 pack to byte */ - paddsw mm5, mm0 /* #6 bias low residue */ - paddsw mm6, mm0 /* #6 bias high residue */ - packuswb mm5, mm6 /* #6 pack to byte */ - movq [edx + ecx], mm1 /* #4 write row */ - movq [edx + ebx*4], mm3 /* #5 write row */ - movq [edx + esi], mm5 /* #6 write row */ - movq mm1, [edi+ 8*12] /* #7 load low residue */ - movq mm2, [edi+ 8*13] /* #7 load high residue */ - movq mm3, [edi+ 8*14] /* #8 load low residue */ - movq mm4, [edi+ 8*15] /* #8 load high residue */ - paddsw mm1, mm0 /* #7 bias low residue */ - paddsw mm2, mm0 /* #7 bias high residue */ - packuswb mm1, mm2 /* #7 pack to byte */ - paddsw mm3, mm0 /* #8 bias low residue */ - paddsw mm4, mm0 /* #8 bias high residue */ - packuswb mm3, mm4 /* #8 pack to byte */ - movq [edx + ecx*2], mm1 /* #7 write row */ - movq [edx + eax], mm3 /* #8 write row */ - } -} - - - -void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride, - const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){ - /* --------------------------------------------------------------------- - This function does the inter reconstruction step with two iterations - running in parallel to hide some load-latencies and break the dependency - chains. The iteration for each instruction is noted by the #id in the - comments (in case you want to reconstruct it) - --------------------------------------------------------------------- */ - _asm{ - pxor mm0, mm0 /* generate constant 0 */ - mov esi, [_src] - mov edi, [_residue] - mov eax, [_src_ystride] - mov edx, [_dst] - mov ebx, [_dst_ystride] - mov ecx, 4 - - align 16 - -nextchunk: - movq mm3, [esi] /* #1 load source */ - movq mm1, [edi+0] /* #1 load residium low */ - movq mm2, [edi+8] /* #1 load residium high */ - movq mm7, [esi+eax] /* #2 load source */ - movq mm4, mm3 /* #1 get copy of src */ - movq mm5, [edi+16] /* #2 load residium low */ - punpckhbw mm4, mm0 /* #1 expand high source */ - movq mm6, [edi+24] /* #2 load residium high */ - punpcklbw mm3, mm0 /* #1 expand low source */ - paddsw mm4, mm2 /* #1 add residium high */ - movq mm2, mm7 /* #2 get copy of src */ - paddsw mm3, mm1 /* #1 add residium low */ - punpckhbw mm2, mm0 /* #2 expand high source */ - packuswb mm3, mm4 /* #1 final row pixels */ - punpcklbw mm7, mm0 /* #2 expand low source */ - movq [edx], mm3 /* #1 write row */ - paddsw mm2, mm6 /* #2 add residium high */ - add edi, 32 /* residue += 4 */ - paddsw mm7, mm5 /* #2 add residium low */ - sub ecx, 1 /* update loop counter */ - packuswb mm7, mm2 /* #2 final row */ - lea esi, [esi+eax*2] /* src += stride * 2 */ - movq [edx + ebx], mm7 /* #2 write row */ - lea edx, [edx+ebx*2] /* dst += stride * 2 */ - jne nextchunk - } -} - - -void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride, - const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2, - int _src2_ystride,const ogg_int16_t *_residue){ - /* --------------------------------------------------------------------- - This function does the inter2 reconstruction step.The building of the - average is done with a bit-twiddeling trick to avoid excessive register - copy work during byte to word conversion. - - average = (a & b) + (((a ^ b) & 0xfe) >> 1); - - (shown for a single byte; it's done with 8 of them at a time) - - Slightly faster than the obvious method using add and shift, but not - earthshaking improvement either. - - If anyone comes up with a way that produces bit-identical outputs - using the pavgb instruction let me know and I'll do the 3dnow codepath. - --------------------------------------------------------------------- */ - _asm{ - mov eax, 0xfefefefe - mov esi, [_src1] - mov edi, [_src2] - movd mm1, eax - mov ebx, [_residue] - mov edx, [_dst] - mov eax, [_dst_ystride] - punpckldq mm1, mm1 /* replicate lsb32 */ - mov ecx, 8 /* init loop counter */ - pxor mm0, mm0 /* constant zero */ - sub edx, eax /* dst -= dst_stride */ - - align 16 - -nextrow: - movq mm2, [esi] /* load source1 */ - movq mm3, [edi] /* load source2 */ - movq mm5, [ebx + 0] /* load lower residue */ - movq mm6, [ebx + 8] /* load higer residue */ - add esi, _src1_ystride /* src1 += src1_stride */ - add edi, _src2_ystride /* src2 += src1_stride */ - movq mm4, mm2 /* get copy of source1 */ - pand mm2, mm3 /* s1 & s2 (avg part) */ - pxor mm3, mm4 /* s1 ^ s2 (avg part) */ - add ebx, 16 /* residue++ */ - pand mm3, mm1 /* mask out low bits */ - psrlq mm3, 1 /* shift xor avg-part */ - paddd mm3, mm2 /* build final average */ - add edx, eax /* dst += dst_stride */ - movq mm2, mm3 /* get copy of average */ - punpckhbw mm3, mm0 /* average high */ - punpcklbw mm2, mm0 /* average low */ - paddsw mm3, mm6 /* high + residue */ - paddsw mm2, mm5 /* low + residue */ - sub ecx, 1 /* update loop counter */ - packuswb mm2, mm3 /* pack and saturate */ - movq [edx], mm2 /* write row */ - jne nextrow - } -} - -void oc_restore_fpu_mmx(void){ - _asm { emms } -} - -#endif - +#endif \ No newline at end of file Index: lib/dec/x86_vc/mmxloopfilter.c =================================================================== --- lib/dec/x86_vc/mmxloopfilter.c (revision 14720) +++ lib/dec/x86_vc/mmxloopfilter.c (working copy) @@ -352,17 +352,17 @@ while(fragcoded){ if(frag>frag0){ - loop_filter_h(frag->buffer[_refi],iplane->ystride,ll); + loop_filter_h(frag->buffer[_refi],iplane->stride,ll); } if(frag0>frag_top){ - loop_filter_v(frag->buffer[_refi],iplane->ystride,ll); + loop_filter_v(frag->buffer[_refi],iplane->stride,ll); } if(frag+1coded){ - loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll); + loop_filter_h(frag->buffer[_refi]+8,iplane->stride,ll); } if(frag+fplane->nhfragsnhfrags)->coded){ loop_filter_v((frag+fplane->nhfrags)->buffer[_refi], - iplane->ystride,ll); + iplane->stride,ll); } } frag++; @@ -374,383 +374,4 @@ _mm_empty(); } -#endif - -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: - - ********************************************************************/ - -/* ------------------------------------------------------------------- - MMX based loop filter for the theora codec. - - Originally written by Rudolf Marek, based on code from On2's VP3. - Converted to Visual Studio inline assembly by Nils Pipenbrinck. - - Note: I can't test these since my example files never get into the - loop filters, but the code has been converted semi-automatic from - the GCC sources, so it ought to work. - ---------------------------------------------------------------------*/ -#include "../../internal.h" -#include "x86int.h" -#include - -#if defined(USE_ASM) - - - -static void loop_filter_v(unsigned char *_pix,int _ystride, - const ogg_int16_t *_ll){ - _asm { - mov eax, [_pix] - mov edx, [_ystride] - mov ebx, [_ll] - - /* _pix -= ystride */ - sub eax, edx - /* mm0=0 */ - pxor mm0, mm0 - /* _pix -= ystride */ - sub eax, edx - /* esi=_ystride*3 */ - lea esi, [edx + edx*2] - - /* mm7=_pix[0...8]*/ - movq mm7, [eax] - /* mm4=_pix[0...8+_ystride*3]*/ - movq mm4, [eax + esi] - /* mm6=_pix[0...8]*/ - movq mm6, mm7 - /* Expand unsigned _pix[0...3] to 16 bits.*/ - punpcklbw mm6, mm0 - movq mm5, mm4 - /* Expand unsigned _pix[4...7] to 16 bits.*/ - punpckhbw mm7, mm0 - punpcklbw mm4, mm0 - /* Expand other arrays too.*/ - punpckhbw mm5, mm0 - /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/ - psubw mm6, mm4 - psubw mm7, mm5 - /*mm5=mm4=_pix[0...7+_ystride]*/ - movq mm4, [eax + edx] - /*mm1=mm3=mm2=_pix[0..7]+_ystride*2]*/ - movq mm2, [eax + edx*2] - movq mm5, mm4 - movq mm3, mm2 - movq mm1, mm2 - /*Expand these arrays.*/ - punpckhbw mm5, mm0 - punpcklbw mm4, mm0 - punpckhbw mm3, mm0 - punpcklbw mm2, mm0 - pcmpeqw mm0, mm0 - /*mm0=3 3 3 3 - mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/ - psubw mm3, mm5 - psrlw mm0, 14 - psubw mm2, mm4 - /*Scale by 3.*/ - pmullw mm3, mm0 - pmullw mm2, mm0 - /*mm0=4 4 4 4 - f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+ - 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/ - psrlw mm0, 1 - paddw mm3, mm7 - psllw mm0, 2 - paddw mm2, mm6 - /*Add 4.*/ - paddw mm3, mm0 - paddw mm2, mm0 - /*"Divide" by 8.*/ - psraw mm3, 3 - psraw mm2, 3 - /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/ - /*Free up mm5.*/ - packuswb mm4, mm5 - /*mm0=L L L L*/ - movq mm0, [ebx] - /*if(R_i<-2L||R_i>2L)R_i=0:*/ - movq mm5, mm2 - pxor mm6, mm6 - movq mm7, mm0 - psubw mm6, mm0 - psllw mm7, 1 - psllw mm6, 1 - /*mm2==R_3 R_2 R_1 R_0*/ - /*mm5==R_3 R_2 R_1 R_0*/ - /*mm6==-2L -2L -2L -2L*/ - /*mm7==2L 2L 2L 2L*/ - pcmpgtw mm7, mm2 - pcmpgtw mm5, mm6 - pand mm2, mm7 - movq mm7, mm0 - pand mm2, mm5 - psllw mm7, 1 - movq mm5, mm3 - /*mm3==R_7 R_6 R_5 R_4*/ - /*mm5==R_7 R_6 R_5 R_4*/ - /*mm6==-2L -2L -2L -2L*/ - /*mm7==2L 2L 2L 2L*/ - pcmpgtw mm7, mm3 - pcmpgtw mm5, mm6 - pand mm3, mm7 - movq mm7, mm0 - pand mm3, mm5 - /*if(R_i<-L)R_i'=R_i+2L; - if(R_i>L)R_i'=R_i-2L; - if(R_i<-L||R_i>L)R_i=-R_i':*/ - psraw mm6, 1 - movq mm5, mm2 - psllw mm7, 1 - /*mm2==R_3 R_2 R_1 R_0*/ - /*mm5==R_3 R_2 R_1 R_0*/ - /*mm6==-L -L -L -L*/ - /*mm0==L L L L*/ - /*mm5=R_i>L?FF:00*/ - pcmpgtw mm5, mm0 - /*mm6=-L>R_i?FF:00*/ - pcmpgtw mm6, mm2 - /*mm7=R_i>L?2L:0*/ - pand mm7, mm5 - /*mm2=R_i>L?R_i-2L:R_i*/ - psubw mm2, mm7 - movq mm7, mm0 - /*mm5=-L>R_i||R_i>L*/ - por mm5, mm6 - psllw mm7, 1 - /*mm7=-L>R_i?2L:0*/ - pand mm7, mm6 - pxor mm6, mm6 - /*mm2=-L>R_i?R_i+2L:R_i*/ - paddw mm2, mm7 - psubw mm6, mm0 - /*mm5=-L>R_i||R_i>L?-R_i':0*/ - pand mm5, mm2 - movq mm7, mm0 - /*mm2=-L>R_i||R_i>L?0:R_i*/ - psubw mm2, mm5 - psllw mm7, 1 - /*mm2=-L>R_i||R_i>L?-R_i':R_i*/ - psubw mm2, mm5 - movq mm5, mm3 - /*mm3==R_7 R_6 R_5 R_4*/ - /*mm5==R_7 R_6 R_5 R_4*/ - /*mm6==-L -L -L -L*/ - /*mm0==L L L L*/ - /*mm6=-L>R_i?FF:00*/ - pcmpgtw mm6, mm3 - /*mm5=R_i>L?FF:00*/ - pcmpgtw mm5, mm0 - /*mm7=R_i>L?2L:0*/ - pand mm7, mm5 - /*mm2=R_i>L?R_i-2L:R_i*/ - psubw mm3, mm7 - psllw mm0, 1 - /*mm5=-L>R_i||R_i>L*/ - por mm5, mm6 - /*mm0=-L>R_i?2L:0*/ - pand mm0, mm6 - /*mm3=-L>R_i?R_i+2L:R_i*/ - paddw mm3, mm0 - /*mm5=-L>R_i||R_i>L?-R_i':0*/ - pand mm5, mm3 - /*mm2=-L>R_i||R_i>L?0:R_i*/ - psubw mm3, mm5 - /*mm3=-L>R_i||R_i>L?-R_i':R_i*/ - psubw mm3, mm5 - /*Unfortunately, there's no unsigned byte+signed byte with unsigned - saturation op code, so we have to promote things back 16 bits.*/ - pxor mm0, mm0 - movq mm5, mm4 - punpcklbw mm4, mm0 - punpckhbw mm5, mm0 - movq mm6, mm1 - punpcklbw mm1, mm0 - punpckhbw mm6, mm0 - /*_pix[0...8+_ystride]+=R_i*/ - paddw mm4, mm2 - paddw mm5, mm3 - /*_pix[0...8+_ystride*2]-=R_i*/ - psubw mm1, mm2 - psubw mm6, mm3 - packuswb mm4, mm5 - packuswb mm1, mm6 - /*Write it back out.*/ - movq [eax + edx], mm4 - movq [eax + edx*2], mm1 - } -} - -/*This code implements the bulk of loop_filter_h(). - Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all - four p0's to one register we must transpose the values in four mmx regs. - When half is done we repeat this for the rest.*/ -static void loop_filter_h4(unsigned char *_pix,long _ystride, - const ogg_int16_t *_ll){ - /* todo: merge the comments from the GCC sources */ - _asm { - mov ecx, [_pix] - mov edx, [_ystride] - mov eax, [_ll] - /*esi=_ystride*3*/ - lea esi, [edx + edx*2] - - movd mm0, dword ptr [ecx] - movd mm1, dword ptr [ecx + edx] - movd mm2, dword ptr [ecx + edx*2] - movd mm3, dword ptr [ecx + esi] - punpcklbw mm0, mm1 - punpcklbw mm2, mm3 - movq mm1, mm0 - punpckhwd mm0, mm2 - punpcklwd mm1, mm2 - pxor mm7, mm7 - movq mm5, mm1 - punpcklbw mm1, mm7 - punpckhbw mm5, mm7 - movq mm3, mm0 - punpcklbw mm0, mm7 - punpckhbw mm3, mm7 - psubw mm1, mm3 - movq mm4, mm0 - pcmpeqw mm2, mm2 - psubw mm0, mm5 - psrlw mm2, 14 - pmullw mm0, mm2 - psrlw mm2, 1 - paddw mm0, mm1 - psllw mm2, 2 - paddw mm0, mm2 - psraw mm0, 3 - movq mm6, qword ptr [eax] - movq mm1, mm0 - pxor mm2, mm2 - movq mm3, mm6 - psubw mm2, mm6 - psllw mm3, 1 - psllw mm2, 1 - pcmpgtw mm3, mm0 - pcmpgtw mm1, mm2 - pand mm0, mm3 - pand mm0, mm1 - psraw mm2, 1 - movq mm1, mm0 - movq mm3, mm6 - pcmpgtw mm2, mm0 - pcmpgtw mm1, mm6 - psllw mm3, 1 - psllw mm6, 1 - pand mm3, mm1 - pand mm6, mm2 - psubw mm0, mm3 - por mm1, mm2 - paddw mm0, mm6 - pand mm1, mm0 - psubw mm0, mm1 - psubw mm0, mm1 - paddw mm5, mm0 - psubw mm4, mm0 - packuswb mm5, mm7 - packuswb mm4, mm7 - punpcklbw mm5, mm4 - movd edi, mm5 - mov word ptr [ecx + 01H], di - psrlq mm5, 32 - shr edi, 16 - mov word ptr [ecx + edx + 01H], di - movd edi, mm5 - mov word ptr [ecx + edx*2 + 01H], di - shr edi, 16 - mov word ptr [ecx + esi + 01H], di - } -} - -static void loop_filter_h(unsigned char *_pix,int _ystride, - const ogg_int16_t *_ll){ - _pix-=2; - loop_filter_h4(_pix,_ystride,_ll); - loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll); -} - - -/*We copy the whole function because the MMX routines will be inlined 4 times, - and we can do just a single emms call at the end this way. - We also do not use the _bv lookup table, instead computing the values that - would lie in it on the fly.*/ - -/*Apply the loop filter to a given set of fragment rows in the given plane. - The filter may be run on the bottom edge, affecting pixels in the next row of - fragments, so this row also needs to be available. - _bv: The bounding values array. - _refi: The index of the frame buffer to filter. - _pli: The color plane to filter. - _fragy0: The Y coordinate of the first fragment row to filter. - _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv, - int _refi,int _pli,int _fragy0,int _fragy_end){ - ogg_int16_t __declspec(align(8)) ll[4]; - th_img_plane *iplane; - oc_fragment_plane *fplane; - oc_fragment *frag_top; - oc_fragment *frag0; - oc_fragment *frag; - oc_fragment *frag_end; - oc_fragment *frag0_end; - oc_fragment *frag_bot; - ll[0]=ll[1]=ll[2]=ll[3]= - (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]]; - iplane=_state->ref_frame_bufs[_refi]+_pli; - fplane=_state->fplanes+_pli; - /*The following loops are constructed somewhat non-intuitively on purpose. - The main idea is: if a block boundary has at least one coded fragment on - it, the filter is applied to it. - However, the order that the filters are applied in matters, and VP3 chose - the somewhat strange ordering used below.*/ - frag_top=_state->frags+fplane->froffset; - frag0=frag_top+_fragy0*fplane->nhfrags; - frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags; - frag_bot=_state->frags+fplane->froffset+fplane->nfrags; - while(frag0nhfrags; - while(fragcoded){ - if(frag>frag0){ - loop_filter_h(frag->buffer[_refi],iplane->ystride,ll); - } - if(frag0>frag_top){ - loop_filter_v(frag->buffer[_refi],iplane->ystride,ll); - } - if(frag+1coded){ - loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll); - } - if(frag+fplane->nhfragsnhfrags)->coded){ - loop_filter_v((frag+fplane->nhfrags)->buffer[_refi], - iplane->ystride,ll); - } - } - frag++; - } - frag0+=fplane->nhfrags; - } - - /*This needs to be removed when decode specific functions are implemented:*/ - _mm_empty(); -} - -#endif - +#endif \ No newline at end of file Index: lib/dec/x86_vc/mmxstate.c =================================================================== --- lib/dec/x86_vc/mmxstate.c (revision 14720) +++ lib/dec/x86_vc/mmxstate.c (working copy) @@ -138,7 +138,7 @@ } /*Fill in the target buffer.*/ dst_framei=_state->ref_frame_idx[OC_FRAME_SELF]; - dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride; + dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride; /*For now ystride values in all ref frames assumed to be equal.*/ if(_frag->mbmode==OC_MODE_INTRA){ oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf); @@ -149,7 +149,7 @@ int mvoffset0; int mvoffset1; ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]]; - ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride; + ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].stride; if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0], _frag->mv[1],ref_ystride,_pli)>1){ oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride, @@ -176,8 +176,8 @@ int src_ystride; dst_framei=_state->ref_frame_idx[_dst_frame]; src_framei=_state->ref_frame_idx[_src_frame]; - dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride; - src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride; + dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride; + src_ystride=_state->ref_frame_bufs[src_framei][_pli].stride; fragi_end=_fragis+_nfragis; for(fragi=_fragis;fragifrags+*fragi; @@ -187,196 +187,4 @@ _m_empty(); } -#endif - -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: - - ********************************************************************/ - -/* ------------------------------------------------------------------------ - MMX acceleration of complete fragment reconstruction algorithm. - Originally written by Rudolf Marek. - - Conversion to MSC intrinsics by Nils Pipenbrinck. - ---------------------------------------------------------------------*/ -#if defined(USE_ASM) - -#include "../../internal.h" -#include "../idct.h" -#include "x86int.h" -#include - -static const unsigned char OC_FZIG_ZAGMMX[64]= -{ - 0, 8, 1, 2, 9,16,24,17, - 10, 3,32,11,18,25, 4,12, - 5,26,19,40,33,34,41,48, - 27, 6,13,20,28,21,14, 7, - 56,49,42,35,43,50,57,36, - 15,22,29,30,23,44,37,58, - 51,59,38,45,52,31,60,53, - 46,39,47,54,61,62,55,63 -}; - -/* Fill a block with value */ -static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){ - __m64 t = _value; - _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t; - _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t; - _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t; - _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t; -} - -/* copy a block of 8 byte elements using different strides */ -static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride, - unsigned char * _src, int _src_ystride){ - __m64 a,b,c,d,e,f,g,h; - a = *(__m64*)(_src + 0 * _src_ystride); - b = *(__m64*)(_src + 1 * _src_ystride); - c = *(__m64*)(_src + 2 * _src_ystride); - d = *(__m64*)(_src + 3 * _src_ystride); - e = *(__m64*)(_src + 4 * _src_ystride); - f = *(__m64*)(_src + 5 * _src_ystride); - g = *(__m64*)(_src + 6 * _src_ystride); - h = *(__m64*)(_src + 7 * _src_ystride); - *(__m64*)(_dst + 0 * _dst_ystride) = a; - *(__m64*)(_dst + 1 * _dst_ystride) = b; - *(__m64*)(_dst + 2 * _dst_ystride) = c; - *(__m64*)(_dst + 3 * _dst_ystride) = d; - *(__m64*)(_dst + 4 * _dst_ystride) = e; - *(__m64*)(_dst + 5 * _dst_ystride) = f; - *(__m64*)(_dst + 6 * _dst_ystride) = g; - *(__m64*)(_dst + 7 * _dst_ystride) = h; -} - -void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, - ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){ - ogg_int16_t __declspec(align(16)) res_buf[64]; - int dst_framei; - int dst_ystride; - int zzi; - /*_last_zzi is subtly different from an actual count of the number of - coefficients we decoded for this block. - It contains the value of zzi BEFORE the final token in the block was - decoded. - In most cases this is an EOB token (the continuation of an EOB run from a - previous block counts), and so this is the same as the coefficient count. - However, in the case that the last token was NOT an EOB token, but filled - the block up with exactly 64 coefficients, _last_zzi will be less than 64. - Provided the last token was not a pure zero run, the minimum value it can - be is 46, and so that doesn't affect any of the cases in this routine. - However, if the last token WAS a pure zero run of length 63, then _last_zzi - will be 1 while the number of coefficients decoded is 64. - Thus, we will trigger the following special case, where the real - coefficient count would not. - Note also that a zero run of length 64 will give _last_zzi a value of 0, - but we still process the DC coefficient, which might have a non-zero value - due to DC prediction. - Although convoluted, this is arguably the correct behavior: it allows us to - dequantize fewer coefficients and use a smaller transform when the block - ends with a long zero run instead of a normal EOB token. - It could be smarter... multiple separate zero runs at the end of a block - will fool it, but an encoder that generates these really deserves what it - gets. - Needless to say we inherited this approach from VP3.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - __m64 p; - /*Why is the iquant product rounded in this case and no others? Who knows.*/ - p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5); - /* broadcast 16 bits into all 4 mmx subregisters */ - p = _m_punpcklwd (p,p); - p = _m_punpckldq (p,p); - loc_fill_mmx_value ((__m64 *)res_buf, p); - } - else{ - /*Then, fill in the remainder of the coefficients with 0's, and perform - the iDCT.*/ - /*First zero the buffer.*/ - /*On K7, etc., this could be replaced with movntq and sfence.*/ - loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64()); - - res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant); - /*This is planned to be rewritten in MMX.*/ - for(zzi=1;zzi<_ncoefs;zzi++) - { - int ci; - ci=OC_FZIG_ZAG[zzi]; - res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]* - _ac_iquant[ci]); - } - - if(_last_zzi<10){ - oc_idct8x8_10_mmx(res_buf); - } - else { - oc_idct8x8_mmx(res_buf); - } - } - /*Fill in the target buffer.*/ - dst_framei=_state->ref_frame_idx[OC_FRAME_SELF]; - dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride; - /*For now ystride values in all ref frames assumed to be equal.*/ - if(_frag->mbmode==OC_MODE_INTRA){ - oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf); - } - else{ - int ref_framei; - int ref_ystride; - int mvoffset0; - int mvoffset1; - ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]]; - ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride; - if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0], - _frag->mv[1],ref_ystride,_pli)>1){ - oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride, - _frag->buffer[ref_framei]+mvoffset0,ref_ystride, - _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf); - } - else{ - oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride, - _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf); - } - } - - _mm_empty(); -} - - -void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis, - int _nfragis,int _dst_frame,int _src_frame,int _pli){ - const int *fragi; - const int *fragi_end; - int dst_framei; - int dst_ystride; - int src_framei; - int src_ystride; - dst_framei=_state->ref_frame_idx[_dst_frame]; - src_framei=_state->ref_frame_idx[_src_frame]; - dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride; - src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride; - fragi_end=_fragis+_nfragis; - for(fragi=_fragis;fragifrags+*fragi; - loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride, - frag->buffer[src_framei], src_ystride); - } - _m_empty(); -} - -#endif - +#endif \ No newline at end of file Index: lib/dec/x86_vc/mmxidct.c =================================================================== --- lib/dec/x86_vc/mmxidct.c (revision 14720) +++ lib/dec/x86_vc/mmxidct.c (working copy) @@ -1003,1012 +1003,4 @@ } } -#endif - -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: - - ********************************************************************/ - -/* ------------------------------------------------------------------- - MMX based IDCT for the theora codec. - - Originally written by Rudolf Marek, based on code from On2's VP3. - Converted to Visual Studio inline assembly by Nils Pipenbrinck. - - ---------------------------------------------------------------------*/ -#if defined(USE_ASM) - -#include -#include "../dct.h" -#include "../idct.h" -#include "x86int.h" - -/*A table of constants used by the MMX routines.*/ -static const __declspec(align(16)) ogg_uint16_t - OC_IDCT_CONSTS[(7+1)*4]={ - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - 8, 8, 8, 8 -}; - - -void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){ - _asm { - mov edx, [_y] - mov eax, offset OC_IDCT_CONSTS - movq mm2, [edx + 30H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 18H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 10H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 38H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 20H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 28H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 10H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 20H], mm6 - movq mm2, mm0 - movq mm6, [edx] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 08H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 10H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - movq mm3, [edx + 20H] - psubw mm4, mm7 - paddw mm1, mm1 - paddw mm7, mm7 - paddw mm1, mm2 - paddw mm7, mm4 - psubw mm4, mm3 - paddw mm3, mm3 - psubw mm6, mm5 - paddw mm5, mm5 - paddw mm3, mm4 - paddw mm5, mm6 - psubw mm7, mm0 - paddw mm0, mm0 - movq [edx + 10H], mm1 - paddw mm0, mm7 - movq mm1, mm4 - punpcklwd mm4, mm5 - movq [edx], mm0 - punpckhwd mm1, mm5 - movq mm0, mm6 - punpcklwd mm6, mm7 - movq mm5, mm4 - punpckldq mm4, mm6 - punpckhdq mm5, mm6 - movq mm6, mm1 - movq [edx + 08H], mm4 - punpckhwd mm0, mm7 - movq [edx + 18H], mm5 - punpckhdq mm6, mm0 - movq mm4, [edx] - punpckldq mm1, mm0 - movq mm5, [edx + 10H] - movq mm0, mm4 - movq [edx + 38H], mm6 - punpcklwd mm0, mm5 - movq [edx + 28H], mm1 - punpckhwd mm4, mm5 - movq mm5, mm2 - punpcklwd mm2, mm3 - movq mm1, mm0 - punpckldq mm0, mm2 - punpckhdq mm1, mm2 - movq mm2, mm4 - movq [edx], mm0 - punpckhwd mm5, mm3 - movq [edx + 10H], mm1 - punpckhdq mm4, mm5 - punpckldq mm2, mm5 - movq [edx + 30H], mm4 - movq [edx + 20H], mm2 - movq mm2, [edx + 70H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 58H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 50H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 78H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 60H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 68H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 50H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 60H], mm6 - movq mm2, mm0 - movq mm6, [edx + 40H] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 48H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 50H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - movq mm3, [edx + 60H] - psubw mm4, mm7 - paddw mm1, mm1 - paddw mm7, mm7 - paddw mm1, mm2 - paddw mm7, mm4 - psubw mm4, mm3 - paddw mm3, mm3 - psubw mm6, mm5 - paddw mm5, mm5 - paddw mm3, mm4 - paddw mm5, mm6 - psubw mm7, mm0 - paddw mm0, mm0 - movq [edx + 50H], mm1 - paddw mm0, mm7 - movq mm1, mm4 - punpcklwd mm4, mm5 - movq [edx + 40H], mm0 - punpckhwd mm1, mm5 - movq mm0, mm6 - punpcklwd mm6, mm7 - movq mm5, mm4 - punpckldq mm4, mm6 - punpckhdq mm5, mm6 - movq mm6, mm1 - movq [edx + 48H], mm4 - punpckhwd mm0, mm7 - movq [edx + 58H], mm5 - punpckhdq mm6, mm0 - movq mm4, [edx + 40H] - punpckldq mm1, mm0 - movq mm5, [edx + 50H] - movq mm0, mm4 - movq [edx + 78H], mm6 - punpcklwd mm0, mm5 - movq [edx + 68H], mm1 - punpckhwd mm4, mm5 - movq mm5, mm2 - punpcklwd mm2, mm3 - movq mm1, mm0 - punpckldq mm0, mm2 - punpckhdq mm1, mm2 - movq mm2, mm4 - movq [edx + 40H], mm0 - punpckhwd mm5, mm3 - movq [edx + 50H], mm1 - punpckhdq mm4, mm5 - punpckldq mm2, mm5 - movq [edx + 70H], mm4 - movq [edx + 60H], mm2 - movq mm2, [edx + 30H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 50H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 10H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 70H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 20H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 60H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 10H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 20H], mm6 - movq mm2, mm0 - movq mm6, [edx] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 40H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 10H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - paddw mm2, [eax + 38H] - paddw mm1, mm1 - paddw mm1, mm2 - psraw mm2, 4 - psubw mm4, mm7 - psraw mm1, 4 - movq mm3, [edx + 20H] - paddw mm7, mm7 - movq [edx + 20H], mm2 - paddw mm7, mm4 - movq [edx + 10H], mm1 - psubw mm4, mm3 - paddw mm4, [eax + 38H] - paddw mm3, mm3 - paddw mm3, mm4 - psraw mm4, 4 - psubw mm6, mm5 - psraw mm3, 4 - paddw mm6, [eax + 38H] - paddw mm5, mm5 - paddw mm5, mm6 - psraw mm6, 4 - movq [edx + 40H], mm4 - psraw mm5, 4 - movq [edx + 30H], mm3 - psubw mm7, mm0 - paddw mm7, [eax + 38H] - paddw mm0, mm0 - paddw mm0, mm7 - psraw mm7, 4 - movq [edx + 60H], mm6 - psraw mm0, 4 - movq [edx + 50H], mm5 - movq [edx + 70H], mm7 - movq [edx], mm0 - movq mm2, [edx + 38H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 58H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 18H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 78H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 28H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 68H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 18H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 28H], mm6 - movq mm2, mm0 - movq mm6, [edx + 08H] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 48H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 18H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - paddw mm2, [eax + 38H] - paddw mm1, mm1 - paddw mm1, mm2 - psraw mm2, 4 - psubw mm4, mm7 - psraw mm1, 4 - movq mm3, [edx + 28H] - paddw mm7, mm7 - movq [edx + 28H], mm2 - paddw mm7, mm4 - movq [edx + 18H], mm1 - psubw mm4, mm3 - paddw mm4, [eax + 38H] - paddw mm3, mm3 - paddw mm3, mm4 - psraw mm4, 4 - psubw mm6, mm5 - psraw mm3, 4 - paddw mm6, [eax + 38H] - paddw mm5, mm5 - paddw mm5, mm6 - psraw mm6, 4 - movq [edx + 48H], mm4 - psraw mm5, 4 - movq [edx + 38H], mm3 - psubw mm7, mm0 - paddw mm7, [eax + 38H] - paddw mm0, mm0 - paddw mm0, mm7 - psraw mm7, 4 - movq [edx + 68H], mm6 - psraw mm0, 4 - movq [edx + 58H], mm5 - movq [edx + 78H], mm7 - movq [edx + 08H], mm0 - /* emms */ - } -} - - -void oc_idct8x8_mmx(ogg_int16_t _y[64]){ - _asm { - mov edx, [_y] - mov eax, offset OC_IDCT_CONSTS - movq mm2, [edx + 30H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 18H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 10H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 38H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 20H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 28H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 10H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 20H], mm6 - movq mm2, mm0 - movq mm6, [edx] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 08H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 10H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - movq mm3, [edx + 20H] - psubw mm4, mm7 - paddw mm1, mm1 - paddw mm7, mm7 - paddw mm1, mm2 - paddw mm7, mm4 - psubw mm4, mm3 - paddw mm3, mm3 - psubw mm6, mm5 - paddw mm5, mm5 - paddw mm3, mm4 - paddw mm5, mm6 - psubw mm7, mm0 - paddw mm0, mm0 - movq [edx + 10H], mm1 - paddw mm0, mm7 - movq mm1, mm4 - punpcklwd mm4, mm5 - movq [edx], mm0 - punpckhwd mm1, mm5 - movq mm0, mm6 - punpcklwd mm6, mm7 - movq mm5, mm4 - punpckldq mm4, mm6 - punpckhdq mm5, mm6 - movq mm6, mm1 - movq [edx + 08H], mm4 - punpckhwd mm0, mm7 - movq [edx + 18H], mm5 - punpckhdq mm6, mm0 - movq mm4, [edx] - punpckldq mm1, mm0 - movq mm5, [edx + 10H] - movq mm0, mm4 - movq [edx + 38H], mm6 - punpcklwd mm0, mm5 - movq [edx + 28H], mm1 - punpckhwd mm4, mm5 - movq mm5, mm2 - punpcklwd mm2, mm3 - movq mm1, mm0 - punpckldq mm0, mm2 - punpckhdq mm1, mm2 - movq mm2, mm4 - movq [edx], mm0 - punpckhwd mm5, mm3 - movq [edx + 10H], mm1 - punpckhdq mm4, mm5 - punpckldq mm2, mm5 - movq [edx + 30H], mm4 - movq [edx + 20H], mm2 - movq mm2, [edx + 70H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 58H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 50H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 78H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 60H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 68H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 50H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 60H], mm6 - movq mm2, mm0 - movq mm6, [edx + 40H] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 48H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 50H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - movq mm3, [edx + 60H] - psubw mm4, mm7 - paddw mm1, mm1 - paddw mm7, mm7 - paddw mm1, mm2 - paddw mm7, mm4 - psubw mm4, mm3 - paddw mm3, mm3 - psubw mm6, mm5 - paddw mm5, mm5 - paddw mm3, mm4 - paddw mm5, mm6 - psubw mm7, mm0 - paddw mm0, mm0 - movq [edx + 50H], mm1 - paddw mm0, mm7 - movq mm1, mm4 - punpcklwd mm4, mm5 - movq [edx + 40H], mm0 - punpckhwd mm1, mm5 - movq mm0, mm6 - punpcklwd mm6, mm7 - movq mm5, mm4 - punpckldq mm4, mm6 - punpckhdq mm5, mm6 - movq mm6, mm1 - movq [edx + 48H], mm4 - punpckhwd mm0, mm7 - movq [edx + 58H], mm5 - punpckhdq mm6, mm0 - movq mm4, [edx + 40H] - punpckldq mm1, mm0 - movq mm5, [edx + 50H] - movq mm0, mm4 - movq [edx + 78H], mm6 - punpcklwd mm0, mm5 - movq [edx + 68H], mm1 - punpckhwd mm4, mm5 - movq mm5, mm2 - punpcklwd mm2, mm3 - movq mm1, mm0 - punpckldq mm0, mm2 - punpckhdq mm1, mm2 - movq mm2, mm4 - movq [edx + 40H], mm0 - punpckhwd mm5, mm3 - movq [edx + 50H], mm1 - punpckhdq mm4, mm5 - punpckldq mm2, mm5 - movq [edx + 70H], mm4 - movq [edx + 60H], mm2 - movq mm2, [edx + 30H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 50H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 10H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 70H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 20H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 60H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 10H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 20H], mm6 - movq mm2, mm0 - movq mm6, [edx] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 40H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 10H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - paddw mm2, [eax + 38H] - paddw mm1, mm1 - paddw mm1, mm2 - psraw mm2, 4 - psubw mm4, mm7 - psraw mm1, 4 - movq mm3, [edx + 20H] - paddw mm7, mm7 - movq [edx + 20H], mm2 - paddw mm7, mm4 - movq [edx + 10H], mm1 - psubw mm4, mm3 - paddw mm4, [eax + 38H] - paddw mm3, mm3 - paddw mm3, mm4 - psraw mm4, 4 - psubw mm6, mm5 - psraw mm3, 4 - paddw mm6, [eax + 38H] - paddw mm5, mm5 - paddw mm5, mm6 - psraw mm6, 4 - movq [edx + 40H], mm4 - psraw mm5, 4 - movq [edx + 30H], mm3 - psubw mm7, mm0 - paddw mm7, [eax + 38H] - paddw mm0, mm0 - paddw mm0, mm7 - psraw mm7, 4 - movq [edx + 60H], mm6 - psraw mm0, 4 - movq [edx + 50H], mm5 - movq [edx + 70H], mm7 - movq [edx], mm0 - movq mm2, [edx + 38H] - movq mm6, [eax + 10H] - movq mm4, mm2 - movq mm7, [edx + 58H] - pmulhw mm4, mm6 - movq mm1, [eax + 20H] - pmulhw mm6, mm7 - movq mm5, mm1 - pmulhw mm1, mm2 - movq mm3, [edx + 18H] - pmulhw mm5, mm7 - movq mm0, [eax] - paddw mm4, mm2 - paddw mm6, mm7 - paddw mm2, mm1 - movq mm1, [edx + 78H] - paddw mm7, mm5 - movq mm5, mm0 - pmulhw mm0, mm3 - paddw mm4, mm7 - pmulhw mm5, mm1 - movq mm7, [eax + 30H] - psubw mm6, mm2 - paddw mm0, mm3 - pmulhw mm3, mm7 - movq mm2, [edx + 28H] - pmulhw mm7, mm1 - paddw mm5, mm1 - movq mm1, mm2 - pmulhw mm2, [eax + 08H] - psubw mm3, mm5 - movq mm5, [edx + 68H] - paddw mm0, mm7 - movq mm7, mm5 - psubw mm0, mm4 - pmulhw mm5, [eax + 08H] - paddw mm2, mm1 - pmulhw mm1, [eax + 28H] - paddw mm4, mm4 - paddw mm4, mm0 - psubw mm3, mm6 - paddw mm5, mm7 - paddw mm6, mm6 - pmulhw mm7, [eax + 28H] - paddw mm6, mm3 - movq [edx + 18H], mm4 - psubw mm1, mm5 - movq mm4, [eax + 18H] - movq mm5, mm3 - pmulhw mm3, mm4 - paddw mm7, mm2 - movq [edx + 28H], mm6 - movq mm2, mm0 - movq mm6, [edx + 08H] - pmulhw mm0, mm4 - paddw mm5, mm3 - movq mm3, [edx + 48H] - psubw mm5, mm1 - paddw mm2, mm0 - psubw mm6, mm3 - movq mm0, mm6 - pmulhw mm6, mm4 - paddw mm3, mm3 - paddw mm1, mm1 - paddw mm3, mm0 - paddw mm1, mm5 - pmulhw mm4, mm3 - paddw mm6, mm0 - psubw mm6, mm2 - paddw mm2, mm2 - movq mm0, [edx + 18H] - paddw mm2, mm6 - paddw mm4, mm3 - psubw mm2, mm1 - paddw mm2, [eax + 38H] - paddw mm1, mm1 - paddw mm1, mm2 - psraw mm2, 4 - psubw mm4, mm7 - psraw mm1, 4 - movq mm3, [edx + 28H] - paddw mm7, mm7 - movq [edx + 28H], mm2 - paddw mm7, mm4 - movq [edx + 18H], mm1 - psubw mm4, mm3 - paddw mm4, [eax + 38H] - paddw mm3, mm3 - paddw mm3, mm4 - psraw mm4, 4 - psubw mm6, mm5 - psraw mm3, 4 - paddw mm6, [eax + 38H] - paddw mm5, mm5 - paddw mm5, mm6 - psraw mm6, 4 - movq [edx + 48H], mm4 - psraw mm5, 4 - movq [edx + 38H], mm3 - psubw mm7, mm0 - paddw mm7, [eax + 38H] - paddw mm0, mm0 - paddw mm0, mm7 - psraw mm7, 4 - movq [edx + 68H], mm6 - psraw mm0, 4 - movq [edx + 58H], mm5 - movq [edx + 78H], mm7 - movq [edx + 08H], mm0 - /* emms */ - } -} - -#endif - +#endif \ No newline at end of file Index: lib/dec/x86_vc/x86int.h =================================================================== --- lib/dec/x86_vc/x86int.h (revision 14720) +++ lib/dec/x86_vc/x86int.h (working copy) @@ -46,53 +46,4 @@ void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv, int _refi,int _pli,int _fragy0,int _fragy_end); -#endif -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id$ - - ********************************************************************/ - -#if !defined(_x86_x86int_vc_H) -# define _x86_x86int_vc_H (1) -# include "../../internal.h" - -void oc_state_vtable_init_x86(oc_theora_state *_state); - -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride, - const ogg_int16_t *_residue); - -void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride, - const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue); - -void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride, - const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2, - int _src2_ystride,const ogg_int16_t *_residue); - -void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis, - int _nfragis,int _dst_frame,int _src_frame,int _pli); - -void oc_restore_fpu_mmx(void); - -void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, - ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]); - -void oc_idct8x8_mmx(ogg_int16_t _y[64]); -void oc_idct8x8_10_mmx(ogg_int16_t _y[64]); - -void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv, - int _refi,int _pli,int _fragy0,int _fragy_end); - -#endif +#endif