[xiph-commits] r17336 - in experimental/derf/theora-ptalarbvorm/lib: . c64x
tterribe at svn.xiph.org
Sun Jul 18 20:43:43 PDT 2010
Author: tterribe
Date: 2010-07-18 20:43:42 -0700 (Sun, 18 Jul 2010)
New Revision: 17336
Added:
experimental/derf/theora-ptalarbvorm/lib/c64x/
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
Log:
Add new files that were supposed to be added with r17334.
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,154 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#include "c64xdec.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_dec_vtable_init_c64x(oc_dec_ctx *_dec){
+ _dec->opt_vtable.dc_unpredict_mcu_plane=oc_dec_dc_unpredict_mcu_plane_c64x;
+}
+
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+ rows).
+ As a side effect, the number of coded and uncoded fragments in this plane of
+ the MCU is also computed.*/
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+ const oc_fragment_plane *fplane;
+ oc_fragment *frags;
+ int *pred_last;
+ ptrdiff_t ncoded_fragis;
+ ptrdiff_t fragi;
+ int fragx;
+ int fragy;
+ int fragy0;
+ int fragy_end;
+ int nhfrags;
+ /*Compute the first and last fragment row of the current MCU for this
+ plane.*/
+ fplane=_dec->state.fplanes+_pli;
+ fragy0=_pipe->fragy0[_pli];
+ fragy_end=_pipe->fragy_end[_pli];
+ nhfrags=fplane->nhfrags;
+ pred_last=_pipe->pred_last[_pli];
+ frags=_dec->state.frags;
+ ncoded_fragis=0;
+ fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+ for(fragy=fragy0;fragy<fragy_end;fragy++){
+ if(fragy==0){
+ /*For the first row, all of the cases reduce to just using the previous
+ predictor for the same reference frame.*/
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ int coded;
+ int ref;
+ /*The TI compiler refuses to pipeline this if we put it in an if(coded)
+ block.
+ We can do the loads unconditionally, which helps move them earlier.
+          We do the store unconditionally too, because if we use a conditional
+ store, the compiler propagates the condition back to the operations
+ the store depended on, presumably to reduce cache pressure by
+ eliminating dead loads.
+ However, these loads are "free" in the cache sense, since reading the
+ coded flag brings in all four bytes anyway, and starting the loads
+ before we know the coded flag saves 6 cycles.*/
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ coded=frags[fragi].coded;
+ frags[fragi].dc=pred_last[ref]+=frags[fragi].dc&-coded;
+ ncoded_fragis+=coded;
+ }
+ }
+ else{
+ oc_fragment *u_frags;
+ int l_ref;
+ int ul_ref;
+ int u_ref;
+ u_frags=frags-nhfrags;
+ l_ref=-1;
+ ul_ref=-1;
+ u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ int ur_ref;
+ int ref;
+ if(fragx+1>=nhfrags)ur_ref=-1;
+ else{
+ ur_ref=u_frags[fragi+1].coded?
+ OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+ }
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ /*HACK: This p0 reference could potentially be out of bounds, but
+ because we know what allocator we're using, we know it can't
+ segfault.*/
+ if(frags[fragi].coded){
+ static const int OC_PRED_SCALE[16][2]={
+ {0x00000000,0x00000000},
+ {0x00000000,0x00000080},
+ {0x00800000,0x00000000},
+ {0x00000000,0x00000080},
+ {0x00000080,0x00000000},
+ {0x00000040,0x00000040},
+ {0x00000080,0x00000000},
+ {0xFF980074,0x00000074},
+ {0x00000000,0x00800000},
+ {0x00000000,0x0035004B},
+ {0x00400000,0x00400000},
+ {0x00000000,0x0035004B},
+ {0x00000080,0x00000000},
+ {0x00000000,0x0035004B},
+ {0x00180050,0x00180000},
+ {0xFF980074,0x00000074},
+ };
+ ogg_int16_t p0;
+ ogg_int16_t p1;
+ ogg_int16_t p2;
+ ogg_int16_t p3;
+ int pred;
+ int pflags;
+ /*29 cycles.*/
+ p0=u_frags[fragi-1].dc;
+ p1=u_frags[fragi].dc;
+ p2=u_frags[fragi+1].dc;
+ p3=frags[fragi-1].dc;
+ pflags=_cmpeq4(_packl4(_pack2(ur_ref,u_ref),_pack2(ul_ref,l_ref)),
+ _packl4(_pack2(ref,ref),_pack2(ref,ref)));
+ if(pflags==0)pred=pred_last[ref];
+ else{
+ pred=(_dotp2(_pack2(p0,p1),OC_PRED_SCALE[pflags][0])
+ +_dotp2(_pack2(p2,p3),OC_PRED_SCALE[pflags][1]))/128;
+ if((pflags&7)==7){
+ if(abs(pred-p1)>128)pred=p1;
+ else if(abs(pred-p3)>128)pred=p3;
+ else if(abs(pred-p0)>128)pred=p0;
+ }
+ }
+ pred_last[ref]=frags[fragi].dc+=pred;
+ ncoded_fragis++;
+ l_ref=ref;
+ }
+ else l_ref=-1;
+ ul_ref=u_ref;
+ u_ref=ur_ref;
+ }
+ }
+ }
+ _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+ /*Also save the number of uncoded fragments so we know how many to copy.*/
+ _pipe->nuncoded_fragis[_pli]=
+ (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+#endif
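
A note on two idioms above, since they recur throughout this port: in the
first-row loop, -coded is all ones exactly when the fragment is coded, so
dc&-coded adds the residual without a branch; and the _pack2/_packl4/_cmpeq4
sequence builds a 4-bit mask of which neighbors use the current reference
frame, which then indexes OC_PRED_SCALE. A scalar model of both in plain C
(illustrative names; not part of this commit):

  #include <assert.h>

  /*Scalar model of pflags: bit0=left, bit1=up-left, bit2=up, bit3=up-right,
    set when that neighbor uses the same reference frame.*/
  static int pred_flags(int l_ref,int ul_ref,int u_ref,int ur_ref,int ref){
    return (l_ref==ref)|(ul_ref==ref)<<1|(u_ref==ref)<<2|(ur_ref==ref)<<3;
  }

  int main(void){
    int pred_last;
    int coded;
    pred_last=7;
    /*Coded fragment: the predictor advances by the residual DC (here 3).*/
    coded=1;
    pred_last+=3&-coded;
    assert(pred_last==10);
    /*Uncoded fragment: 3&-coded is 0 and the predictor is unchanged.*/
    coded=0;
    pred_last+=3&-coded;
    assert(pred_last==10);
    /*Left and up neighbors share our reference frame: bits 0 and 2.*/
    assert(pred_flags(1,-1,1,-1,1)==0x5);
    return 0;
  }
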
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,34 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_c64x_c64xdec_H)
+# define _c64x_c64xdec_H (1)
+# include "c64xint.h"
+
+# if defined(OC_C64X_ASM)
+# if !defined(oc_dec_dc_unpredict_mcu_plane)
+# define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c64x
+# endif
+# endif
+
+# include "../decint.h"
+
+void oc_dec_vtable_init_c64x(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
+#endif
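
The !defined() guard above is what lets a build stack ports: whichever header
binds oc_dec_dc_unpredict_mcu_plane first wins, and c64xdec.h steps aside. A
toy illustration of the guard pattern (hypothetical names, unrelated to the
real headers):

  #include <stdio.h>

  #define do_thing do_thing_c64x      /*A port claims the symbol first.*/
  #if !defined(do_thing)
  # define do_thing do_thing_generic  /*Skipped: do_thing is already bound.*/
  #endif

  static void do_thing_c64x(void){puts("c64x");}

  int main(void){
    do_thing();
    return 0;
  }
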
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,451 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+
+
+
+/*14 cycles.*/
+void oc_frag_copy_c64x(unsigned char *restrict _dst,
+ const unsigned char *restrict _src,int _ystride){
+ unsigned char *restrict d2;
+ const unsigned char *restrict s2;
+ d2=_dst+_ystride;
+ s2=_src+_ystride;
+#define OC_ITER() \
+ do{ \
+ _amem8(_dst)=_mem8(_src); \
+ _dst+=2*_ystride; \
+ _src+=2*_ystride; \
+ _amem8(d2)=_mem8(s2); \
+ d2+=2*_ystride; \
+ s2+=2*_ystride; \
+ } \
+ while(0)
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+#undef OC_ITER
+}
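
_mem8() and _amem8() are the TI intrinsics for unaligned and aligned 8-byte
accesses, so each OC_ITER() above moves two full rows of the 8x8 fragment. A
portable approximation of the same routine, using memcpy for the 8-byte moves
(a sketch, not the library's generic C version):

  #include <string.h>

  static void frag_copy_portable(unsigned char *dst,
   const unsigned char *src,int ystride){
    int i;
    /*Copy one 8x8 fragment, an 8-byte row at a time.*/
    for(i=0;i<8;i++){
      memcpy(dst,src,8);
      dst+=ystride;
      src+=ystride;
    }
  }

  int main(void){
    unsigned char src[64];
    unsigned char dst[64];
    memset(src,1,sizeof(src));
    frag_copy_portable(dst,src,8);
    return 0;
  }
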
+
+/*34 cycles.*/
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t _residue[64]){
+ int i;
+ for(i=0;i<8;i++){
+ long long ll;
+ int x1;
+ int y1;
+ int x2;
+ int y2;
+ ll=_amem8_const(_residue+i*8+0);
+ x1=_sadd2(_loll(ll),0x00800080);
+ y1=_sadd2(_hill(ll),0x00800080);
+ ll=_amem8_const(_residue+i*8+4);
+ x2=_sadd2(_loll(ll),0x00800080);
+ y2=_sadd2(_hill(ll),0x00800080);
+ _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+ _dst+=_ystride;
+ }
+}
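
Per pixel, intra reconstruction is just residue+128 clipped to [0,255]:
_sadd2() applies the bias to two 16-bit samples at once (0x00800080 is two
packed 128s) and _spacku4() supplies the clipping when it packs to bytes. The
scalar equivalent (a sketch, not library code):

  #include <assert.h>

  static unsigned char recon_intra_pixel(int res){
    int v;
    v=res+128;
    /*Clamp to the pixel range, as _spacku4() does while packing.*/
    return (unsigned char)(v<0?0:v>255?255:v);
  }

  int main(void){
    assert(recon_intra_pixel(-500)==0);
    assert(recon_intra_pixel(0)==128);
    assert(recon_intra_pixel(500)==255);
    return 0;
  }
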
+
+/*41 cycles.*/
+void oc_frag_recon_inter_c64x(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t _residue[64]){
+ int i;
+ for(i=0;i<8;i++){
+ long long ll;
+ int x1;
+ int y1;
+ int z1;
+ int x2;
+ int y2;
+ int z2;
+ ll=_mem8_const(_src);
+ z1=_loll(ll);
+ z2=_hill(ll);
+ ll=_amem8_const(_residue+i*8+0);
+ x1=_sadd2(_unpklu4(z1),_loll(ll));
+ y1=_sadd2(_unpkhu4(z1),_hill(ll));
+ ll=_amem8_const(_residue+i*8+4);
+ x2=_sadd2(_unpklu4(z2),_loll(ll));
+ y2=_sadd2(_unpkhu4(z2),_hill(ll));
+ _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+ _dst+=_ystride;
+ _src+=_ystride;
+ }
+}
+
+/*56 cycles.*/
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t _residue[64]){
+ int i;
+ for(i=0;i<8;i++){
+ long long ll;
+ int a;
+ int b;
+ int c;
+ int d;
+ int x1;
+ int y1;
+ int z1;
+ int x2;
+ int y2;
+ int z2;
+ ll=_mem8_const(_src1);
+ a=_loll(ll);
+ b=_hill(ll);
+ ll=_mem8_const(_src2);
+ c=_loll(ll);
+ d=_hill(ll);
+ ll=_amem8_const(_residue+i*8+0);
+ z1=~_avgu4(~a,~c);
+ x1=_sadd2(_unpklu4(z1),_loll(ll));
+ y1=_sadd2(_unpkhu4(z1),_hill(ll));
+ ll=_amem8_const(_residue+i*8+4);
+ z2=~_avgu4(~b,~d);
+ x2=_sadd2(_unpklu4(z2),_loll(ll));
+ y2=_sadd2(_unpkhu4(z2),_hill(ll));
+ _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+ _dst+=_ystride;
+ _src1+=_ystride;
+ _src2+=_ystride;
+ }
+}
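
The z1=~_avgu4(~a,~c) idiom is worth unpacking: _avgu4() rounds its per-byte
average up, but the two-reference average must round down, and complementing
both inputs and the output flips the bias. A brute-force check of that
identity on one byte lane (plain C model of the intrinsic):

  #include <assert.h>

  /*One byte lane of _avgu4(): average rounded up.*/
  static unsigned avgu(unsigned a,unsigned b){
    return a+b+1>>1;
  }

  int main(void){
    unsigned a;
    unsigned b;
    for(a=0;a<256;a++){
      for(b=0;b<256;b++){
        /*~avgu(~a,~b) is the average rounded down.*/
        assert((~avgu(~a&0xFF,~b&0xFF)&0xFF)==a+b>>1);
      }
    }
    return 0;
  }
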
+
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ unsigned char *dst;
+ ptrdiff_t frag_buf_off;
+ int ystride;
+ int mb_mode;
+ /*Apply the inverse transform.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ ogg_int16_t p;
+ int ci;
+ /*We round this dequant product (and not any of the others) because there's
+ no iDCT rounding.*/
+ p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+ /*LOOP VECTORIZES.*/
+ for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+ }
+ else{
+ /*First, dequantize the DC coefficient.*/
+ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+ oc_idct8x8_c64x(_dct_coeffs,_last_zzi);
+ }
+ /*Fill in the target buffer.*/
+ frag_buf_off=_state->frag_buf_offs[_fragi];
+ mb_mode=_state->frags[_fragi].mb_mode;
+ ystride=_state->ref_ystride[_pli];
+ dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+ if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs);
+ else{
+ const unsigned char *ref;
+ int mvoffsets[2];
+ ref=
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+ +frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+ _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+ oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
+ ystride,_dct_coeffs);
+ }
+ else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ }
+}
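
For a DC-only block (_last_zzi<2) the whole iDCT collapses to a single value,
so the 1/32 output scaling and its rounding are folded straight into the
dequantization above. A quick numeric check of that expression (plain C):

  #include <assert.h>

  int main(void){
    int dc;
    int q;
    dc=9;   /*Quantized DC coefficient.*/
    q=100;  /*DC quantizer.*/
    /*Dequantize and scale by 1/32, rounding once, as in the DC-only path.*/
    assert((dc*q+15>>5)==28);
    return 0;
  }
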
+
+void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+ const ptrdiff_t *frag_buf_offs;
+ const unsigned char *src_frame_data;
+ unsigned char *dst_frame_data;
+ ptrdiff_t fragii;
+ int ystride;
+ dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
+ src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
+ ystride=_state->ref_ystride[_pli];
+ frag_buf_offs=_state->frag_buf_offs;
+ /*9 cycles per iteration.*/
+ for(fragii=0;fragii<_nfragis;fragii++){
+ const unsigned char *restrict src;
+ const unsigned char *restrict s2;
+ unsigned char *restrict dst;
+ unsigned char *restrict d2;
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=frag_buf_offs[_fragis[fragii]];
+ dst=dst_frame_data+frag_buf_off;
+ src=src_frame_data+frag_buf_off;
+ d2=dst+ystride;
+ s2=src+ystride;
+#define OC_ITER() \
+ do{ \
+ _amem8(dst)=_amem8_const(src); \
+ dst+=2*ystride; \
+ src+=2*ystride; \
+ _amem8(d2)=_amem8_const(s2); \
+ d2+=2*ystride; \
+ s2+=2*ystride; \
+ } \
+ while(0)
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+#undef OC_ITER
+ }
+}
+
+/*46 cycles.*/
+static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
+ int p0;
+ int p1;
+ int p2;
+ int p3;
+ int p4;
+ int p5;
+ int p6;
+ int p7;
+ int y;
+ _pix-=2;
+ /*Do all the loads now to avoid the compiler's inability to prove they're not
+ dependent on the stores later.*/
+ p0=_mem4(_pix+_ystride*0);
+ p1=_mem4(_pix+_ystride*1);
+ p2=_mem4(_pix+_ystride*2);
+ p3=_mem4(_pix+_ystride*3);
+ p4=_mem4(_pix+_ystride*4);
+ p5=_mem4(_pix+_ystride*5);
+ p6=_mem4(_pix+_ystride*6);
+ p7=_mem4(_pix+_ystride*7);
+ for(y=0;y<8;y+=4){
+ int f;
+ int a;
+ int b;
+ int u;
+ int v;
+ /*We could pack things right after the dot product, but delaying it
+ actually saves three cycles due to better instruction scheduling.*/
+ a=_dotpsu4(0x01FD03FF,p0)+3>>3;
+ b=_dotpsu4(0x01FD03FF,p1)+3>>3;
+ u=_dotpsu4(0x01FD03FF,p2)+3>>3;
+ v=_dotpsu4(0x01FD03FF,p3)+3>>3;
+ f=_packl4(_pack2(v,u),_pack2(b,a));
+ /*We split the results by sign and work with abs(f) here, since the C64x
+ signed-unsigned addition with unsigned saturation is only available for
+ 16-bit operands.
+ For 8-bit operands, we have to emulate it with a saturated addition and a
+ saturated subtraction using separate unsigned values.
+ There's no direct support for 8-bit saturated subtraction, either, so we
+ have to emulate that as well, using either x-_minu4(x,y) or
+ ~_saddu4(~x,y), depending on which one schedules better.*/
+ f=_add4(0x80808080,f);
+ b=_minu4(0x80808080,f);
+ a=0x80808080-b;
+ b=f-b;
+ /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+ u=_saddu4(a,_ll);
+ v=_saddu4(b,_ll);
+ a=_saddu4(a,u);
+ b=_saddu4(b,v);
+ a=a-_minu4(a,u);
+ b=b-_minu4(b,v);
+ /*Apply the changes to the original pixels.*/
+ u=_pack2(p1>>8,p0>>8);
+ v=_pack2(p3>>8,p2>>8);
+ p1=_packl4(v,u);
+ p2=_packh4(v,u);
+ p1=_saddu4(~_saddu4(~p1,b),a);
+ p2=_saddu4(p2-_minu4(p2,a),b);
+ /*For unaligned short stores, we have to store byte by byte.
+ It's faster to do it explicitly than to use _mem2().*/
+ _pix[_ystride*0+1]=(unsigned char)p1;
+ _pix[_ystride*0+2]=(unsigned char)p2;
+ _pix[_ystride*1+1]=(unsigned char)(p1>>8);
+ _pix[_ystride*1+2]=(unsigned char)(p2>>8);
+ _pix[_ystride*2+1]=(unsigned char)(p1>>16);
+ _pix[_ystride*2+2]=(unsigned char)(p2>>16);
+ _pix[_ystride*3+1]=(unsigned char)(p1>>24);
+ _pix[_ystride*3+2]=(unsigned char)(p2>>24);
+ p0=p4;
+ p1=p5;
+ p2=p6;
+ p3=p7;
+ _pix+=4*_ystride;
+ }
+}
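
The constant 0x01FD03FF packs the filter taps as signed bytes (+1, -3, +3, -1
from the most significant lane down), so each _dotpsu4() above yields the
negated filter response for one row. A scalar reference for a single edge
position, following the comments above (a sketch assuming arithmetic right
shifts, not the project's C fallback):

  #include <stdio.h>
  #include <stdlib.h>

  static int clampi(int lo,int x,int hi){
    return x<lo?lo:x>hi?hi:x;
  }

  /*Filter the edge between p[1] and p[2]; ll is twice the filter limit L.*/
  static void loop_filter_scalar(unsigned char p[4],int ll){
    int f;
    int m;
    /*Assumes arithmetic right shift of negative values.*/
    f=p[0]-p[3]+3*(p[2]-p[1])+4>>3;
    /*Limit the magnitude to clamp(0,2*L-|f|,|f|), keeping the sign.*/
    m=clampi(0,ll-abs(f),abs(f));
    f=f<0?-m:m;
    p[1]=(unsigned char)clampi(0,p[1]+f,255);
    p[2]=(unsigned char)clampi(0,p[2]-f,255);
  }

  int main(void){
    unsigned char p[4]={60,40,80,70};
    loop_filter_scalar(p,2*10);
    printf("%d %d\n",p[1],p[2]);/*Prints 46 74.*/
    return 0;
  }
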
+
+/*38 cycles.*/
+static void loop_filter_v(unsigned char * restrict _pix,int _ystride,int _ll){
+ long long ll;
+ int p0;
+ int p1;
+ int p2;
+ int p3;
+ int p4;
+ int p5;
+ int p6;
+ int p7;
+ int a1;
+ int b1;
+ int f1;
+ int m1;
+ int u1;
+ int v1;
+ int a2;
+ int b2;
+ int f2;
+ int m2;
+ int u2;
+ int v2;
+ /*Do all the loads now to avoid the compiler's inability to prove they're not
+ dependent on the stores later.*/
+ ll=_amem8(_pix-_ystride*2);
+ p0=_loll(ll);
+ p4=_hill(ll);
+ ll=_amem8(_pix-_ystride*1);
+ p1=_loll(ll);
+ p5=_hill(ll);
+ ll=_amem8(_pix+_ystride*0);
+ p2=_loll(ll);
+ p6=_hill(ll);
+ ll=_amem8(_pix+_ystride*1);
+ p3=_loll(ll);
+ p7=_hill(ll);
+ /*I can't find a way to put the rest in a loop that the compiler thinks is
+ unrollable, so instead it's unrolled manually.*/
+ /*This first part is based on the transformation
+ f = -(3*(p2-p1)+p0-p3+4>>3)
+ = -(3*(p2+255-p1)+(p0+255-p3)+4-1020>>3)
+ = -(3*(p2+~p1)+(p0+~p3)-1016>>3)
+ = 127-(3*(p2+~p1)+(p0+~p3)>>3)
+ = 128+~(3*(p2+~p1)+(p0+~p3)>>3) (mod 256).
+ Although _avgu4(a,b) = (a+b+1>>1) (biased up), we rely heavily on the
+ fact that ~_avgu4(~a,~b) = (a+b>>1) (biased down).*/
+ /*We need this first average both biased up and biased down.*/
+ u1=~_avgu4(~p1,p2);
+ v1=_avgu4(p1,~p2);
+ /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
+ m1=_sub4(u1,v1);
+ a1=m1^_avgu4(m1^~p0,m1^p3);
+ f1=_avgu4(_avgu4(a1,u1),v1);
+ /*Instead of removing the bias by 128, we use it to split f by sign, since
+ the C64x signed-unsigned addition with unsigned saturation is only
+ available for 16-bit operands.
+ For 8-bit operands, we have to emulate it with a saturated addition and a
+ saturated subtraction using separate unsigned values.
+ There's no direct support for 8-bit saturated subtraction, either, so we
+ have to emulate that as well, using either x-_minu4(x,y) or
+ ~_saddu4(~x,y), depending on which one schedules better.*/
+ b1=_minu4(0x80808080,f1);
+ a1=0x80808080-b1;
+ b1=f1-b1;
+ /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+ u1=_saddu4(a1,_ll);
+ v1=_saddu4(b1,_ll);
+ a1=_saddu4(a1,u1);
+ b1=_saddu4(b1,v1);
+ a1=a1-_minu4(a1,u1);
+ b1=b1-_minu4(b1,v1);
+ /*Apply the changes to the original pixels.*/
+ p1=_saddu4(p1-_minu4(p1,b1),a1);
+ p2=_saddu4(p2-_minu4(p2,a1),b1);
+ /*We need this first average both biased up and biased down.*/
+ u2=~_avgu4(~p5,p6);
+ v2=_avgu4(p5,~p6);
+  /*The difference controls whether (p7+255-p4>>1) is biased up or down.*/
+ m2=_sub4(u2,v2);
+ a2=m2^_avgu4(m2^~p4,m2^p7);
+ f2=_avgu4(_avgu4(a2,u2),v2);
+ /*Instead of removing the bias by 128, we use it to split f by sign.*/
+ b2=_minu4(0x80808080,f2);
+ a2=0x80808080-b2;
+ b2=f2-b2;
+ /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+ u2=_saddu4(a2,_ll);
+ v2=_saddu4(b2,_ll);
+ a2=_saddu4(a2,u2);
+ b2=_saddu4(b2,v2);
+ a2=a2-_minu4(a2,u2);
+ b2=b2-_minu4(b2,v2);
+ /*Apply the changes to the original pixels.*/
+ p5=_saddu4(p5-_minu4(p5,b2),a2);
+ p6=_saddu4(p6-_minu4(p6,a2),b2);
+ /*Write out the results.*/
+ _amem8(_pix-_ystride)=_itoll(p5,p1);
+ _amem8(_pix)=_itoll(p6,p2);
+}
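
Both filters realize the clamp in three saturating byte operations: with
u=saddu(x,255-2*L), the quantity saddu(x,u)-minu(saddu(x,u),u) equals
clamp(0,2*L-x,x), which is also why _ll is stored bit-inverted. A brute-force
verification on one byte lane (plain C models of _saddu4/_minu4):

  #include <assert.h>

  static unsigned saddu(unsigned a,unsigned b){
    return a+b>255?255:a+b;
  }

  static unsigned minu(unsigned a,unsigned b){
    return a<b?a:b;
  }

  int main(void){
    int l2;
    int x;
    /*l2 is 2*L; x is abs(f).*/
    for(l2=0;l2<256;l2++){
      for(x=0;x<256;x++){
        unsigned u;
        unsigned a;
        int expect;
        u=saddu(x,255-l2);
        a=saddu(x,u);
        a-=minu(a,u);
        expect=l2-x<0?0:l2-x>x?x:l2-x;
        assert((int)a==expect);
      }
    }
    return 0;
  }
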
+
+
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+ const oc_fragment_plane *fplane;
+ const oc_fragment *frags;
+ const ptrdiff_t *frag_buf_offs;
+ unsigned char *ref_frame_data;
+ ptrdiff_t fragi_top;
+ ptrdiff_t fragi_bot;
+ ptrdiff_t fragi0;
+ ptrdiff_t fragi0_end;
+ int ystride;
+ int nhfrags;
+ int ll;
+ fplane=_state->fplanes+_pli;
+ nhfrags=fplane->nhfrags;
+ fragi_top=fplane->froffset;
+ fragi_bot=fragi_top+fplane->nfrags;
+ fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ ystride=_state->ref_ystride[_pli];
+ frags=_state->frags;
+ frag_buf_offs=_state->frag_buf_offs;
+ ref_frame_data=_state->ref_frame_data[_refi];
+ ll=_state->loop_filter_limits[_state->qis[0]]<<1;
+ ll=_pack2(ll,ll);
+ ll=~_spacku4(ll,ll);
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ while(fragi0<fragi0_end){
+ ptrdiff_t fragi;
+ ptrdiff_t fragi_end;
+ fragi=fragi0;
+ fragi_end=fragi+nhfrags;
+ while(fragi<fragi_end){
+ if(frags[fragi].coded){
+ unsigned char *ref;
+ ref=ref_frame_data+frag_buf_offs[fragi];
+ if(fragi>fragi0)loop_filter_h(ref,ystride,ll);
+ if(fragi0>fragi_top)loop_filter_v(ref,ystride,ll);
+ if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+ loop_filter_h(ref+8,ystride,ll);
+ }
+ if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+ loop_filter_v(ref+(ystride<<3),ystride,ll);
+ }
+ }
+ fragi++;
+ }
+ fragi0+=nhfrags;
+ }
+}
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,399 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+#include "dct.h"
+
+#define DOC_C1S7 ((OC_C1S7<<16)|(OC_C1S7&0xffff))
+#define DOC_C2S6 ((OC_C2S6<<16)|(OC_C2S6&0xffff))
+#define DOC_C3S5 ((OC_C3S5<<16)|(OC_C3S5&0xffff))
+#define DOC_C4S4 ((OC_C4S4<<16)|(OC_C4S4&0xffff))
+#define DOC_C5S3 ((OC_C5S3<<16)|(OC_C5S3&0xffff))
+#define DOC_C6S2 ((OC_C6S2<<16)|(OC_C6S2&0xffff))
+#define DOC_C7S1 ((OC_C7S1<<16)|(OC_C7S1&0xffff))
+
+/*Various building blocks for the iDCT implementations.
+ These are done in macros instead of functions so that we can use all local
+ variables, which avoids leaving the compiler to try to sort out memory
+ reference dependencies.*/
+
+/*Load two rows into x0...x7.*/
+#define OC_IDCT8x2_LOAD8(_x) \
+ do{ \
+ long long ll; \
+ ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+ x0=_loll(ll); \
+ x1=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+ x2=_loll(ll); \
+ x3=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
+ x4=_loll(ll); \
+ x5=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
+ x6=_loll(ll); \
+ x7=_hill(ll); \
+ } \
+ while(0)
+
+/*Load two rows into x0...x3.
+ Uses ll as a temporary.*/
+#define OC_IDCT8x2_LOAD4(_x) \
+ do{ \
+ long long ll; \
+ ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+ x0=_loll(ll); \
+ x1=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+ x2=_loll(ll); \
+ x3=_hill(ll); \
+ } \
+ while(0)
+
+/*Load two rows into x0...x1.*/
+#define OC_IDCT8x2_LOAD2(_x) \
+ do{ \
+ long long ll; \
+ ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+ x0=_loll(ll); \
+ x1=_hill(ll); \
+ } \
+ while(0)
+
+/*Load two columns into x0...x1.*/
+#define OC_IDCT8x2_LOAD2T(_x) \
+ do{ \
+ x0=_amem4_const((_x)+(0<<3)); \
+ x1=_amem4_const((_x)+(1<<3)); \
+ } \
+ while(0)
+
+/*Transform x0...x7 into t0...t7.*/
+#define OC_IDCT8x2() \
+ do{ \
+ long long ll; \
+ int a; \
+ int b; \
+ /*Stage 1:*/ \
+ ll=_addsub2(x0,x4); \
+ a=_hill(ll); \
+ b=_loll(ll); \
+ t0=_packh2(_mpyhus(DOC_C4S4,a),_mpyus(DOC_C4S4,a)); \
+ t1=_packh2(_mpyhus(DOC_C4S4,b),_mpyus(DOC_C4S4,b)); \
+ ll=_mpy2ll(DOC_C6S2,x2); \
+ a=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C2S6,x6); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x6); \
+ t2=_sub2(a,b); \
+ ll=_mpy2ll(DOC_C2S6,x2); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+ ll=_mpy2ll(DOC_C6S2,x6); \
+ b=_packh2(_hill(ll),_loll(ll)); \
+ t3=_add2(a,b); \
+ ll=_mpy2ll(DOC_C7S1,x1); \
+ a=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C1S7,x7); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x7); \
+ t4=_sub2(a,b); \
+ ll=_mpy2ll(DOC_C3S5,x5); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+ ll=_mpy2ll(DOC_C5S3,x3); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ t5=_sub2(a,b); \
+ ll=_mpy2ll(DOC_C5S3,x5); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+ ll=_mpy2ll(DOC_C3S5,x3); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ t6=_add2(a,b); \
+ ll=_mpy2ll(DOC_C1S7,x1); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+ ll=_mpy2ll(DOC_C7S1,x7); \
+ b=_packh2(_hill(ll),_loll(ll)); \
+ t7=_add2(a,b); \
+ /*Stage 2:*/ \
+ ll=_addsub2(t4,t5); \
+ t4=_hill(ll); \
+ b=_loll(ll); \
+ ll=_mpy2ll(DOC_C4S4,b); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+ ll=_addsub2(t7,t6); \
+ t7=_hill(ll); \
+ b=_loll(ll); \
+ ll=_mpy2ll(DOC_C4S4,b); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+ /*Stage 3:*/ \
+ ll=_addsub2(t0,t3); \
+ t0=_hill(ll); \
+ t3=_loll(ll); \
+ ll=_addsub2(t1,t2); \
+ t1=_hill(ll); \
+ t2=_loll(ll); \
+ ll=_addsub2(t6,t5); \
+ t6=_hill(ll); \
+ t5=_loll(ll); \
+ } \
+ while(0)
+
+/*Transform x0...x3 into t0...t7, assuming x4...x7 are zero.*/
+#define OC_IDCT8x2_4() \
+ do{ \
+ long long ll; \
+ int a; \
+ /*Stage 1:*/ \
+ ll=_mpy2ll(DOC_C4S4,x0); \
+ t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+ t1=t0; \
+ ll=_mpy2ll(DOC_C6S2,x2); \
+ t2=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C2S6,x2); \
+ t3=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+ ll=_mpy2ll(DOC_C7S1,x1); \
+ t4=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C5S3,x3); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ ll=_mpy2ll(DOC_C3S5,x3); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ ll=_mpy2ll(DOC_C1S7,x1); \
+ t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+ /*Stage 2:*/ \
+ ll=_addsub2(t4,t5); \
+ t4=_loll(ll); \
+ a=_hill(ll); \
+ ll=_mpy2ll(DOC_C4S4,a); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+ ll=_addsub2(t7,t6); \
+ t7=_hill(ll); \
+ a=_loll(ll); \
+ ll=_mpy2ll(DOC_C4S4,a); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+ /*Stage 3:*/ \
+ ll=_addsub2(t0,t3); \
+ t0=_hill(ll); \
+ t3=_loll(ll); \
+ ll=_addsub2(t1,t2); \
+ t1=_hill(ll); \
+ t2=_loll(ll); \
+ ll=_addsub2(t6,t5); \
+ t6=_hill(ll); \
+ t5=_loll(ll); \
+ } \
+ while(0)
+
+/*Transform x0...x1 into t0...t7, assuming x2...x7 are zero.*/
+#define OC_IDCT8x2_2() \
+ do{ \
+ long long ll; \
+ /*Stage 1:*/ \
+ ll=_mpy2ll(DOC_C4S4,x0); \
+ t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+ t1=t0; \
+ ll=_mpy2ll(DOC_C7S1,x1); \
+ t4=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C1S7,x1); \
+ t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+ /*Stage 2:*/ \
+ ll=_mpy2ll(DOC_C4S4,t4); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),t4); \
+ ll=_mpy2ll(DOC_C4S4,t7); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),t7); \
+ /*Stage 3:*/ \
+ t3=t0; \
+ t2=t1; \
+ ll=_addsub2(t6,t5); \
+ t6=_hill(ll); \
+ t5=_loll(ll); \
+ } \
+ while(0)
+
+/*Finish transforming t0...t7 and store two rows.*/
+#define OC_IDCT8x2_STORE(_y) \
+ do{ \
+ long long ll; \
+ int a; \
+ int b; \
+ int c; \
+ int d; \
+ /*Stage 4:*/ \
+ ll=_addsub2(t0,t7); \
+ a=_hill(ll); \
+ c=_loll(ll); \
+ ll=_addsub2(t1,t6); \
+ b=_hill(ll); \
+ d=_loll(ll); \
+ ll=_dpack2(b,a); \
+ _amem4((_y)+0)=_loll(ll); \
+ _amem4((_y)+8)=_hill(ll); \
+ ll=_dpack2(c,d); \
+ _amem4((_y)+6)=_loll(ll); \
+ _amem4((_y)+14)=_hill(ll); \
+ ll=_addsub2(t2,t5); \
+ a=_hill(ll); \
+ c=_loll(ll); \
+ ll=_addsub2(t3,t4); \
+ b=_hill(ll); \
+ d=_loll(ll); \
+ ll=_dpack2(b,a); \
+ _amem4((_y)+2)=_loll(ll); \
+ _amem4((_y)+10)=_hill(ll); \
+ ll=_dpack2(c,d); \
+ _amem4((_y)+4)=_loll(ll); \
+ _amem4((_y)+12)=_hill(ll); \
+ } \
+ while(0)
+
+/*Finish transforming t0...t7 and store two columns.*/
+#define OC_IDCT8x2_STORET(_y) \
+ do{ \
+ long long ll; \
+ /*Stage 4:*/ \
+ ll=_addsub2(t0,t7); \
+ _amem4((_y)+(0<<3))=_hill(ll); \
+ _amem4((_y)+(7<<3))=_loll(ll); \
+ ll=_addsub2(t1,t6); \
+ _amem4((_y)+(1<<3))=_hill(ll); \
+ _amem4((_y)+(6<<3))=_loll(ll); \
+ ll=_addsub2(t2,t5); \
+ _amem4((_y)+(2<<3))=_hill(ll); \
+ _amem4((_y)+(5<<3))=_loll(ll); \
+ ll=_addsub2(t3,t4); \
+ _amem4((_y)+(3<<3))=_hill(ll); \
+ _amem4((_y)+(4<<3))=_loll(ll); \
+ } \
+ while(0)
+
+/*Finish transforming t0...t7, round and scale, and store two columns.*/
+#define OC_IDCT8x2_ROUND_STORET(_y) \
+ do{ \
+ long long ll; \
+ /*Stage 4:*/ \
+ /*Adjust for the scale factor.*/ \
+ ll=_addsub2(t0,t7); \
+ _amem4((_y)+(0<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(7<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ ll=_addsub2(t1,t6); \
+ _amem4((_y)+(1<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(6<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ ll=_addsub2(t2,t5); \
+ _amem4((_y)+(2<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(5<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ ll=_addsub2(t3,t4); \
+ _amem4((_y)+(3<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(4<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ } \
+ while(0)
+
+/*179 cycles.*/
+static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64]){
+ ogg_int16_t w[64];
+ int x0;
+ int x1;
+ int x2;
+ int x3;
+ int x4;
+ int x5;
+ int x6;
+ int x7;
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int i;
+ /*Transform rows of x into columns of w.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD8(_y+i*8);
+ OC_IDCT8x2();
+ OC_IDCT8x2_STORET(w+i);
+ }
+ /*Transform rows of w into columns of y.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD8(w+i*8);
+ OC_IDCT8x2();
+ OC_IDCT8x2_ROUND_STORET(_y+i);
+ }
+}
+
+/*107 cycles.*/
+static void oc_idct8x8_10_c64x(ogg_int16_t _y[64]){
+ ogg_int16_t w[64];
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int x0;
+ int x1;
+ int x2;
+ int x3;
+ int i;
+ /*Transform rows of x into columns of w.*/
+ OC_IDCT8x2_LOAD4(_y);
+ OC_IDCT8x2_4();
+ OC_IDCT8x2_STORET(w);
+ OC_IDCT8x2_LOAD2(_y+16);
+ OC_IDCT8x2_2();
+ OC_IDCT8x2_STORET(w+2);
+ /*Transform rows of w into columns of y.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD4(w+i*8);
+ OC_IDCT8x2_4();
+ OC_IDCT8x2_ROUND_STORET(_y+i);
+ }
+}
+
+/*88 cycles.*/
+static void oc_idct8x8_3_c64x(ogg_int16_t _y[64]){
+ ogg_int16_t w[64];
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int x0;
+ int x1;
+ int i;
+ /*Transform rows of x into rows of w.*/
+ for(i=0;i<2;i+=2){
+ OC_IDCT8x2_LOAD2(_y+i*8);
+ OC_IDCT8x2_2();
+ OC_IDCT8x2_STORE(w+i*8);
+ }
+ /*Transform columns of w into columns of y.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD2T(w+i);
+ OC_IDCT8x2_2();
+ OC_IDCT8x2_ROUND_STORET(_y+i);
+ }
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+  orthonormal version of the transform.*/
+void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi){
+ if(_last_zzi<3)oc_idct8x8_3_c64x(_y);
+ else if(_last_zzi<10)oc_idct8x8_10_c64x(_y);
+ else oc_idct8x8_slow_c64x(_y);
+}
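
The DOC_* macros at the top duplicate each 16-bit cosine constant into both
halves of a 32-bit word so that _mpy2ll() can scale two coefficients per
instruction. Constants at or above 0x8000 wrap negative as signed 16-bit
values, which is why those products get a compensating _add2(...,x):
(c-65536)*x>>16 plus x equals c*x>>16. A scalar check for OC_C4S4=46341
(plain C; assumes arithmetic right shift of negatives, as the DSP provides):

  #include <assert.h>

  int main(void){
    int c;
    int x;
    c=46341;/*OC_C4S4; (short)c wraps to c-65536.*/
    for(x=-32768;x<32768;x++){
      int hi;
      hi=(short)c*x>>16;      /*What one lane of _mpy2ll() computes.*/
      assert(hi+x==c*x>>16);  /*The _add2() restores the unsigned product.*/
    }
    return 0;
  }
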
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,77 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_c64x_c64xint_H)
+# define _c64x_c64xint_H (1)
+
+# if defined(OC_C64X_ASM)
+# if !defined(oc_frag_copy)
+# define oc_frag_copy(_state,_dst,_src,_ystride) \
+ oc_frag_copy_c64x(_dst,_src,_ystride)
+# endif
+# if !defined(oc_frag_recon_intra)
+# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+ oc_frag_recon_intra_c64x(_dst,_dst_ystride,_residue)
+# endif
+# if !defined(oc_frag_recon_inter)
+# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+ oc_frag_recon_inter_c64x(_dst,_src,_ystride,_residue)
+# endif
+# if !defined(oc_frag_recon_inter2)
+# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+ oc_frag_recon_inter2_c64x(_dst,_src1,_src2,_ystride,_residue)
+# endif
+# if !defined(oc_idct8x8)
+# define oc_idct8x8(_state,_y,_last_zzi) \
+  oc_idct8x8_c64x(_y,_last_zzi)
+# endif
+# if !defined(oc_state_frag_recon)
+# define oc_state_frag_recon oc_state_frag_recon_c64x
+# endif
+# if !defined(oc_state_frag_copy_list)
+# define oc_state_frag_copy_list oc_state_frag_copy_list_c64x
+# endif
+# if !defined(oc_state_loop_filter_frag_rows)
+# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c64x
+# endif
+# if !defined(oc_restore_fpu)
+# define oc_restore_fpu(_state) do{}while(0)
+# endif
+# endif
+
+# include "../internal.h"
+
+void oc_state_vtable_init_c64x(oc_theora_state *_state);
+
+void oc_frag_copy_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "c64xint.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_state_vtable_init_c64x(oc_theora_state *_state){
+ _state->opt_vtable.frag_copy=oc_frag_copy_c64x;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c64x;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c64x;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c64x;
+ _state->opt_vtable.idct8x8=oc_idct8x8_c64x;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c64x;
+ _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c64x;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ oc_state_loop_filter_frag_rows_c64x;
+ _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
+}
+
+#endif
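
This vtable is the runtime half of the dispatch; the macros in c64xint.h are
the compile-time half, and a function bound there never goes through the
table at all. A minimal model of the runtime pattern (hypothetical types, not
the library's actual structures):

  #include <string.h>

  typedef void (*frag_copy_fn)(unsigned char *_dst,
   const unsigned char *_src,int _ystride);

  typedef struct{
    frag_copy_fn frag_copy;
  }opt_vtable_model;

  static void frag_copy_generic(unsigned char *_dst,
   const unsigned char *_src,int _ystride){
    int i;
    for(i=0;i<8;i++){
      memcpy(_dst,_src,8);
      _dst+=_ystride;
      _src+=_ystride;
    }
  }

  int main(void){
    opt_vtable_model vt;
    unsigned char src[64];
    unsigned char dst[64];
    memset(src,0,sizeof(src));
    /*An accelerated build installs its own pointers at init time.*/
    vt.frag_copy=frag_copy_generic;
    (*vt.frag_copy)(dst,src,8);
    return 0;
  }
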