[xiph-commits] r17336 - in experimental/derf/theora-ptalarbvorm/lib: . c64x
tterribe at svn.xiph.org
Sun Jul 18 20:43:43 PDT 2010
Author: tterribe
Date: 2010-07-18 20:43:42 -0700 (Sun, 18 Jul 2010)
New Revision: 17336
Added:
experimental/derf/theora-ptalarbvorm/lib/c64x/
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
Log:
Add new files that were supposed to be added with r17334.
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,154 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#include "c64xdec.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_dec_vtable_init_c64x(oc_dec_ctx *_dec){
+ _dec->opt_vtable.dc_unpredict_mcu_plane=oc_dec_dc_unpredict_mcu_plane_c64x;
+}
+
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+ rows).
+ As a side effect, the number of coded and uncoded fragments in this plane of
+ the MCU is also computed.*/
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+ const oc_fragment_plane *fplane;
+ oc_fragment *frags;
+ int *pred_last;
+ ptrdiff_t ncoded_fragis;
+ ptrdiff_t fragi;
+ int fragx;
+ int fragy;
+ int fragy0;
+ int fragy_end;
+ int nhfrags;
+ /*Compute the first and last fragment row of the current MCU for this
+ plane.*/
+ fplane=_dec->state.fplanes+_pli;
+ fragy0=_pipe->fragy0[_pli];
+ fragy_end=_pipe->fragy_end[_pli];
+ nhfrags=fplane->nhfrags;
+ pred_last=_pipe->pred_last[_pli];
+ frags=_dec->state.frags;
+ ncoded_fragis=0;
+ fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+ for(fragy=fragy0;fragy<fragy_end;fragy++){
+ if(fragy==0){
+ /*For the first row, all of the cases reduce to just using the previous
+ predictor for the same reference frame.*/
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ int coded;
+ int ref;
+ /*The TI compiler refuses to pipeline this if we put it in an if(coded)
+ block.
+ We can do the loads unconditionally, which helps move them earlier.
+          We do the store unconditionally too, because if we use a conditional
+ store, the compiler propagates the condition back to the operations
+ the store depended on, presumably to reduce cache pressure by
+ eliminating dead loads.
+ However, these loads are "free" in the cache sense, since reading the
+ coded flag brings in all four bytes anyway, and starting the loads
+ before we know the coded flag saves 6 cycles.*/
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ coded=frags[fragi].coded;
+ frags[fragi].dc=pred_last[ref]+=frags[fragi].dc&-coded;
+ ncoded_fragis+=coded;
+ }
+ }
+ else{
+ oc_fragment *u_frags;
+ int l_ref;
+ int ul_ref;
+ int u_ref;
+ u_frags=frags-nhfrags;
+ l_ref=-1;
+ ul_ref=-1;
+ u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ int ur_ref;
+ int ref;
+ if(fragx+1>=nhfrags)ur_ref=-1;
+ else{
+ ur_ref=u_frags[fragi+1].coded?
+ OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+ }
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ /*HACK: This p0 reference could potentially be out of bounds, but
+ because we know what allocator we're using, we know it can't
+ segfault.*/
+ if(frags[fragi].coded){
+ static const int OC_PRED_SCALE[16][2]={
+ {0x00000000,0x00000000},
+ {0x00000000,0x00000080},
+ {0x00800000,0x00000000},
+ {0x00000000,0x00000080},
+ {0x00000080,0x00000000},
+ {0x00000040,0x00000040},
+ {0x00000080,0x00000000},
+ {0xFF980074,0x00000074},
+ {0x00000000,0x00800000},
+ {0x00000000,0x0035004B},
+ {0x00400000,0x00400000},
+ {0x00000000,0x0035004B},
+ {0x00000080,0x00000000},
+ {0x00000000,0x0035004B},
+ {0x00180050,0x00180000},
+ {0xFF980074,0x00000074},
+ };
+ ogg_int16_t p0;
+ ogg_int16_t p1;
+ ogg_int16_t p2;
+ ogg_int16_t p3;
+ int pred;
+ int pflags;
+ /*29 cycles.*/
+ p0=u_frags[fragi-1].dc;
+ p1=u_frags[fragi].dc;
+ p2=u_frags[fragi+1].dc;
+ p3=frags[fragi-1].dc;
+ pflags=_cmpeq4(_packl4(_pack2(ur_ref,u_ref),_pack2(ul_ref,l_ref)),
+ _packl4(_pack2(ref,ref),_pack2(ref,ref)));
+ if(pflags==0)pred=pred_last[ref];
+ else{
+ pred=(_dotp2(_pack2(p0,p1),OC_PRED_SCALE[pflags][0])
+ +_dotp2(_pack2(p2,p3),OC_PRED_SCALE[pflags][1]))/128;
+ if((pflags&7)==7){
+ if(abs(pred-p1)>128)pred=p1;
+ else if(abs(pred-p3)>128)pred=p3;
+ else if(abs(pred-p0)>128)pred=p0;
+ }
+ }
+ pred_last[ref]=frags[fragi].dc+=pred;
+ ncoded_fragis++;
+ l_ref=ref;
+ }
+ else l_ref=-1;
+ ul_ref=u_ref;
+ u_ref=ur_ref;
+ }
+ }
+ }
+ _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+ /*Also save the number of uncoded fragments so we know how many to copy.*/
+ _pipe->nuncoded_fragis[_pli]=
+ (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+#endif
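
A note on two idioms above, since they recur throughout this port: in the
first-row loop, -coded is all ones exactly when the fragment is coded, so
dc&-coded adds the residual without a branch; and the _pack2/_packl4/_cmpeq4
sequence builds a 4-bit mask of which neighbors use the current reference
frame, which then indexes OC_PRED_SCALE. A scalar model of both in plain C
(illustrative names; not part of this commit):

  #include <assert.h>

  /*Scalar model of pflags: bit0=left, bit1=up-left, bit2=up, bit3=up-right,
    set when that neighbor uses the same reference frame.*/
  static int pred_flags(int l_ref,int ul_ref,int u_ref,int ur_ref,int ref){
    return (l_ref==ref)|(ul_ref==ref)<<1|(u_ref==ref)<<2|(ur_ref==ref)<<3;
  }

  int main(void){
    int pred_last;
    int coded;
    pred_last=7;
    /*Coded fragment: the predictor advances by the residual DC (here 3).*/
    coded=1;
    pred_last+=3&-coded;
    assert(pred_last==10);
    /*Uncoded fragment: 3&-coded is 0 and the predictor is unchanged.*/
    coded=0;
    pred_last+=3&-coded;
    assert(pred_last==10);
    /*Left and up neighbors share our reference frame: bits 0 and 2.*/
    assert(pred_flags(1,-1,1,-1,1)==0x5);
    return 0;
  }
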
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.h 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,34 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_c64x_c64xdec_H)
+# define _c64x_c64xdec_H (1)
+# include "c64xint.h"
+
+# if defined(OC_C64X_ASM)
+# if !defined(oc_dec_dc_unpredict_mcu_plane)
+# define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c64x
+# endif
+# endif
+
+# include "../decint.h"
+
+void oc_dec_vtable_init_c64x(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
+#endif
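
The !defined() guard above is what lets a build stack ports: whichever header
binds oc_dec_dc_unpredict_mcu_plane first wins, and c64xdec.h steps aside. A
toy illustration of the guard pattern (hypothetical names, unrelated to the
real headers):

  #include <stdio.h>

  #define do_thing do_thing_c64x      /*A port claims the symbol first.*/
  #if !defined(do_thing)
  # define do_thing do_thing_generic  /*Skipped: do_thing is already bound.*/
  #endif

  static void do_thing_c64x(void){puts("c64x");}

  int main(void){
    do_thing();
    return 0;
  }
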
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,451 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+
+
+
+/*14 cycles.*/
+void oc_frag_copy_c64x(unsigned char *restrict _dst,
+ const unsigned char *restrict _src,int _ystride){
+ unsigned char *restrict d2;
+ const unsigned char *restrict s2;
+ d2=_dst+_ystride;
+ s2=_src+_ystride;
+#define OC_ITER() \
+ do{ \
+ _amem8(_dst)=_mem8(_src); \
+ _dst+=2*_ystride; \
+ _src+=2*_ystride; \
+ _amem8(d2)=_mem8(s2); \
+ d2+=2*_ystride; \
+ s2+=2*_ystride; \
+ } \
+ while(0)
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+#undef OC_ITER
+}
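
_mem8() and _amem8() are the TI intrinsics for unaligned and aligned 8-byte
accesses, so each OC_ITER() above moves two full rows of the 8x8 fragment. A
portable approximation of the same routine, using memcpy for the 8-byte moves
(a sketch, not the library's generic C version):

  #include <string.h>

  static void frag_copy_portable(unsigned char *dst,
   const unsigned char *src,int ystride){
    int i;
    /*Copy one 8x8 fragment, an 8-byte row at a time.*/
    for(i=0;i<8;i++){
      memcpy(dst,src,8);
      dst+=ystride;
      src+=ystride;
    }
  }

  int main(void){
    unsigned char src[64];
    unsigned char dst[64];
    memset(src,1,sizeof(src));
    frag_copy_portable(dst,src,8);
    return 0;
  }
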
+
+/*34 cycles.*/
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t _residue[64]){
+ int i;
+ for(i=0;i<8;i++){
+ long long ll;
+ int x1;
+ int y1;
+ int x2;
+ int y2;
+ ll=_amem8_const(_residue+i*8+0);
+ x1=_sadd2(_loll(ll),0x00800080);
+ y1=_sadd2(_hill(ll),0x00800080);
+ ll=_amem8_const(_residue+i*8+4);
+ x2=_sadd2(_loll(ll),0x00800080);
+ y2=_sadd2(_hill(ll),0x00800080);
+ _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+ _dst+=_ystride;
+ }
+}
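
Per pixel, intra reconstruction is just residue+128 clipped to [0,255]:
_sadd2() applies the bias to two 16-bit samples at once (0x00800080 is two
packed 128s) and _spacku4() supplies the clipping when it packs to bytes. The
scalar equivalent (a sketch, not library code):

  #include <assert.h>

  static unsigned char recon_intra_pixel(int res){
    int v;
    v=res+128;
    /*Clamp to the pixel range, as _spacku4() does while packing.*/
    return (unsigned char)(v<0?0:v>255?255:v);
  }

  int main(void){
    assert(recon_intra_pixel(-500)==0);
    assert(recon_intra_pixel(0)==128);
    assert(recon_intra_pixel(500)==255);
    return 0;
  }
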
+
+/*41 cycles.*/
+void oc_frag_recon_inter_c64x(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t _residue[64]){
+ int i;
+ for(i=0;i<8;i++){
+ long long ll;
+ int x1;
+ int y1;
+ int z1;
+ int x2;
+ int y2;
+ int z2;
+ ll=_mem8_const(_src);
+ z1=_loll(ll);
+ z2=_hill(ll);
+ ll=_amem8_const(_residue+i*8+0);
+ x1=_sadd2(_unpklu4(z1),_loll(ll));
+ y1=_sadd2(_unpkhu4(z1),_hill(ll));
+ ll=_amem8_const(_residue+i*8+4);
+ x2=_sadd2(_unpklu4(z2),_loll(ll));
+ y2=_sadd2(_unpkhu4(z2),_hill(ll));
+ _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+ _dst+=_ystride;
+ _src+=_ystride;
+ }
+}
+
+/*56 cycles.*/
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t _residue[64]){
+ int i;
+ for(i=0;i<8;i++){
+ long long ll;
+ int a;
+ int b;
+ int c;
+ int d;
+ int x1;
+ int y1;
+ int z1;
+ int x2;
+ int y2;
+ int z2;
+ ll=_mem8_const(_src1);
+ a=_loll(ll);
+ b=_hill(ll);
+ ll=_mem8_const(_src2);
+ c=_loll(ll);
+ d=_hill(ll);
+ ll=_amem8_const(_residue+i*8+0);
+ z1=~_avgu4(~a,~c);
+ x1=_sadd2(_unpklu4(z1),_loll(ll));
+ y1=_sadd2(_unpkhu4(z1),_hill(ll));
+ ll=_amem8_const(_residue+i*8+4);
+ z2=~_avgu4(~b,~d);
+ x2=_sadd2(_unpklu4(z2),_loll(ll));
+ y2=_sadd2(_unpkhu4(z2),_hill(ll));
+ _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+ _dst+=_ystride;
+ _src1+=_ystride;
+ _src2+=_ystride;
+ }
+}
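
The z1=~_avgu4(~a,~c) idiom is worth unpacking: _avgu4() rounds its per-byte
average up, but the two-reference average must round down, and complementing
both inputs and the output flips the bias. A brute-force check of that
identity on one byte lane (plain C model of the intrinsic):

  #include <assert.h>

  /*One byte lane of _avgu4(): average rounded up.*/
  static unsigned avgu(unsigned a,unsigned b){
    return a+b+1>>1;
  }

  int main(void){
    unsigned a;
    unsigned b;
    for(a=0;a<256;a++){
      for(b=0;b<256;b++){
        /*~avgu(~a,~b) is the average rounded down.*/
        assert((~avgu(~a&0xFF,~b&0xFF)&0xFF)==a+b>>1);
      }
    }
    return 0;
  }
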
+
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ unsigned char *dst;
+ ptrdiff_t frag_buf_off;
+ int ystride;
+ int mb_mode;
+ /*Apply the inverse transform.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ ogg_int16_t p;
+ int ci;
+ /*We round this dequant product (and not any of the others) because there's
+ no iDCT rounding.*/
+ p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+ /*LOOP VECTORIZES.*/
+ for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+ }
+ else{
+ /*First, dequantize the DC coefficient.*/
+ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+ oc_idct8x8_c64x(_dct_coeffs,_last_zzi);
+ }
+ /*Fill in the target buffer.*/
+ frag_buf_off=_state->frag_buf_offs[_fragi];
+ mb_mode=_state->frags[_fragi].mb_mode;
+ ystride=_state->ref_ystride[_pli];
+ dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+ if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs);
+ else{
+ const unsigned char *ref;
+ int mvoffsets[2];
+ ref=
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+ +frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+ _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+ oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
+ ystride,_dct_coeffs);
+ }
+ else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ }
+}
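
For a DC-only block (_last_zzi<2) the whole iDCT collapses to a single value,
so the 1/32 output scaling and its rounding are folded straight into the
dequantization above. A quick numeric check of that expression (plain C):

  #include <assert.h>

  int main(void){
    int dc;
    int q;
    dc=9;   /*Quantized DC coefficient.*/
    q=100;  /*DC quantizer.*/
    /*Dequantize and scale by 1/32, rounding once, as in the DC-only path.*/
    assert((dc*q+15>>5)==28);
    return 0;
  }
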
+
+void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+ const ptrdiff_t *frag_buf_offs;
+ const unsigned char *src_frame_data;
+ unsigned char *dst_frame_data;
+ ptrdiff_t fragii;
+ int ystride;
+ dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
+ src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
+ ystride=_state->ref_ystride[_pli];
+ frag_buf_offs=_state->frag_buf_offs;
+ /*9 cycles per iteration.*/
+ for(fragii=0;fragii<_nfragis;fragii++){
+ const unsigned char *restrict src;
+ const unsigned char *restrict s2;
+ unsigned char *restrict dst;
+ unsigned char *restrict d2;
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=frag_buf_offs[_fragis[fragii]];
+ dst=dst_frame_data+frag_buf_off;
+ src=src_frame_data+frag_buf_off;
+ d2=dst+ystride;
+ s2=src+ystride;
+#define OC_ITER() \
+ do{ \
+ _amem8(dst)=_amem8_const(src); \
+ dst+=2*ystride; \
+ src+=2*ystride; \
+ _amem8(d2)=_amem8_const(s2); \
+ d2+=2*ystride; \
+ s2+=2*ystride; \
+ } \
+ while(0)
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+#undef OC_ITER
+ }
+}
+
+/*46 cycles.*/
+static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
+ int p0;
+ int p1;
+ int p2;
+ int p3;
+ int p4;
+ int p5;
+ int p6;
+ int p7;
+ int y;
+ _pix-=2;
+ /*Do all the loads now to avoid the compiler's inability to prove they're not
+ dependent on the stores later.*/
+ p0=_mem4(_pix+_ystride*0);
+ p1=_mem4(_pix+_ystride*1);
+ p2=_mem4(_pix+_ystride*2);
+ p3=_mem4(_pix+_ystride*3);
+ p4=_mem4(_pix+_ystride*4);
+ p5=_mem4(_pix+_ystride*5);
+ p6=_mem4(_pix+_ystride*6);
+ p7=_mem4(_pix+_ystride*7);
+ for(y=0;y<8;y+=4){
+ int f;
+ int a;
+ int b;
+ int u;
+ int v;
+ /*We could pack things right after the dot product, but delaying it
+ actually saves three cycles due to better instruction scheduling.*/
+ a=_dotpsu4(0x01FD03FF,p0)+3>>3;
+ b=_dotpsu4(0x01FD03FF,p1)+3>>3;
+ u=_dotpsu4(0x01FD03FF,p2)+3>>3;
+ v=_dotpsu4(0x01FD03FF,p3)+3>>3;
+ f=_packl4(_pack2(v,u),_pack2(b,a));
+ /*We split the results by sign and work with abs(f) here, since the C64x
+ signed-unsigned addition with unsigned saturation is only available for
+ 16-bit operands.
+ For 8-bit operands, we have to emulate it with a saturated addition and a
+ saturated subtraction using separate unsigned values.
+ There's no direct support for 8-bit saturated subtraction, either, so we
+ have to emulate that as well, using either x-_minu4(x,y) or
+ ~_saddu4(~x,y), depending on which one schedules better.*/
+ f=_add4(0x80808080,f);
+ b=_minu4(0x80808080,f);
+ a=0x80808080-b;
+ b=f-b;
+ /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+ u=_saddu4(a,_ll);
+ v=_saddu4(b,_ll);
+ a=_saddu4(a,u);
+ b=_saddu4(b,v);
+ a=a-_minu4(a,u);
+ b=b-_minu4(b,v);
+ /*Apply the changes to the original pixels.*/
+ u=_pack2(p1>>8,p0>>8);
+ v=_pack2(p3>>8,p2>>8);
+ p1=_packl4(v,u);
+ p2=_packh4(v,u);
+ p1=_saddu4(~_saddu4(~p1,b),a);
+ p2=_saddu4(p2-_minu4(p2,a),b);
+ /*For unaligned short stores, we have to store byte by byte.
+ It's faster to do it explicitly than to use _mem2().*/
+ _pix[_ystride*0+1]=(unsigned char)p1;
+ _pix[_ystride*0+2]=(unsigned char)p2;
+ _pix[_ystride*1+1]=(unsigned char)(p1>>8);
+ _pix[_ystride*1+2]=(unsigned char)(p2>>8);
+ _pix[_ystride*2+1]=(unsigned char)(p1>>16);
+ _pix[_ystride*2+2]=(unsigned char)(p2>>16);
+ _pix[_ystride*3+1]=(unsigned char)(p1>>24);
+ _pix[_ystride*3+2]=(unsigned char)(p2>>24);
+ p0=p4;
+ p1=p5;
+ p2=p6;
+ p3=p7;
+ _pix+=4*_ystride;
+ }
+}
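
The constant 0x01FD03FF packs the filter taps as signed bytes (+1, -3, +3, -1
from the most significant lane down), so each _dotpsu4() above yields the
negated filter response for one row. A scalar reference for a single edge
position, following the comments above (a sketch assuming arithmetic right
shifts, not the project's C fallback):

  #include <stdio.h>
  #include <stdlib.h>

  static int clampi(int lo,int x,int hi){
    return x<lo?lo:x>hi?hi:x;
  }

  /*Filter the edge between p[1] and p[2]; ll is twice the filter limit L.*/
  static void loop_filter_scalar(unsigned char p[4],int ll){
    int f;
    int m;
    /*Assumes arithmetic right shift of negative values.*/
    f=p[0]-p[3]+3*(p[2]-p[1])+4>>3;
    /*Limit the magnitude to clamp(0,2*L-|f|,|f|), keeping the sign.*/
    m=clampi(0,ll-abs(f),abs(f));
    f=f<0?-m:m;
    p[1]=(unsigned char)clampi(0,p[1]+f,255);
    p[2]=(unsigned char)clampi(0,p[2]-f,255);
  }

  int main(void){
    unsigned char p[4]={60,40,80,70};
    loop_filter_scalar(p,2*10);
    printf("%d %d\n",p[1],p[2]);/*Prints 46 74.*/
    return 0;
  }
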
+
+/*38 cycles.*/
+static void loop_filter_v(unsigned char * restrict _pix,int _ystride,int _ll){
+ long long ll;
+ int p0;
+ int p1;
+ int p2;
+ int p3;
+ int p4;
+ int p5;
+ int p6;
+ int p7;
+ int a1;
+ int b1;
+ int f1;
+ int m1;
+ int u1;
+ int v1;
+ int a2;
+ int b2;
+ int f2;
+ int m2;
+ int u2;
+ int v2;
+ /*Do all the loads now to avoid the compiler's inability to prove they're not
+ dependent on the stores later.*/
+ ll=_amem8(_pix-_ystride*2);
+ p0=_loll(ll);
+ p4=_hill(ll);
+ ll=_amem8(_pix-_ystride*1);
+ p1=_loll(ll);
+ p5=_hill(ll);
+ ll=_amem8(_pix+_ystride*0);
+ p2=_loll(ll);
+ p6=_hill(ll);
+ ll=_amem8(_pix+_ystride*1);
+ p3=_loll(ll);
+ p7=_hill(ll);
+ /*I can't find a way to put the rest in a loop that the compiler thinks is
+ unrollable, so instead it's unrolled manually.*/
+ /*This first part is based on the transformation
+ f = -(3*(p2-p1)+p0-p3+4>>3)
+ = -(3*(p2+255-p1)+(p0+255-p3)+4-1020>>3)
+ = -(3*(p2+~p1)+(p0+~p3)-1016>>3)
+ = 127-(3*(p2+~p1)+(p0+~p3)>>3)
+ = 128+~(3*(p2+~p1)+(p0+~p3)>>3) (mod 256).
+ Although _avgu4(a,b) = (a+b+1>>1) (biased up), we rely heavily on the
+ fact that ~_avgu4(~a,~b) = (a+b>>1) (biased down).*/
+ /*We need this first average both biased up and biased down.*/
+ u1=~_avgu4(~p1,p2);
+ v1=_avgu4(p1,~p2);
+ /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
+ m1=_sub4(u1,v1);
+ a1=m1^_avgu4(m1^~p0,m1^p3);
+ f1=_avgu4(_avgu4(a1,u1),v1);
+ /*Instead of removing the bias by 128, we use it to split f by sign, since
+ the C64x signed-unsigned addition with unsigned saturation is only
+ available for 16-bit operands.
+ For 8-bit operands, we have to emulate it with a saturated addition and a
+ saturated subtraction using separate unsigned values.
+ There's no direct support for 8-bit saturated subtraction, either, so we
+ have to emulate that as well, using either x-_minu4(x,y) or
+ ~_saddu4(~x,y), depending on which one schedules better.*/
+ b1=_minu4(0x80808080,f1);
+ a1=0x80808080-b1;
+ b1=f1-b1;
+ /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+ u1=_saddu4(a1,_ll);
+ v1=_saddu4(b1,_ll);
+ a1=_saddu4(a1,u1);
+ b1=_saddu4(b1,v1);
+ a1=a1-_minu4(a1,u1);
+ b1=b1-_minu4(b1,v1);
+ /*Apply the changes to the original pixels.*/
+ p1=_saddu4(p1-_minu4(p1,b1),a1);
+ p2=_saddu4(p2-_minu4(p2,a1),b1);
+ /*We need this first average both biased up and biased down.*/
+ u2=~_avgu4(~p5,p6);
+ v2=_avgu4(p5,~p6);
+  /*The difference controls whether (p7+255-p4>>1) is biased up or down.*/
+ m2=_sub4(u2,v2);
+ a2=m2^_avgu4(m2^~p4,m2^p7);
+ f2=_avgu4(_avgu4(a2,u2),v2);
+ /*Instead of removing the bias by 128, we use it to split f by sign.*/
+ b2=_minu4(0x80808080,f2);
+ a2=0x80808080-b2;
+ b2=f2-b2;
+ /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+ u2=_saddu4(a2,_ll);
+ v2=_saddu4(b2,_ll);
+ a2=_saddu4(a2,u2);
+ b2=_saddu4(b2,v2);
+ a2=a2-_minu4(a2,u2);
+ b2=b2-_minu4(b2,v2);
+ /*Apply the changes to the original pixels.*/
+ p5=_saddu4(p5-_minu4(p5,b2),a2);
+ p6=_saddu4(p6-_minu4(p6,a2),b2);
+ /*Write out the results.*/
+ _amem8(_pix-_ystride)=_itoll(p5,p1);
+ _amem8(_pix)=_itoll(p6,p2);
+}
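
Both filters realize the clamp in three saturating byte operations: with
u=saddu(x,255-2*L), the quantity saddu(x,u)-minu(saddu(x,u),u) equals
clamp(0,2*L-x,x), which is also why _ll is stored bit-inverted. A brute-force
verification on one byte lane (plain C models of _saddu4/_minu4):

  #include <assert.h>

  static unsigned saddu(unsigned a,unsigned b){
    return a+b>255?255:a+b;
  }

  static unsigned minu(unsigned a,unsigned b){
    return a<b?a:b;
  }

  int main(void){
    int l2;
    int x;
    /*l2 is 2*L; x is abs(f).*/
    for(l2=0;l2<256;l2++){
      for(x=0;x<256;x++){
        unsigned u;
        unsigned a;
        int expect;
        u=saddu(x,255-l2);
        a=saddu(x,u);
        a-=minu(a,u);
        expect=l2-x<0?0:l2-x>x?x:l2-x;
        assert((int)a==expect);
      }
    }
    return 0;
  }
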
+
+
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+ const oc_fragment_plane *fplane;
+ const oc_fragment *frags;
+ const ptrdiff_t *frag_buf_offs;
+ unsigned char *ref_frame_data;
+ ptrdiff_t fragi_top;
+ ptrdiff_t fragi_bot;
+ ptrdiff_t fragi0;
+ ptrdiff_t fragi0_end;
+ int ystride;
+ int nhfrags;
+ int ll;
+ fplane=_state->fplanes+_pli;
+ nhfrags=fplane->nhfrags;
+ fragi_top=fplane->froffset;
+ fragi_bot=fragi_top+fplane->nfrags;
+ fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ ystride=_state->ref_ystride[_pli];
+ frags=_state->frags;
+ frag_buf_offs=_state->frag_buf_offs;
+ ref_frame_data=_state->ref_frame_data[_refi];
+ ll=_state->loop_filter_limits[_state->qis[0]]<<1;
+ ll=_pack2(ll,ll);
+ ll=~_spacku4(ll,ll);
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ while(fragi0<fragi0_end){
+ ptrdiff_t fragi;
+ ptrdiff_t fragi_end;
+ fragi=fragi0;
+ fragi_end=fragi+nhfrags;
+ while(fragi<fragi_end){
+ if(frags[fragi].coded){
+ unsigned char *ref;
+ ref=ref_frame_data+frag_buf_offs[fragi];
+ if(fragi>fragi0)loop_filter_h(ref,ystride,ll);
+ if(fragi0>fragi_top)loop_filter_v(ref,ystride,ll);
+ if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+ loop_filter_h(ref+8,ystride,ll);
+ }
+ if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+ loop_filter_v(ref+(ystride<<3),ystride,ll);
+ }
+ }
+ fragi++;
+ }
+ fragi0+=nhfrags;
+ }
+}
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,399 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+#include "dct.h"
+
+#define DOC_C1S7 ((OC_C1S7<<16)|(OC_C1S7&0xffff))
+#define DOC_C2S6 ((OC_C2S6<<16)|(OC_C2S6&0xffff))
+#define DOC_C3S5 ((OC_C3S5<<16)|(OC_C3S5&0xffff))
+#define DOC_C4S4 ((OC_C4S4<<16)|(OC_C4S4&0xffff))
+#define DOC_C5S3 ((OC_C5S3<<16)|(OC_C5S3&0xffff))
+#define DOC_C6S2 ((OC_C6S2<<16)|(OC_C6S2&0xffff))
+#define DOC_C7S1 ((OC_C7S1<<16)|(OC_C7S1&0xffff))
+
+/*Various building blocks for the iDCT implementations.
+ These are done in macros instead of functions so that we can use all local
+ variables, which avoids leaving the compiler to try to sort out memory
+ reference dependencies.*/
+
+/*Load two rows into x0...x7.*/
+#define OC_IDCT8x2_LOAD8(_x) \
+ do{ \
+ long long ll; \
+ ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+ x0=_loll(ll); \
+ x1=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+ x2=_loll(ll); \
+ x3=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
+ x4=_loll(ll); \
+ x5=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
+ x6=_loll(ll); \
+ x7=_hill(ll); \
+ } \
+ while(0)
+
+/*Load two rows into x0...x3.
+ Uses ll as a temporary.*/
+#define OC_IDCT8x2_LOAD4(_x) \
+ do{ \
+ long long ll; \
+ ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+ x0=_loll(ll); \
+ x1=_hill(ll); \
+ ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+ x2=_loll(ll); \
+ x3=_hill(ll); \
+ } \
+ while(0)
+
+/*Load two rows into x0...x1.*/
+#define OC_IDCT8x2_LOAD2(_x) \
+ do{ \
+ long long ll; \
+ ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+ x0=_loll(ll); \
+ x1=_hill(ll); \
+ } \
+ while(0)
+
+/*Load two columns into x0...x1.*/
+#define OC_IDCT8x2_LOAD2T(_x) \
+ do{ \
+ x0=_amem4_const((_x)+(0<<3)); \
+ x1=_amem4_const((_x)+(1<<3)); \
+ } \
+ while(0)
+
+/*Transform x0...x7 into t0...t7.*/
+#define OC_IDCT8x2() \
+ do{ \
+ long long ll; \
+ int a; \
+ int b; \
+ /*Stage 1:*/ \
+ ll=_addsub2(x0,x4); \
+ a=_hill(ll); \
+ b=_loll(ll); \
+ t0=_packh2(_mpyhus(DOC_C4S4,a),_mpyus(DOC_C4S4,a)); \
+ t1=_packh2(_mpyhus(DOC_C4S4,b),_mpyus(DOC_C4S4,b)); \
+ ll=_mpy2ll(DOC_C6S2,x2); \
+ a=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C2S6,x6); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x6); \
+ t2=_sub2(a,b); \
+ ll=_mpy2ll(DOC_C2S6,x2); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+ ll=_mpy2ll(DOC_C6S2,x6); \
+ b=_packh2(_hill(ll),_loll(ll)); \
+ t3=_add2(a,b); \
+ ll=_mpy2ll(DOC_C7S1,x1); \
+ a=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C1S7,x7); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x7); \
+ t4=_sub2(a,b); \
+ ll=_mpy2ll(DOC_C3S5,x5); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+ ll=_mpy2ll(DOC_C5S3,x3); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ t5=_sub2(a,b); \
+ ll=_mpy2ll(DOC_C5S3,x5); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+ ll=_mpy2ll(DOC_C3S5,x3); \
+ b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ t6=_add2(a,b); \
+ ll=_mpy2ll(DOC_C1S7,x1); \
+ a=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+ ll=_mpy2ll(DOC_C7S1,x7); \
+ b=_packh2(_hill(ll),_loll(ll)); \
+ t7=_add2(a,b); \
+ /*Stage 2:*/ \
+ ll=_addsub2(t4,t5); \
+ t4=_hill(ll); \
+ b=_loll(ll); \
+ ll=_mpy2ll(DOC_C4S4,b); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+ ll=_addsub2(t7,t6); \
+ t7=_hill(ll); \
+ b=_loll(ll); \
+ ll=_mpy2ll(DOC_C4S4,b); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+ /*Stage 3:*/ \
+ ll=_addsub2(t0,t3); \
+ t0=_hill(ll); \
+ t3=_loll(ll); \
+ ll=_addsub2(t1,t2); \
+ t1=_hill(ll); \
+ t2=_loll(ll); \
+ ll=_addsub2(t6,t5); \
+ t6=_hill(ll); \
+ t5=_loll(ll); \
+ } \
+ while(0)
+
+/*Transform x0...x3 into t0...t7, assuming x4...x7 are zero.*/
+#define OC_IDCT8x2_4() \
+ do{ \
+ long long ll; \
+ int a; \
+ /*Stage 1:*/ \
+ ll=_mpy2ll(DOC_C4S4,x0); \
+ t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+ t1=t0; \
+ ll=_mpy2ll(DOC_C6S2,x2); \
+ t2=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C2S6,x2); \
+ t3=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+ ll=_mpy2ll(DOC_C7S1,x1); \
+ t4=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C5S3,x3); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ ll=_mpy2ll(DOC_C3S5,x3); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+ ll=_mpy2ll(DOC_C1S7,x1); \
+ t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+ /*Stage 2:*/ \
+ ll=_addsub2(t4,t5); \
+ t4=_loll(ll); \
+ a=_hill(ll); \
+ ll=_mpy2ll(DOC_C4S4,a); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+ ll=_addsub2(t7,t6); \
+ t7=_hill(ll); \
+ a=_loll(ll); \
+ ll=_mpy2ll(DOC_C4S4,a); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+ /*Stage 3:*/ \
+ ll=_addsub2(t0,t3); \
+ t0=_hill(ll); \
+ t3=_loll(ll); \
+ ll=_addsub2(t1,t2); \
+ t1=_hill(ll); \
+ t2=_loll(ll); \
+ ll=_addsub2(t6,t5); \
+ t6=_hill(ll); \
+ t5=_loll(ll); \
+ } \
+ while(0)
+
+/*Transform x0...x1 into t0...t7, assuming x2...x7 are zero.*/
+#define OC_IDCT8x2_2() \
+ do{ \
+ long long ll; \
+ /*Stage 1:*/ \
+ ll=_mpy2ll(DOC_C4S4,x0); \
+ t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+ t1=t0; \
+ ll=_mpy2ll(DOC_C7S1,x1); \
+ t4=_packh2(_hill(ll),_loll(ll)); \
+ ll=_mpy2ll(DOC_C1S7,x1); \
+ t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+ /*Stage 2:*/ \
+ ll=_mpy2ll(DOC_C4S4,t4); \
+ t5=_add2(_packh2(_hill(ll),_loll(ll)),t4); \
+ ll=_mpy2ll(DOC_C4S4,t7); \
+ t6=_add2(_packh2(_hill(ll),_loll(ll)),t7); \
+ /*Stage 3:*/ \
+ t3=t0; \
+ t2=t1; \
+ ll=_addsub2(t6,t5); \
+ t6=_hill(ll); \
+ t5=_loll(ll); \
+ } \
+ while(0)
+
+/*Finish transforming t0...t7 and store two rows.*/
+#define OC_IDCT8x2_STORE(_y) \
+ do{ \
+ long long ll; \
+ int a; \
+ int b; \
+ int c; \
+ int d; \
+ /*Stage 4:*/ \
+ ll=_addsub2(t0,t7); \
+ a=_hill(ll); \
+ c=_loll(ll); \
+ ll=_addsub2(t1,t6); \
+ b=_hill(ll); \
+ d=_loll(ll); \
+ ll=_dpack2(b,a); \
+ _amem4((_y)+0)=_loll(ll); \
+ _amem4((_y)+8)=_hill(ll); \
+ ll=_dpack2(c,d); \
+ _amem4((_y)+6)=_loll(ll); \
+ _amem4((_y)+14)=_hill(ll); \
+ ll=_addsub2(t2,t5); \
+ a=_hill(ll); \
+ c=_loll(ll); \
+ ll=_addsub2(t3,t4); \
+ b=_hill(ll); \
+ d=_loll(ll); \
+ ll=_dpack2(b,a); \
+ _amem4((_y)+2)=_loll(ll); \
+ _amem4((_y)+10)=_hill(ll); \
+ ll=_dpack2(c,d); \
+ _amem4((_y)+4)=_loll(ll); \
+ _amem4((_y)+12)=_hill(ll); \
+ } \
+ while(0)
+
+/*Finish transforming t0...t7 and store two columns.*/
+#define OC_IDCT8x2_STORET(_y) \
+ do{ \
+ long long ll; \
+ /*Stage 4:*/ \
+ ll=_addsub2(t0,t7); \
+ _amem4((_y)+(0<<3))=_hill(ll); \
+ _amem4((_y)+(7<<3))=_loll(ll); \
+ ll=_addsub2(t1,t6); \
+ _amem4((_y)+(1<<3))=_hill(ll); \
+ _amem4((_y)+(6<<3))=_loll(ll); \
+ ll=_addsub2(t2,t5); \
+ _amem4((_y)+(2<<3))=_hill(ll); \
+ _amem4((_y)+(5<<3))=_loll(ll); \
+ ll=_addsub2(t3,t4); \
+ _amem4((_y)+(3<<3))=_hill(ll); \
+ _amem4((_y)+(4<<3))=_loll(ll); \
+ } \
+ while(0)
+
+/*Finish transforming t0...t7, round and scale, and store two columns.*/
+#define OC_IDCT8x2_ROUND_STORET(_y) \
+ do{ \
+ long long ll; \
+ /*Stage 4:*/ \
+ /*Adjust for the scale factor.*/ \
+ ll=_addsub2(t0,t7); \
+ _amem4((_y)+(0<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(7<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ ll=_addsub2(t1,t6); \
+ _amem4((_y)+(1<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(6<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ ll=_addsub2(t2,t5); \
+ _amem4((_y)+(2<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(5<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ ll=_addsub2(t3,t4); \
+ _amem4((_y)+(3<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+ _amem4((_y)+(4<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+ } \
+ while(0)
+
+/*179 cycles.*/
+static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64]){
+ ogg_int16_t w[64];
+ int x0;
+ int x1;
+ int x2;
+ int x3;
+ int x4;
+ int x5;
+ int x6;
+ int x7;
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int i;
+ /*Transform rows of x into columns of w.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD8(_y+i*8);
+ OC_IDCT8x2();
+ OC_IDCT8x2_STORET(w+i);
+ }
+ /*Transform rows of w into columns of y.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD8(w+i*8);
+ OC_IDCT8x2();
+ OC_IDCT8x2_ROUND_STORET(_y+i);
+ }
+}
+
+/*107 cycles.*/
+static void oc_idct8x8_10_c64x(ogg_int16_t _y[64]){
+ ogg_int16_t w[64];
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int x0;
+ int x1;
+ int x2;
+ int x3;
+ int i;
+ /*Transform rows of x into columns of w.*/
+ OC_IDCT8x2_LOAD4(_y);
+ OC_IDCT8x2_4();
+ OC_IDCT8x2_STORET(w);
+ OC_IDCT8x2_LOAD2(_y+16);
+ OC_IDCT8x2_2();
+ OC_IDCT8x2_STORET(w+2);
+ /*Transform rows of w into columns of y.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD4(w+i*8);
+ OC_IDCT8x2_4();
+ OC_IDCT8x2_ROUND_STORET(_y+i);
+ }
+}
+
+/*88 cycles.*/
+static void oc_idct8x8_3_c64x(ogg_int16_t _y[64]){
+ ogg_int16_t w[64];
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int x0;
+ int x1;
+ int i;
+ /*Transform rows of x into rows of w.*/
+ for(i=0;i<2;i+=2){
+ OC_IDCT8x2_LOAD2(_y+i*8);
+ OC_IDCT8x2_2();
+ OC_IDCT8x2_STORE(w+i*8);
+ }
+ /*Transform columns of w into columns of y.*/
+ for(i=0;i<8;i+=2){
+ OC_IDCT8x2_LOAD2T(w+i);
+ OC_IDCT8x2_2();
+ OC_IDCT8x2_ROUND_STORET(_y+i);
+ }
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+  orthonormal version of the transform.*/
+void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi){
+ if(_last_zzi<3)oc_idct8x8_3_c64x(_y);
+ else if(_last_zzi<10)oc_idct8x8_10_c64x(_y);
+ else oc_idct8x8_slow_c64x(_y);
+}
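
The DOC_* macros at the top duplicate each 16-bit cosine constant into both
halves of a 32-bit word so that _mpy2ll() can scale two coefficients per
instruction. Constants at or above 0x8000 wrap negative as signed 16-bit
values, which is why those products get a compensating _add2(...,x):
(c-65536)*x>>16 plus x equals c*x>>16. A scalar check for OC_C4S4=46341
(plain C; assumes arithmetic right shift of negatives, as the DSP provides):

  #include <assert.h>

  int main(void){
    int c;
    int x;
    c=46341;/*OC_C4S4; (short)c wraps to c-65536.*/
    for(x=-32768;x<32768;x++){
      int hi;
      hi=(short)c*x>>16;      /*What one lane of _mpy2ll() computes.*/
      assert(hi+x==c*x>>16);  /*The _add2() restores the unsigned product.*/
    }
    return 0;
  }
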
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,77 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_c64x_c64xint_H)
+# define _c64x_c64xint_H (1)
+
+# if defined(OC_C64X_ASM)
+# if !defined(oc_frag_copy)
+# define oc_frag_copy(_state,_dst,_src,_ystride) \
+ oc_frag_copy_c64x(_dst,_src,_ystride)
+# endif
+# if !defined(oc_frag_recon_intra)
+# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+ oc_frag_recon_intra_c64x(_dst,_dst_ystride,_residue)
+# endif
+# if !defined(oc_frag_recon_inter)
+# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+ oc_frag_recon_inter_c64x(_dst,_src,_ystride,_residue)
+# endif
+# if !defined(oc_frag_recon_inter2)
+# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+ oc_frag_recon_inter2_c64x(_dst,_src1,_src2,_ystride,_residue)
+# endif
+# if !defined(oc_idct8x8)
+# define oc_idct8x8(_state,_y,_last_zzi) \
+  oc_idct8x8_c64x(_y,_last_zzi)
+# endif
+# if !defined(oc_state_frag_recon)
+# define oc_state_frag_recon oc_state_frag_recon_c64x
+# endif
+# if !defined(oc_state_frag_copy_list)
+# define oc_state_frag_copy_list oc_state_frag_copy_list_c64x
+# endif
+# if !defined(oc_state_loop_filter_frag_rows)
+# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c64x
+# endif
+# if !defined(oc_restore_fpu)
+# define oc_restore_fpu(_state) do{}while(0)
+# endif
+# endif
+
+# include "../internal.h"
+
+void oc_state_vtable_init_c64x(oc_theora_state *_state);
+
+void oc_frag_copy_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif
Added: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c 2010-07-19 03:43:42 UTC (rev 17336)
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "c64xint.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_state_vtable_init_c64x(oc_theora_state *_state){
+ _state->opt_vtable.frag_copy=oc_frag_copy_c64x;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c64x;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c64x;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c64x;
+ _state->opt_vtable.idct8x8=oc_idct8x8_c64x;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c64x;
+ _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c64x;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ oc_state_loop_filter_frag_rows_c64x;
+ _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
+}
+
+#endif
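
This vtable is the runtime half of the dispatch; the macros in c64xint.h are
the compile-time half, and a function bound there never goes through the
table at all. A minimal model of the runtime pattern (hypothetical types, not
the library's actual structures):

  #include <string.h>

  typedef void (*frag_copy_fn)(unsigned char *_dst,
   const unsigned char *_src,int _ystride);

  typedef struct{
    frag_copy_fn frag_copy;
  }opt_vtable_model;

  static void frag_copy_generic(unsigned char *_dst,
   const unsigned char *_src,int _ystride){
    int i;
    for(i=0;i<8;i++){
      memcpy(_dst,_src,8);
      _dst+=_ystride;
      _src+=_ystride;
    }
  }

  int main(void){
    opt_vtable_model vt;
    unsigned char src[64];
    unsigned char dst[64];
    memset(src,0,sizeof(src));
    /*An accelerated build installs its own pointers at init time.*/
    vt.frag_copy=frag_copy_generic;
    (*vt.frag_copy)(dst,src,8);
    return 0;
  }
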