[xiph-commits] r17334 - in experimental/derf/theora-ptalarbvorm: . lib

tterribe at svn.xiph.org tterribe at svn.xiph.org
Sat Jul 17 22:02:29 PDT 2010


Author: tterribe
Date: 2010-07-17 22:02:29 -0700 (Sat, 17 Jul 2010)
New Revision: 17334

Modified:
   experimental/derf/theora-ptalarbvorm/configure.ac
   experimental/derf/theora-ptalarbvorm/lib/Makefile.am
   experimental/derf/theora-ptalarbvorm/lib/decint.h
   experimental/derf/theora-ptalarbvorm/lib/decode.c
   experimental/derf/theora-ptalarbvorm/lib/internal.h
   experimental/derf/theora-ptalarbvorm/lib/state.c
Log:
Import C64x port from David Schleef's git repository.


Modified: experimental/derf/theora-ptalarbvorm/configure.ac
===================================================================
--- experimental/derf/theora-ptalarbvorm/configure.ac	2010-07-16 16:47:23 UTC (rev 17333)
+++ experimental/derf/theora-ptalarbvorm/configure.ac	2010-07-18 05:02:29 UTC (rev 17334)
@@ -216,12 +216,18 @@
     AC_DEFINE([OC_X86_ASM], [],  [make use of x86 asm optimization])
     AC_DEFINE([OC_X86_64_ASM], [],  [make use of x86_64 asm optimization])
     ;;
+  tic6x)
+    cpu_c64x=yes
+    cpu_optimization="TI C64x+"
+    AC_DEFINE([OC_C64X_ASM], [],  [make use of c64x+ asm optimization])
+    ;;
   esac
 else
   cpu_optimization="disabled"
 fi
 AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
 AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
+AM_CONDITIONAL([CPU_c64x], [test x$cpu_c64x = xyes])
 
 # Test whenever ld supports -version-script
 AC_PROG_LD

Modified: experimental/derf/theora-ptalarbvorm/lib/Makefile.am
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/Makefile.am	2010-07-16 16:47:23 UTC (rev 17333)
+++ experimental/derf/theora-ptalarbvorm/lib/Makefile.am	2010-07-18 05:02:29 UTC (rev 17334)
@@ -20,7 +20,9 @@
 	x86/sse2idct.c \
 	x86/x86int.h \
 	x86/x86state.c \
-	x86_vc
+	x86_vc \
+	c64x/c64xint.h \
+	c64x/c64xdec.h
 
 lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
 
@@ -102,15 +104,26 @@
 	x86/mmxstate.c \
 	x86/sse2idct.c \
 	x86/x86state.c
+
+decoder_c64x_sources = \
+	c64x/c64xdec.c \
+	c64x/c64xfrag.c \
+	c64x/c64xidct.c \
+	c64x/c64xstate.c
+
 if CPU_x86_64
 decoder_arch_sources = $(decoder_x86_sources)
 else
 if CPU_x86_32
 decoder_arch_sources = $(decoder_x86_sources)
 else
+if CPU_c64x
+decoder_arch_sources = $(decoder_c64x_sources)
+else
 decoder_arch_sources =
 endif
 endif
+endif
 
 decoder_sources = \
 	apiwrapper.c \

Modified: experimental/derf/theora-ptalarbvorm/lib/decint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/decint.h	2010-07-16 16:47:23 UTC (rev 17333)
+++ experimental/derf/theora-ptalarbvorm/lib/decint.h	2010-07-18 05:02:29 UTC (rev 17334)
@@ -22,8 +22,10 @@
 # include "internal.h"
 # include "bitpack.h"
 
-typedef struct th_setup_info oc_setup_info;
-typedef struct th_dec_ctx    oc_dec_ctx;
+typedef struct th_setup_info         oc_setup_info;
+typedef struct oc_dec_opt_vtable     oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx            oc_dec_ctx;
 
 # include "huffdec.h"
 # include "dequant.h"
@@ -44,6 +46,33 @@
 
 
 
+/*Decoder specific functions with accelerated variants.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+struct oc_dec_pipeline_state{
+  int                 bounding_values[256];
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][3];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
 struct th_dec_ctx{
   /*Shared encoder/decoder state.*/
   oc_theora_state      state;
@@ -87,6 +116,9 @@
   th_ycbcr_buffer      pp_frame_buf;
   /*The striped decode callback function.*/
   th_stripe_callback   stripe_cb;
+  oc_dec_pipeline_state pipe;
+  /*Table for decoder acceleration functions.*/
+  oc_dec_opt_vtable    opt_vtable;
 # if defined(HAVE_CAIRO)
   /*Output metrics for debugging.*/
   int                  telemetry;
@@ -104,4 +136,20 @@
 # endif
 };
 
+/*Decoder-specific accelerated functions.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_dc_unpredict_mcu_plane)
+#  define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+# endif
+
+/*Default pure-C implementations.*/
+void oc_dec_vtable_init_c(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
 #endif

Modified: experimental/derf/theora-ptalarbvorm/lib/decode.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/decode.c	2010-07-16 16:47:23 UTC (rev 17333)
+++ experimental/derf/theora-ptalarbvorm/lib/decode.c	2010-07-18 05:02:29 UTC (rev 17334)
@@ -358,6 +358,11 @@
 
 
 
+void oc_dec_vtable_init_c(oc_dec_ctx *_dec){
+  _dec->opt_vtable.dc_unpredict_mcu_plane=
+   oc_dec_dc_unpredict_mcu_plane_c;
+}
+
 static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
  const th_setup_info *_setup){
   int qti;
@@ -402,6 +407,11 @@
   }
   memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
    sizeof(_dec->state.loop_filter_limits));
+#if defined(OC_C64X_ASM)
+  oc_dec_vtable_init_c64x(_dec);
+#else
+  oc_dec_vtable_init_c(_dec);
+#endif
   _dec->pp_level=OC_PP_LEVEL_DISABLED;
   _dec->dc_qis=NULL;
   _dec->variances=NULL;
@@ -1322,26 +1332,6 @@
 }
 
 
-
-typedef struct{
-  int                 bounding_values[256];
-  ptrdiff_t           ti[3][64];
-  ptrdiff_t           eob_runs[3][64];
-  const ptrdiff_t    *coded_fragis[3];
-  const ptrdiff_t    *uncoded_fragis[3];
-  ptrdiff_t           ncoded_fragis[3];
-  ptrdiff_t           nuncoded_fragis[3];
-  const ogg_uint16_t *dequant[3][3][2];
-  int                 fragy0[3];
-  int                 fragy_end[3];
-  int                 pred_last[3][3];
-  int                 mcu_nvfrags;
-  int                 loop_filter;
-  int                 pp_level;
-}oc_dec_pipeline_state;
-
-
-
 /*Initialize the main decoding pipeline.*/
 static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe){
@@ -1401,7 +1391,7 @@
    rows).
   As a side effect, the number of coded and uncoded fragments in this plane of
    the MCU is also computed.*/
-static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
   const oc_fragment_plane *fplane;
   oc_fragment             *frags;
@@ -1476,6 +1466,7 @@
             case  9:
             case 11:
             case 13:{
+              /*The TI compiler mis-compiles this line.*/
               pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
             }break;
             case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
@@ -2124,13 +2115,12 @@
     return TH_DUPFRAME;
   }
   else{
-    oc_dec_pipeline_state pipe;
-    th_ycbcr_buffer       stripe_buf;
-    int                   stripe_fragy;
-    int                   refi;
-    int                   pli;
-    int                   notstart;
-    int                   notdone;
+    th_ycbcr_buffer stripe_buf;
+    int             stripe_fragy;
+    int             refi;
+    int             pli;
+    int             notstart;
+    int             notdone;
     /*Select a free buffer to use for the reconstructed version of this frame.*/
     for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
      refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
@@ -2194,15 +2184,15 @@
       An application callback allows further application processing (blitting
        to video memory, color conversion, etc.) to also use the data while it's
        in cache.*/
-    oc_dec_pipeline_init(_dec,&pipe);
+    oc_dec_pipeline_init(_dec,&_dec->pipe);
     oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
     notstart=0;
     notdone=1;
-    for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+    for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){
       int avail_fragy0;
       int avail_fragy_end;
       avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
-      notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
+      notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end;
       for(pli=0;pli<3;pli++){
         oc_fragment_plane *fplane;
         int                frag_shift;
@@ -2213,45 +2203,45 @@
         /*Compute the first and last fragment row of the current MCU for this
            plane.*/
         frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
-        pipe.fragy0[pli]=stripe_fragy>>frag_shift;
-        pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
-         pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift));
-        oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli);
-        oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli);
+        _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli);
         sdelay=edelay=0;
-        if(pipe.loop_filter){
+        if(_dec->pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
-          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
-           refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          oc_state_loop_filter_frag_rows(&_dec->state,_dec->pipe.bounding_values,
+           refi,pli,_dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
         }
         /*To fill the borders, we have an additional two pixel delay, since a
            fragment in the next row could filter its top edge, using two pixels
            from a fragment in this row.
           But there's no reason to delay a full fragment between the two.*/
         oc_state_borders_fill_rows(&_dec->state,refi,pli,
-         (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
-         (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+         (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
         /*Out-of-loop post-processing.*/
         pp_offset=3*(pli!=0);
-        if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+        if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
           /*Perform de-blocking in one plane.*/
           sdelay+=notstart;
           edelay+=notdone;
           oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
            _dec->state.ref_frame_bufs[refi],pli,
-           pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
-          if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
             /*Perform de-ringing in one plane.*/
             sdelay+=notstart;
             edelay+=notdone;
             oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
-             pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+             _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
           }
         }
         /*If no post-processing is done, we still need to delay a row for the
            loop filter, thanks to the strange filtering order VP3 chose.*/
-        else if(pipe.loop_filter){
+        else if(_dec->pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
         }
@@ -2260,9 +2250,9 @@
            doubled, but luma might have more post-processing filters enabled
            than chroma, so we don't know up front which one is the limiting
            factor.*/
-        avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy0=OC_MINI(avail_fragy0,_dec->pipe.fragy0[pli]-sdelay<<frag_shift);
         avail_fragy_end=OC_MINI(avail_fragy_end,
-         pipe.fragy_end[pli]-edelay<<frag_shift);
+         _dec->pipe.fragy_end[pli]-edelay<<frag_shift);
       }
       if(_dec->stripe_cb.stripe_decoded!=NULL){
         /*The callback might want to use the FPU, so let's make sure they can.

Modified: experimental/derf/theora-ptalarbvorm/lib/internal.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/internal.h	2010-07-16 16:47:23 UTC (rev 17333)
+++ experimental/derf/theora-ptalarbvorm/lib/internal.h	2010-07-18 05:02:29 UTC (rev 17334)
@@ -88,7 +88,7 @@
 /*A predicted frame.*/
 # define OC_INTER_FRAME (1)
 /*A frame of unknown type (frame type decision has not yet been made).*/
-# define OC_UNKWN_FRAME (-1)
+# define OC_UNKWN_FRAME ((unsigned)-1)
 
 /*The amount of padding to add to the reconstructed frame buffers on all
    sides.

Modified: experimental/derf/theora-ptalarbvorm/lib/state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/state.c	2010-07-16 16:47:23 UTC (rev 17333)
+++ experimental/derf/theora-ptalarbvorm/lib/state.c	2010-07-18 05:02:29 UTC (rev 17334)
@@ -25,6 +25,9 @@
 # include "x86/x86int.h"
 #endif
 #endif
+#if defined(OC_C64X_ASM)
+# include "c64x/c64xint.h"
+#endif
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
@@ -610,6 +613,8 @@
 void oc_state_vtable_init(oc_theora_state *_state){
 #if defined(OC_X86_ASM)
   oc_state_vtable_init_x86(_state);
+#elif defined(OC_C64X_ASM)
+  oc_state_vtable_init_c64x(_state);
 #else
   oc_state_vtable_init_c(_state);
 #endif



More information about the commits mailing list