[xiph-commits] r9190 - experimental/giles/theora-exp-mt/lib

Wed Apr 27 16:12:21 PDT 2005

Author: tterribe
Date: 2005-04-27 16:12:19 -0700 (Wed, 27 Apr 2005)
New Revision: 9190

Modified:
   experimental/giles/theora-exp-mt/lib/decint.h
   experimental/giles/theora-exp-mt/lib/decode.c
Log:
Update to spawn the extra threads once at initialization, instead of for every
 stripe.
This gets about an additional 5% speedup on a 2-way P4-Xeon machine (with
 hyperthreading enabled), for a total of about 9.3% over the single-threaded
 version.


Modified: experimental/giles/theora-exp-mt/lib/decint.h
===================================================================

--- experimental/giles/theora-exp-mt/lib/decint.h	2005-04-26 18:50:58 UTC (rev 9189)
+++ experimental/giles/theora-exp-mt/lib/decint.h	2005-04-27 23:12:19 UTC (rev 9190)
@@ -1,11 +1,14 @@
 #include <limits.h>
+#include <pthread.h>
 #if !defined(_decint_H)
 # define _decint_H (1)
 # include "theora/theora.h"
 # include "internal.h"
 
-typedef struct theora_setup_info oc_setup_info;
-typedef struct theora_dec_ctx    oc_dec_ctx;
+typedef struct theora_setup_info     oc_setup_info;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct oc_dec_pipeline_plane oc_dec_pipeline_plane;
+typedef struct theora_dec_ctx        oc_dec_ctx;
 
 # include "idct.h"
 # include "huffdec.h"
@@ -27,6 +30,41 @@
 
 
 
+struct oc_dec_pipeline_plane{
+  oc_dec_ctx    *dec;
+  int            done;
+  int            pli;
+  int            avail_fragy0;
+  int            avail_fragy_end;
+};
+
+
+
+struct oc_dec_pipeline_state{
+  int                   ti[3][64];
+  int                   ebi[3][64];
+  int                   eob_runs[3][64];
+  int                   bounding_values[512];
+  int                  *coded_fragis[3];
+  int                  *uncoded_fragis[3];
+  int                   fragy0[3];
+  int                   fragy_end[3];
+  int                   ncoded_fragis[3];
+  int                   nuncoded_fragis[3];
+  int                   pred_last[3][3];
+  int                   mcu_nvfrags;
+  int                   loop_filter;
+  int                   pp_level;
+  int                   stripe_fragy;
+  int                   refi;
+  int                   notstart;
+  int                   notdone;
+  oc_dec_pipeline_plane pplanes[3];
+};
+
+
+
+
 struct theora_dec_ctx{
   /*Shared encoder/decoder state.*/
   oc_theora_state          state;
@@ -72,6 +110,14 @@
   theora_ycbcr_buffer      pp_frame_buf;
   /*The striped decode callback function.*/
   theora_stripe_callback   stripe_cb;
+  /*The striped decoding pipeline.*/
+  oc_dec_pipeline_state    pipe;
+  /*Mutex for parallel pipelined decode.*/
+  pthread_mutex_t          pipe_lock;
+  /*The auxiliary decoder threads.*/
+  pthread_t                pipe_thread[2];
+  /*Condition variables for the auxiliary decoder threads.*/
+  pthread_cond_t           pipe_cond[2];
 };
 
 /*Fix-ups for the libogg1 API, which returns -1 when there are insufficient

Modified: experimental/giles/theora-exp-mt/lib/decode.c
===================================================================
--- experimental/giles/theora-exp-mt/lib/decode.c	2005-04-26 18:50:58 UTC (rev 9189)
+++ experimental/giles/theora-exp-mt/lib/decode.c	2005-04-27 23:12:19 UTC (rev 9190)
@@ -110,7 +110,10 @@
 };
 
 
+static void *oc_pipe_advance_loop(void *_arg);
 
+
+
 static int oc_sb_run_unpack(oggpack_buffer *_opb){
   long bits;
   int ret;
@@ -215,10 +218,35 @@
   _dec->pp_frame_data=NULL;
   _dec->stripe_cb.ctx=NULL;
   _dec->stripe_cb.stripe_decoded=NULL;
+  pthread_mutex_init(&_dec->pipe_lock,NULL);
+  pthread_mutex_lock(&_dec->pipe_lock);
+  for(pli=0;pli<3;pli++){
+    _dec->pipe.pplanes[pli].dec=_dec;
+    _dec->pipe.pplanes[pli].pli=pli;
+    _dec->pipe.pplanes[pli].done=1;
+  }
+  for(pli=1;pli<3;pli++){
+    pthread_cond_init(_dec->pipe_cond+pli-1,NULL);
+    pthread_create(_dec->pipe_thread+pli-1,NULL,oc_pipe_advance_loop,
+     _dec->pipe.pplanes+pli);
+  }
+  pthread_mutex_unlock(&_dec->pipe_lock);
   return 0;
 }
 
 static void oc_dec_clear(oc_dec_ctx *_dec){
+  int pli;
+  /*Collect our auxiliary decoder threads.*/
+  for(pli=1;pli<3;pli++){
+    void *ret;
+    pthread_mutex_lock(&_dec->pipe_lock);
+    _dec->pipe.pplanes[pli].done=-1;
+    pthread_cond_signal(_dec->pipe_cond+pli-1);
+    pthread_mutex_unlock(&_dec->pipe_lock);
+    pthread_join(_dec->pipe_thread[pli-1],&ret);
+    pthread_cond_destroy(_dec->pipe_cond+pli-1);
+  }
+  pthread_mutex_destroy(&_dec->pipe_lock);
   _ogg_free(_dec->pp_frame_data);
   _ogg_free(_dec->variances);
   _ogg_free(_dec->dc_qis);
@@ -1285,25 +1313,6 @@
 
 
 
-typedef struct{
-  int  ti[3][64];
-  int  ebi[3][64];
-  int  eob_runs[3][64];
-  int  bounding_values[512];
-  int *coded_fragis[3];
-  int *uncoded_fragis[3];
-  int  fragy0[3];
-  int  fragy_end[3];
-  int  ncoded_fragis[3];
-  int  nuncoded_fragis[3];
-  int  pred_last[3][3];
-  int  mcu_nvfrags;
-  int  loop_filter;
-  int  pp_level;
-}oc_dec_pipeline_state;
-
-
-
 /*Initialize the main decoding pipeline.*/
 static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe){
@@ -1884,66 +1893,56 @@
   }
 }
 
-typedef struct{
-  theora_dec_ctx *dec;
+
+/*Advance the pipeline for one image plane.*/
+static void oc_pipe_advance(oc_dec_pipeline_plane *_pplane){
+  oc_dec_ctx            *dec;
   oc_dec_pipeline_state *pipe;
-  int refi, pli;
-  int stripe_fragy;
-  int *avail_fragy0;
-  int *avail_fragy_end;
-  int notstart, notdone;
-  pthread_mutex_t *lock;
-}oc_stripe;
-
-/*advance the pipeline for one image plane*/
-static void *oc_pipe_advance(void *_arg){
-  oc_stripe *stripe = (oc_stripe*)_arg;
-  oc_dec_pipeline_state *pipe = stripe->pipe;
-  theora_dec_ctx *dec = stripe->dec;
-  int pli = stripe->pli;
-
-  oc_fragment_plane *fplane;
-  int                frag_shift;
-  int                pp_offset;
-  int                sdelay;
-  int                edelay;
-
+  oc_fragment_plane     *fplane;
+  int                    frag_shift;
+  int                    pp_offset;
+  int                    sdelay;
+  int                    edelay;
+  int                    pli;
+  dec=_pplane->dec;
+  pipe=&dec->pipe;
+  pli=_pplane->pli;
   fplane=dec->state.fplanes+pli;
   /*Compute the first and last fragment row of the current MCU for this
         plane.*/
   frag_shift=pli!=0&&!(dec->state.info.pixel_fmt&2);
-  pipe->fragy0[pli]=stripe->stripe_fragy>>frag_shift;
+  pipe->fragy0[pli]=pipe->stripe_fragy>>frag_shift;
   pipe->fragy_end[pli]=OC_MINI(fplane->nvfrags,
    pipe->fragy0[pli]+(pipe->mcu_nvfrags>>frag_shift));
   oc_dec_dc_unpredict_mcu_plane(dec,pipe,pli);
   oc_dec_frags_recon_mcu_plane(dec,pipe,pli);
   sdelay=edelay=0;
   if(pipe->loop_filter){
-    sdelay+=stripe->notstart;
-    edelay+=stripe->notdone;
+    sdelay+=pipe->notstart;
+    edelay+=pipe->notdone;
     oc_state_loop_filter_frag_rows(&dec->state,pipe->bounding_values+256,
-     stripe->refi,pli,pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
+     pipe->refi,pli,pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
   }
   /*To fill the borders, we have an additional two pixel delay, since a
      fragment in the next row could filter its top edge, using two pixels
      from a fragment in this row.
     But there's no reason to delay a full fragment between the two.*/
-  oc_state_borders_fill_rows(&dec->state,stripe->refi,pli,
+  oc_state_borders_fill_rows(&dec->state,pipe->refi,pli,
    (pipe->fragy0[pli]-sdelay<<3)-(sdelay<<1),
    (pipe->fragy_end[pli]-edelay<<3)-(edelay<<1));
   /*Out-of-loop post-processing.*/
   pp_offset=3*(pli!=0);
   if(pipe->pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
     /*Perform de-blocking in one plane.*/
-    sdelay+=stripe->notstart;
-    edelay+=stripe->notdone;
+    sdelay+=pipe->notstart;
+    edelay+=pipe->notdone;
     oc_dec_deblock_frag_rows(dec,dec->pp_frame_buf,
-     dec->state.ref_frame_bufs[stripe->refi],pli,
+     dec->state.ref_frame_bufs[pipe->refi],pli,
      pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
     if(pipe->pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
       /*Perform de-ringing in one plane.*/
-      sdelay+=stripe->notstart;
-      edelay+=stripe->notdone;
+      sdelay+=pipe->notstart;
+      edelay+=pipe->notdone;
       oc_dec_dering_frag_rows(dec,dec->pp_frame_buf,pli,
        pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
     }
@@ -1951,22 +1950,38 @@
   /*If no post-processing is done, we still need to delay a row for the
      loop filter, thanks to the strange filtering order VP3 chose.*/
   else if(pipe->loop_filter){
-    sdelay+=stripe->notstart;
-    edelay+=stripe->notdone;
+    sdelay+=pipe->notstart;
+    edelay+=pipe->notdone;
   }
-  /*Compute the intersection of the available rows in all planes.
-    If chroma is sub-sampled, the effect of each of its delays is
-     doubled, but luma might have more post-processing filters enabled
-     than chroma, so we don't know up front which one is the limiting
-     factor.*/
-  pthread_mutex_lock(stripe->lock);
-  *stripe->avail_fragy0=OC_MINI(*stripe->avail_fragy0,
-   pipe->fragy0[pli]-sdelay<<frag_shift);
-  *stripe->avail_fragy_end=OC_MINI(*stripe->avail_fragy_end,
-   pipe->fragy_end[pli]-edelay<<frag_shift);
-  pthread_mutex_unlock(stripe->lock);
+  /*Store the range of fragments available after processing this stripe.*/
+  _pplane->avail_fragy0=pipe->fragy0[pli]-sdelay<<frag_shift;
+  _pplane->avail_fragy_end=pipe->fragy_end[pli]-edelay<<frag_shift;
+}
 
-  return (void*)NULL;
+/*The main loop for our auxiliary decoder threads.*/
+static void *oc_pipe_advance_loop(void *_arg){
+  oc_dec_pipeline_plane *pplane;
+  oc_dec_ctx            *dec;
+  int                    pli;
+  pplane=(oc_dec_pipeline_plane *)_arg;
+  dec=pplane->dec;
+  pli=pplane->pli;
+  for(;;){
+    pthread_mutex_lock(&dec->pipe_lock);
+    while(pplane->done==1){
+      pthread_cond_wait(dec->pipe_cond+pli-1,&dec->pipe_lock);
+    }
+    if(pplane->done<0)break;
+    pthread_mutex_unlock(&dec->pipe_lock);
+    oc_pipe_advance(pplane);
+    pthread_mutex_lock(&dec->pipe_lock);
+    pplane->done=1;
+    pthread_cond_signal(dec->pipe_cond+pli-1);
+    pthread_mutex_unlock(&dec->pipe_lock);
+  }
+  /*Whenever we break, we're still holding the lock, so release it.*/
+  pthread_mutex_unlock(&dec->pipe_lock);
+  return NULL;
 }
 
 int theora_decode_packetin(theora_dec_ctx *_dec,const ogg_packet *_op,
@@ -1977,13 +1992,11 @@
      like an inter frame with no coded blocks.
     Only proceed if we have a non-empty packet.*/
   if(_op->bytes!=0){
-    oc_dec_pipeline_state pipe;
-    theora_ycbcr_buffer   stripe_buf;
-    int                   stripe_fragy;
-    int                   refi;
-    int                   pli;
-    int                   notstart;
-    int                   notdone;
+    oc_dec_pipeline_state *pipe;
+    theora_ycbcr_buffer    stripe_buf;
+    int                    stripe_fragy;
+    int                    refi;
+    int                    pli;
     oggpackB_readinit(&_dec->opb,_op->packet,_op->bytes);
     ret=oc_dec_frame_header_unpack(_dec);
     if(ret<0)return ret;
@@ -2060,38 +2073,42 @@
       An application callback allows further application processing (blitting
        to video memory, color conversion, etc.) to also use the data while it's
        in cache.*/
-    oc_dec_pipeline_init(_dec,&pipe);
+    pipe=&_dec->pipe;
+    oc_dec_pipeline_init(_dec,pipe);
     oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
-    notstart=0;
-    notdone=1;
-    for(stripe_fragy=notstart=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+    pipe->notstart=0;
+    pipe->notdone=1;
+    pipe->refi=refi;
+    for(stripe_fragy=0;pipe->notdone;stripe_fragy+=pipe->mcu_nvfrags){
       int avail_fragy0;
       int avail_fragy_end;
-      pthread_t threads[3];
-      pthread_mutex_t lock;
-      oc_stripe stripes[3];
       avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
-      notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
-      pthread_mutex_init(&lock, NULL);
-      for(pli=3;pli-->0;){
-        stripes[pli].dec = _dec;
-        stripes[pli].pipe = &pipe;
-        stripes[pli].refi = refi;
-        stripes[pli].pli = pli;
-        stripes[pli].avail_fragy0 = &avail_fragy0;
-        stripes[pli].avail_fragy_end = &avail_fragy_end;
-        stripes[pli].stripe_fragy = stripe_fragy;
-        stripes[pli].notstart = notstart;
-        stripes[pli].notdone = notdone;
-        stripes[pli].lock = &lock;
-        if(pli>0){
-          pthread_create(&threads[pli], NULL, oc_pipe_advance, 
-           (void*)&stripes[pli]);
+      pipe->notdone=stripe_fragy+pipe->mcu_nvfrags<avail_fragy_end;
+      pipe->stripe_fragy=stripe_fragy;
+      pthread_mutex_lock(&_dec->pipe_lock);
+      for(pli=1;pli<3;pli++){
+        pipe->pplanes[pli].done=0;
+        pthread_cond_signal(_dec->pipe_cond+pli-1);
+      }
+      pthread_mutex_unlock(&_dec->pipe_lock);
+      oc_pipe_advance(pipe->pplanes+0);
+      pthread_mutex_lock(&_dec->pipe_lock);
+      for(pli=1;pli<3;pli++){
+        while(pipe->pplanes[pli].done==0){
+          pthread_cond_wait(_dec->pipe_cond+pli-1,&_dec->pipe_lock);
         }
-        else oc_pipe_advance(stripes+pli);
       }
-      for(pli=1;pli<3;pli++)
-        pthread_join(threads[pli],NULL);
+      pthread_mutex_unlock(&_dec->pipe_lock);
+      /*Compute the intersection of the available rows in all planes.
+        If chroma is sub-sampled, the effect of each of its delays is
+         doubled, but luma might have more post-processing filters enabled
+         than chroma, so we don't know up front which one is the limiting
+         factor.*/
+      for(pli=0;pli<3;pli++){
+        avail_fragy0=OC_MINI(avail_fragy0,pipe->pplanes[pli].avail_fragy0);
+        avail_fragy_end=OC_MINI(avail_fragy_end,
+         pipe->pplanes[pli].avail_fragy_end);
+      }
       if(_dec->stripe_cb.stripe_decoded!=NULL){
         /*Make the callback, ensuring we flip the sense of the "start" and
            "end" of the available region upside down.*/
@@ -2099,7 +2116,7 @@
          _dec->state.fplanes[0].nvfrags-avail_fragy_end,
          _dec->state.fplanes[0].nvfrags-avail_fragy0);
       }
-      notstart=1;
+      pipe->notstart=1;
     }
     /*Finish filling in the reference frame borders.*/
     for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);