[xiph-commits] r15274 - branches/theora_multithread_decode_omp/lib/dec

Mon Sep 8 15:35:42 PDT 2008

Author: piga
Date: 2008-09-08 15:35:40 -0700 (Mon, 08 Sep 2008)
New Revision: 15274

Modified:
   branches/theora_multithread_decode_omp/lib/dec/decode.c
Log:
This is the fatest version using openMP


Modified: branches/theora_multithread_decode_omp/lib/dec/decode.c
===================================================================

--- branches/theora_multithread_decode_omp/lib/dec/decode.c	2008-09-08 16:01:44 UTC (rev 15273)
+++ branches/theora_multithread_decode_omp/lib/dec/decode.c	2008-09-08 22:35:40 UTC (rev 15274)
@@ -24,6 +24,10 @@
 # include "png.h"
 #endif
 
+#ifndef MCU_OMP_VALUE
+#define MCU_OMP_VALUE 64
+#endif
+
 /*No post-processing.*/
 #define OC_PP_LEVEL_DISABLED  (0)
 /*Keep track of DC qi for each block only.*/
@@ -2137,89 +2141,76 @@
       int avail_fragy_end;
       avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
       notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
-      #pragma omp parallel
-      {
-        #pragma omp for nowait
-	for(pli=0;pli<3;pli++) {
-	  oc_fragment_plane *fplane;
-	  int                frag_shift;
-	  fplane=_dec->state.fplanes+pli;
-	  /*Compute the first and last fragment row of the current MCU for this
-	    plane.*/
-	  frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
-	  pipe.fragy0[pli]=stripe_fragy>>frag_shift;
-	  pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
-				      pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift));
-	  oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli);
-	  oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli);
-	}
-	#pragma omp for
-	for(pli=0;pli<3;pli++){
-	  int                pp_offset;
-	  int                sdelay;
-	  int                edelay;
-	  oc_fragment_plane *fplane;
-	  int                frag_shift;
-	  fplane=_dec->state.fplanes+pli;
-	  /*Compute the first and last fragment row of the current MCU for this
-	    plane.*/
-	  frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
-
-	  sdelay=edelay=0;
-	  if(pipe.loop_filter){
-	    sdelay+=notstart;
-	    edelay+=notdone;
-	    oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
-					   refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
-	  }
-	  /*To fill the borders, we have an additional two pixel delay, since a
-	    fragment in the next row could filter its top edge, using two pixels
-	    from a fragment in this row.
-	    But there's no reason to delay a full fragment between the two.*/
-	  oc_state_borders_fill_rows(&_dec->state,refi,pli,
-				     (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
-				     (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
-	  /*Out-of-loop post-processing.*/
-	  pp_offset=3*(pli!=0);
-	  if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
-	    /*Perform de-blocking in one plane.*/
-	    sdelay+=notstart;
-	    edelay+=notdone;
-	    oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
-				     _dec->state.ref_frame_bufs[refi],pli,
-				     pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
-	    if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
-	      /*Perform de-ringing in one plane.*/
-	      sdelay+=notstart;
-	      edelay+=notdone;
-	      oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
-				      pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
-	    }
-	  }
-	  /*If no post-processing is done, we still need to delay a row for the
-	    loop filter, thanks to the strange filtering order VP3 chose.*/
-	  else if(pipe.loop_filter){
-	    sdelay+=notstart;
-	    edelay+=notdone;
-	  }
-	  /*Compute the intersection of the available rows in all planes.
-	    If chroma is sub-sampled, the effect of each of its delays is
-	    doubled, but luma might have more post-processing filters enabled
-	    than chroma, so we don't know up front which one is the limiting
-	    factor.*/
-	  avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift);
-	  avail_fragy_end=OC_MINI(avail_fragy_end,
-				  pipe.fragy_end[pli]-edelay<<frag_shift);
-	}
-	if(_dec->stripe_cb.stripe_decoded!=NULL){
-	  /*Make the callback, ensuring we flip the sense of the "start" and
-	    "end" of the available region upside down.*/
-	  (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf,
-					    _dec->state.fplanes[0].nvfrags-avail_fragy_end,
-					    _dec->state.fplanes[0].nvfrags-avail_fragy0);
-	}
-	notstart=1;
+      #pragma omp parallel for
+      for(pli=0;pli<3;pli++){
+        oc_fragment_plane *fplane;
+        int                frag_shift;
+        int                pp_offset;
+        int                sdelay;
+        int                edelay;
+        fplane=_dec->state.fplanes+pli;
+        /*Compute the first and last fragment row of the current MCU for this
+           plane.*/
+        frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
+        pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli);
+        sdelay=edelay=0;
+        if(pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
+           refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+        }
+        /*To fill the borders, we have an additional two pixel delay, since a
+           fragment in the next row could filter its top edge, using two pixels
+           from a fragment in this row.
+          But there's no reason to delay a full fragment between the two.*/
+        oc_state_borders_fill_rows(&_dec->state,refi,pli,
+         (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+        /*Out-of-loop post-processing.*/
+        pp_offset=3*(pli!=0);
+        if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+          /*Perform de-blocking in one plane.*/
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
+           _dec->state.ref_frame_bufs[refi],pli,
+           pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+            /*Perform de-ringing in one plane.*/
+            sdelay+=notstart;
+            edelay+=notdone;
+            oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
+             pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          }
+        }
+        /*If no post-processing is done, we still need to delay a row for the
+           loop filter, thanks to the strange filtering order VP3 chose.*/
+        else if(pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+        }
+        /*Compute the intersection of the available rows in all planes.
+          If chroma is sub-sampled, the effect of each of its delays is
+           doubled, but luma might have more post-processing filters enabled
+           than chroma, so we don't know up front which one is the limiting
+           factor.*/
+        avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy_end=OC_MINI(avail_fragy_end,
+         pipe.fragy_end[pli]-edelay<<frag_shift);
       }
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+        /*Make the callback, ensuring we flip the sense of the "start" and
+           "end" of the available region upside down.*/
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf,
+         _dec->state.fplanes[0].nvfrags-avail_fragy_end,
+         _dec->state.fplanes[0].nvfrags-avail_fragy0);
+      }
+      notstart=1;
     }
 
 #ifdef _TH_DEBUG_