[xiph-commits] r9190 - experimental/giles/theora-exp-mt/lib
tterribe at motherfish-iii.xiph.org
tterribe at motherfish-iii.xiph.org
Wed Apr 27 16:12:21 PDT 2005
Author: tterribe
Date: 2005-04-27 16:12:19 -0700 (Wed, 27 Apr 2005)
New Revision: 9190
Modified:
experimental/giles/theora-exp-mt/lib/decint.h
experimental/giles/theora-exp-mt/lib/decode.c
Log:
Update to spawn the extra threads once at initialization, instead of for every
stripe.
This gets about an additional 5% speedup on a 2-way P4-Xeon machine (with
hyperthreading enabled), for a total of about 9.3% over the single-threaded
version.
Modified: experimental/giles/theora-exp-mt/lib/decint.h
===================================================================
--- experimental/giles/theora-exp-mt/lib/decint.h 2005-04-26 18:50:58 UTC (rev 9189)
+++ experimental/giles/theora-exp-mt/lib/decint.h 2005-04-27 23:12:19 UTC (rev 9190)
@@ -1,11 +1,14 @@
#include <limits.h>
+#include <pthread.h>
#if !defined(_decint_H)
# define _decint_H (1)
# include "theora/theora.h"
# include "internal.h"
-typedef struct theora_setup_info oc_setup_info;
-typedef struct theora_dec_ctx oc_dec_ctx;
+typedef struct theora_setup_info oc_setup_info;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct oc_dec_pipeline_plane oc_dec_pipeline_plane;
+typedef struct theora_dec_ctx oc_dec_ctx;
# include "idct.h"
# include "huffdec.h"
@@ -27,6 +30,41 @@
+struct oc_dec_pipeline_plane{
+ oc_dec_ctx *dec;
+ int done;
+ int pli;
+ int avail_fragy0;
+ int avail_fragy_end;
+};
+
+
+
+struct oc_dec_pipeline_state{
+ int ti[3][64];
+ int ebi[3][64];
+ int eob_runs[3][64];
+ int bounding_values[512];
+ int *coded_fragis[3];
+ int *uncoded_fragis[3];
+ int fragy0[3];
+ int fragy_end[3];
+ int ncoded_fragis[3];
+ int nuncoded_fragis[3];
+ int pred_last[3][3];
+ int mcu_nvfrags;
+ int loop_filter;
+ int pp_level;
+ int stripe_fragy;
+ int refi;
+ int notstart;
+ int notdone;
+ oc_dec_pipeline_plane pplanes[3];
+};
+
+
+
+
struct theora_dec_ctx{
/*Shared encoder/decoder state.*/
oc_theora_state state;
@@ -72,6 +110,14 @@
theora_ycbcr_buffer pp_frame_buf;
/*The striped decode callback function.*/
theora_stripe_callback stripe_cb;
+ /*The striped decoding pipeline.*/
+ oc_dec_pipeline_state pipe;
+ /*Mutex for parallel pipelined decode.*/
+ pthread_mutex_t pipe_lock;
+ /*The auxiliary decoder threads.*/
+ pthread_t pipe_thread[2];
+ /*Condition variables for the auxiliary decoder threads.*/
+ pthread_cond_t pipe_cond[2];
};
/*Fix-ups for the libogg1 API, which returns -1 when there are insufficient
Modified: experimental/giles/theora-exp-mt/lib/decode.c
===================================================================
--- experimental/giles/theora-exp-mt/lib/decode.c 2005-04-26 18:50:58 UTC (rev 9189)
+++ experimental/giles/theora-exp-mt/lib/decode.c 2005-04-27 23:12:19 UTC (rev 9190)
@@ -110,7 +110,10 @@
};
+static void *oc_pipe_advance_loop(void *_arg);
+
+
static int oc_sb_run_unpack(oggpack_buffer *_opb){
long bits;
int ret;
@@ -215,10 +218,35 @@
_dec->pp_frame_data=NULL;
_dec->stripe_cb.ctx=NULL;
_dec->stripe_cb.stripe_decoded=NULL;
+ pthread_mutex_init(&_dec->pipe_lock,NULL);
+ pthread_mutex_lock(&_dec->pipe_lock);
+ for(pli=0;pli<3;pli++){
+ _dec->pipe.pplanes[pli].dec=_dec;
+ _dec->pipe.pplanes[pli].pli=pli;
+ _dec->pipe.pplanes[pli].done=1;
+ }
+ for(pli=1;pli<3;pli++){
+ pthread_cond_init(_dec->pipe_cond+pli-1,NULL);
+ pthread_create(_dec->pipe_thread+pli-1,NULL,oc_pipe_advance_loop,
+ _dec->pipe.pplanes+pli);
+ }
+ pthread_mutex_unlock(&_dec->pipe_lock);
return 0;
}
static void oc_dec_clear(oc_dec_ctx *_dec){
+ int pli;
+ /*Collect our auxiliary decoder threads.*/
+ for(pli=1;pli<3;pli++){
+ void *ret;
+ pthread_mutex_lock(&_dec->pipe_lock);
+ _dec->pipe.pplanes[pli].done=-1;
+ pthread_cond_signal(_dec->pipe_cond+pli-1);
+ pthread_mutex_unlock(&_dec->pipe_lock);
+ pthread_join(_dec->pipe_thread[pli-1],&ret);
+ pthread_cond_destroy(_dec->pipe_cond+pli-1);
+ }
+ pthread_mutex_destroy(&_dec->pipe_lock);
_ogg_free(_dec->pp_frame_data);
_ogg_free(_dec->variances);
_ogg_free(_dec->dc_qis);
@@ -1285,25 +1313,6 @@
-typedef struct{
- int ti[3][64];
- int ebi[3][64];
- int eob_runs[3][64];
- int bounding_values[512];
- int *coded_fragis[3];
- int *uncoded_fragis[3];
- int fragy0[3];
- int fragy_end[3];
- int ncoded_fragis[3];
- int nuncoded_fragis[3];
- int pred_last[3][3];
- int mcu_nvfrags;
- int loop_filter;
- int pp_level;
-}oc_dec_pipeline_state;
-
-
-
/*Initialize the main decoding pipeline.*/
static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
oc_dec_pipeline_state *_pipe){
@@ -1884,66 +1893,56 @@
}
}
-typedef struct{
- theora_dec_ctx *dec;
+
+/*Advance the pipeline for one image plane.*/
+static void oc_pipe_advance(oc_dec_pipeline_plane *_pplane){
+ oc_dec_ctx *dec;
oc_dec_pipeline_state *pipe;
- int refi, pli;
- int stripe_fragy;
- int *avail_fragy0;
- int *avail_fragy_end;
- int notstart, notdone;
- pthread_mutex_t *lock;
-}oc_stripe;
-
-/*advance the pipeline for one image plane*/
-static void *oc_pipe_advance(void *_arg){
- oc_stripe *stripe = (oc_stripe*)_arg;
- oc_dec_pipeline_state *pipe = stripe->pipe;
- theora_dec_ctx *dec = stripe->dec;
- int pli = stripe->pli;
-
- oc_fragment_plane *fplane;
- int frag_shift;
- int pp_offset;
- int sdelay;
- int edelay;
-
+ oc_fragment_plane *fplane;
+ int frag_shift;
+ int pp_offset;
+ int sdelay;
+ int edelay;
+ int pli;
+ dec=_pplane->dec;
+ pipe=&dec->pipe;
+ pli=_pplane->pli;
fplane=dec->state.fplanes+pli;
/*Compute the first and last fragment row of the current MCU for this
plane.*/
frag_shift=pli!=0&&!(dec->state.info.pixel_fmt&2);
- pipe->fragy0[pli]=stripe->stripe_fragy>>frag_shift;
+ pipe->fragy0[pli]=pipe->stripe_fragy>>frag_shift;
pipe->fragy_end[pli]=OC_MINI(fplane->nvfrags,
pipe->fragy0[pli]+(pipe->mcu_nvfrags>>frag_shift));
oc_dec_dc_unpredict_mcu_plane(dec,pipe,pli);
oc_dec_frags_recon_mcu_plane(dec,pipe,pli);
sdelay=edelay=0;
if(pipe->loop_filter){
- sdelay+=stripe->notstart;
- edelay+=stripe->notdone;
+ sdelay+=pipe->notstart;
+ edelay+=pipe->notdone;
oc_state_loop_filter_frag_rows(&dec->state,pipe->bounding_values+256,
- stripe->refi,pli,pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
+ pipe->refi,pli,pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
}
/*To fill the borders, we have an additional two pixel delay, since a
fragment in the next row could filter its top edge, using two pixels
from a fragment in this row.
But there's no reason to delay a full fragment between the two.*/
- oc_state_borders_fill_rows(&dec->state,stripe->refi,pli,
+ oc_state_borders_fill_rows(&dec->state,pipe->refi,pli,
(pipe->fragy0[pli]-sdelay<<3)-(sdelay<<1),
(pipe->fragy_end[pli]-edelay<<3)-(edelay<<1));
/*Out-of-loop post-processing.*/
pp_offset=3*(pli!=0);
if(pipe->pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
/*Perform de-blocking in one plane.*/
- sdelay+=stripe->notstart;
- edelay+=stripe->notdone;
+ sdelay+=pipe->notstart;
+ edelay+=pipe->notdone;
oc_dec_deblock_frag_rows(dec,dec->pp_frame_buf,
- dec->state.ref_frame_bufs[stripe->refi],pli,
+ dec->state.ref_frame_bufs[pipe->refi],pli,
pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
if(pipe->pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
/*Perform de-ringing in one plane.*/
- sdelay+=stripe->notstart;
- edelay+=stripe->notdone;
+ sdelay+=pipe->notstart;
+ edelay+=pipe->notdone;
oc_dec_dering_frag_rows(dec,dec->pp_frame_buf,pli,
pipe->fragy0[pli]-sdelay,pipe->fragy_end[pli]-edelay);
}
@@ -1951,22 +1950,38 @@
/*If no post-processing is done, we still need to delay a row for the
loop filter, thanks to the strange filtering order VP3 chose.*/
else if(pipe->loop_filter){
- sdelay+=stripe->notstart;
- edelay+=stripe->notdone;
+ sdelay+=pipe->notstart;
+ edelay+=pipe->notdone;
}
- /*Compute the intersection of the available rows in all planes.
- If chroma is sub-sampled, the effect of each of its delays is
- doubled, but luma might have more post-processing filters enabled
- than chroma, so we don't know up front which one is the limiting
- factor.*/
- pthread_mutex_lock(stripe->lock);
- *stripe->avail_fragy0=OC_MINI(*stripe->avail_fragy0,
- pipe->fragy0[pli]-sdelay<<frag_shift);
- *stripe->avail_fragy_end=OC_MINI(*stripe->avail_fragy_end,
- pipe->fragy_end[pli]-edelay<<frag_shift);
- pthread_mutex_unlock(stripe->lock);
+ /*Store the range of fragments available after processing this stripe.*/
+ _pplane->avail_fragy0=pipe->fragy0[pli]-sdelay<<frag_shift;
+ _pplane->avail_fragy_end=pipe->fragy_end[pli]-edelay<<frag_shift;
+}
- return (void*)NULL;
+/*The main loop for our auxiliary decoder threads.*/
+static void *oc_pipe_advance_loop(void *_arg){
+ oc_dec_pipeline_plane *pplane;
+ oc_dec_ctx *dec;
+ int pli;
+ pplane=(oc_dec_pipeline_plane *)_arg;
+ dec=pplane->dec;
+ pli=pplane->pli;
+ for(;;){
+ pthread_mutex_lock(&dec->pipe_lock);
+ while(pplane->done==1){
+ pthread_cond_wait(dec->pipe_cond+pli-1,&dec->pipe_lock);
+ }
+ if(pplane->done<0)break;
+ pthread_mutex_unlock(&dec->pipe_lock);
+ oc_pipe_advance(pplane);
+ pthread_mutex_lock(&dec->pipe_lock);
+ pplane->done=1;
+ pthread_cond_signal(dec->pipe_cond+pli-1);
+ pthread_mutex_unlock(&dec->pipe_lock);
+ }
+ /*Whenever we break, we're still holding the lock, so release it.*/
+ pthread_mutex_unlock(&dec->pipe_lock);
+ return NULL;
}
int theora_decode_packetin(theora_dec_ctx *_dec,const ogg_packet *_op,
@@ -1977,13 +1992,11 @@
like an inter frame with no coded blocks.
Only proceed if we have a non-empty packet.*/
if(_op->bytes!=0){
- oc_dec_pipeline_state pipe;
- theora_ycbcr_buffer stripe_buf;
- int stripe_fragy;
- int refi;
- int pli;
- int notstart;
- int notdone;
+ oc_dec_pipeline_state *pipe;
+ theora_ycbcr_buffer stripe_buf;
+ int stripe_fragy;
+ int refi;
+ int pli;
oggpackB_readinit(&_dec->opb,_op->packet,_op->bytes);
ret=oc_dec_frame_header_unpack(_dec);
if(ret<0)return ret;
@@ -2060,38 +2073,42 @@
An application callback allows further application processing (blitting
to video memory, color conversion, etc.) to also use the data while it's
in cache.*/
- oc_dec_pipeline_init(_dec,&pipe);
+ pipe=&_dec->pipe;
+ oc_dec_pipeline_init(_dec,pipe);
oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
- notstart=0;
- notdone=1;
- for(stripe_fragy=notstart=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+ pipe->notstart=0;
+ pipe->notdone=1;
+ pipe->refi=refi;
+ for(stripe_fragy=0;pipe->notdone;stripe_fragy+=pipe->mcu_nvfrags){
int avail_fragy0;
int avail_fragy_end;
- pthread_t threads[3];
- pthread_mutex_t lock;
- oc_stripe stripes[3];
avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
- notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
- pthread_mutex_init(&lock, NULL);
- for(pli=3;pli-->0;){
- stripes[pli].dec = _dec;
- stripes[pli].pipe = &pipe;
- stripes[pli].refi = refi;
- stripes[pli].pli = pli;
- stripes[pli].avail_fragy0 = &avail_fragy0;
- stripes[pli].avail_fragy_end = &avail_fragy_end;
- stripes[pli].stripe_fragy = stripe_fragy;
- stripes[pli].notstart = notstart;
- stripes[pli].notdone = notdone;
- stripes[pli].lock = &lock;
- if(pli>0){
- pthread_create(&threads[pli], NULL, oc_pipe_advance,
- (void*)&stripes[pli]);
+ pipe->notdone=stripe_fragy+pipe->mcu_nvfrags<avail_fragy_end;
+ pipe->stripe_fragy=stripe_fragy;
+ pthread_mutex_lock(&_dec->pipe_lock);
+ for(pli=1;pli<3;pli++){
+ pipe->pplanes[pli].done=0;
+ pthread_cond_signal(_dec->pipe_cond+pli-1);
+ }
+ pthread_mutex_unlock(&_dec->pipe_lock);
+ oc_pipe_advance(pipe->pplanes+0);
+ pthread_mutex_lock(&_dec->pipe_lock);
+ for(pli=1;pli<3;pli++){
+ while(pipe->pplanes[pli].done==0){
+ pthread_cond_wait(_dec->pipe_cond+pli-1,&_dec->pipe_lock);
}
- else oc_pipe_advance(stripes+pli);
}
- for(pli=1;pli<3;pli++)
- pthread_join(threads[pli],NULL);
+ pthread_mutex_unlock(&_dec->pipe_lock);
+ /*Compute the intersection of the available rows in all planes.
+ If chroma is sub-sampled, the effect of each of its delays is
+ doubled, but luma might have more post-processing filters enabled
+ than chroma, so we don't know up front which one is the limiting
+ factor.*/
+ for(pli=0;pli<3;pli++){
+ avail_fragy0=OC_MINI(avail_fragy0,pipe->pplanes[pli].avail_fragy0);
+ avail_fragy_end=OC_MINI(avail_fragy_end,
+ pipe->pplanes[pli].avail_fragy_end);
+ }
if(_dec->stripe_cb.stripe_decoded!=NULL){
/*Make the callback, ensuring we flip the sense of the "start" and
"end" of the available region upside down.*/
@@ -2099,7 +2116,7 @@
_dec->state.fplanes[0].nvfrags-avail_fragy_end,
_dec->state.fplanes[0].nvfrags-avail_fragy0);
}
- notstart=1;
+ pipe->notstart=1;
}
/*Finish filling in the reference frame borders.*/
for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
More information about the commits
mailing list