[xiph-commits] r16361 - in branches/theora-gumboot: examples include/theora lib lib/dec lib/dec/x86 lib/dec/x86_vc lib/enc

Wed Jul 29 07:50:10 PDT 2009

Author: gumboot
Date: 2009-07-29 07:50:10 -0700 (Wed, 29 Jul 2009)
New Revision: 16361

Modified:
   branches/theora-gumboot/examples/encoder_example.c
   branches/theora-gumboot/include/theora/codec.h
   branches/theora-gumboot/include/theora/theoraenc.h
   branches/theora-gumboot/lib/dec/decode.c
   branches/theora-gumboot/lib/dec/fragment.c
   branches/theora-gumboot/lib/dec/idct.c
   branches/theora-gumboot/lib/dec/internal.c
   branches/theora-gumboot/lib/dec/state.c
   branches/theora-gumboot/lib/dec/x86/mmxstate.c
   branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c
   branches/theora-gumboot/lib/enc/analyze.c
   branches/theora-gumboot/lib/enc/encint.h
   branches/theora-gumboot/lib/enc/encode.c
   branches/theora-gumboot/lib/enc/rate.c
   branches/theora-gumboot/lib/enc/tokenize.c
   branches/theora-gumboot/lib/internal.h
Log:
Integrate changes up to r16360 into theora-gumboot branch.



Modified: branches/theora-gumboot/examples/encoder_example.c
===================================================================

--- branches/theora-gumboot/examples/encoder_example.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/examples/encoder_example.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -61,7 +61,7 @@
 }
 #endif
 
-const char *optstring = "b:e:o:a:A:v:V:s:S:f:F:ck:";
+const char *optstring = "b:e:o:a:A:v:V:s:S:f:F:ck:d:\1\2\3\4";
 struct option options [] = {
   {"begin-time",required_argument,NULL,'b'},
   {"end-time",required_argument,NULL,'e'},
@@ -75,7 +75,12 @@
   {"framerate-numerator",required_argument,NULL,'f'},
   {"framerate-denominator",required_argument,NULL,'F'},
   {"vp3-compatible",no_argument,NULL,'c'},
+  {"soft-target",no_argument,NULL,'\1'},
   {"keyframe-freq",required_argument,NULL,'k'},
+  {"buf-delay",required_argument,NULL,'d'},
+  {"two-pass",no_argument,NULL,'\2'},
+  {"first-pass",required_argument,NULL,'\3'},
+  {"second-pass",required_argument,NULL,'\4'},
   {NULL,0,NULL,0}
 };
 
@@ -123,8 +128,9 @@
 y4m_convert_func y4m_convert=NULL;
 
 int video_r=-1;
-int video_q=48;
-ogg_uint32_t keyframe_frequency=64;
+int video_q=-1;
+ogg_uint32_t keyframe_frequency=0;
+int buf_delay=-1;
 
 long begin_sec=-1;
 long begin_usec=0;
@@ -143,6 +149,25 @@
           "                                  as -a gives higher quality for a given\n"
           "                                  bitrate.\n\n"
           "  -V --video-rate-target <n>      bitrate target for Theora video\n\n"
+          "     --soft-target                Use a large reservoir and treat the rate\n"
+          "                                  as a soft target; rate control is less\n"
+          "                                  strict but resulting quality is usually\n"
+          "                                  higher/smoother overall. Soft target also\n"
+          "                                  allows an optional -v setting to specify\n"
+          "                                  a minimum allowed quality.\n\n"
+          "     --two-pass                   Compress input using two-pass rate control\n"
+          "                                  This option requires that the input to the\n"
+          "                                  encoder is seekable and performs\n"
+          "                                  both passes automatically.\n\n"
+          "     --first-pass <filename>      Perform first-pass of a two-pass rate\n"
+          "                                  controlled encoding, saving pass data to\n"
+          "                                  <filename> for a later second pass\n\n"
+          "     --second-pass <filename>     Perform second-pass of a two-pass rate\n"
+          "                                  controlled encoding, reading first-pass\n"
+          "                                  data from <filename>.  The first pass\n"
+          "                                  data must come from a first encoding pass\n"
+          "                                  using identical input video to work\n"
+          "                                  properly.\n\n"
           "  -a --audio-quality <n>          Vorbis quality selector from -1 to 10\n"
           "                                  (-1 yields smallest files but lowest\n"
           "                                  fidelity; 10 yields highest fidelity\n"
@@ -163,6 +188,14 @@
           "                                  The frame rate nominator divided by this\n"
           "                                  determinates the frame rate in units per tick\n"
           "   -k --keyframe-freq <n>         Keyframe frequency\n"
+          "   -d --buf-delay <n>             Buffer delay (in frames). Longer delays\n"
+          "                                  allow smoother rate adaptation and provide\n"
+          "                                  better overall quality, but require more\n"
+          "                                  client side buffering and add latency. The\n"
+          "                                  default value is the keyframe interval for\n"
+          "                                  one-pass encoding (or somewhat larger if\n"
+          "                                  --soft-target is used) and infinite for\n"
+          "                                  two-pass encoding.\n"
           "   -b --begin-time <h:m:s.d>      Begin encoding at offset into input\n"
           "   -e --end-time <h:m:s.d>        End encoding at offset into input\n"
           "encoder_example accepts only uncompressed RIFF WAV format audio and\n"
@@ -963,145 +996,198 @@
   return audioflag;
 }
 
-int fetch_and_process_video(FILE *video,ogg_page *videopage,
-                            ogg_stream_state *to,
-                            th_enc_ctx *td,
-                            int videoflag){
-  /* You'll go to Hell for using static variables */
-  static ogg_int64_t         frames=0;
-  static int                 state=-1;
-  static unsigned char      *yuvframe[3];
-  static th_ycbcr_buffer     ycbcr;
-  ogg_packet                 op;
+static int                 frame_state=-1;
+static ogg_int64_t         frames=0;
+static unsigned char      *yuvframe[3];
+static th_ycbcr_buffer     ycbcr;
+
+int fetch_and_process_video_packet(FILE *video,FILE *twopass_file,int passno,
+ th_enc_ctx *td,ogg_packet *op){
+  int                        ret;
   int                        pic_sz;
   int                        frame_c_w;
   int                        frame_c_h;
   int                        c_w;
   int                        c_h;
   int                        c_sz;
-  ogg_int64_t                beginframe = (video_fps_n*begin_sec +
-                                           video_fps_n*begin_usec*.000001)/video_fps_d;
-  ogg_int64_t                endframe = (video_fps_n*end_sec +
-                                         video_fps_n*end_usec*.000001)/video_fps_d;
-
+  ogg_int64_t                beginframe;
+  ogg_int64_t                endframe;
+  spinnit();
+  beginframe=(video_fps_n*begin_sec+video_fps_n*begin_usec*.000001)/video_fps_d;
+  endframe=(video_fps_n*end_sec+video_fps_n*end_usec*.000001)/video_fps_d;
+  if(frame_state==-1){
+    /* initialize the double frame buffer */
+    yuvframe[0]=(unsigned char *)malloc(y4m_dst_buf_sz);
+    yuvframe[1]=(unsigned char *)malloc(y4m_dst_buf_sz);
+    yuvframe[2]=(unsigned char *)malloc(y4m_aux_buf_sz);
+    frame_state=0;
+  }
   pic_sz=pic_w*pic_h;
   frame_c_w=frame_w/dst_c_dec_h;
   frame_c_h=frame_h/dst_c_dec_v;
   c_w=(pic_w+dst_c_dec_h-1)/dst_c_dec_h;
   c_h=(pic_h+dst_c_dec_v-1)/dst_c_dec_v;
   c_sz=c_w*c_h;
+  /* read and process more video */
+  /* video strategy reads one frame ahead so we know when we're
+     at end of stream and can mark last video frame as such
+     (vorbis audio has to flush one frame past last video frame
+     due to overlap and thus doesn't need this extra work) */
 
-  if(state==-1){
-        /* initialize the double frame buffer */
-    yuvframe[0]=(unsigned char *)malloc(y4m_dst_buf_sz);
-    yuvframe[1]=(unsigned char *)malloc(y4m_dst_buf_sz);
-    yuvframe[2]=(unsigned char *)malloc(y4m_aux_buf_sz);
-
-    state=0;
+  /* have two frame buffers full (if possible) before
+     proceeding.  after first pass and until eos, one will
+     always be full when we get here */
+  for(;frame_state<2 && (frames<endframe || endframe<0);){
+    char c,frame[6];
+    int ret=fread(frame,1,6,video);
+    /* match and skip the frame header */
+    if(ret<6)break;
+    if(memcmp(frame,"FRAME",5)){
+      fprintf(stderr,"Loss of framing in YUV input data\n");
+      exit(1);
+    }
+    if(frame[5]!='\n'){
+      int j;
+      for(j=0;j<79;j++)
+        if(fread(&c,1,1,video)&&c=='\n')break;
+      if(j==79){
+        fprintf(stderr,"Error parsing YUV frame header\n");
+        exit(1);
+      }
+    }
+    /*Read the frame data that needs no conversion.*/
+    if(fread(yuvframe[frame_state],1,y4m_dst_buf_read_sz,video)!=
+     y4m_dst_buf_read_sz){
+      fprintf(stderr,"Error reading YUV frame data.\n");
+      exit(1);
+    }
+    /*Read the frame data that does need conversion.*/
+    if(fread(yuvframe[2],1,y4m_aux_buf_read_sz,video)!=y4m_aux_buf_read_sz){
+      fprintf(stderr,"Error reading YUV frame data.\n");
+      exit(1);
+    }
+    /*Now convert the just read frame.*/
+    (*y4m_convert)(yuvframe[frame_state],yuvframe[2]);
+    frames++;
+    if(frames>=beginframe)
+    frame_state++;
   }
-
-  /* is there a video page flushed?  If not, work until there is. */
-  while(!videoflag){
-    spinnit();
-
-    if(ogg_stream_pageout(to,videopage)>0) return 1;
-    if(ogg_stream_eos(to)) return 0;
-
-    {
-      /* read and process more video */
-      /* video strategy reads one frame ahead so we know when we're
-         at end of stream and can mark last video frame as such
-         (vorbis audio has to flush one frame past last video frame
-         due to overlap and thus doesn't need this extra work */
-
-      /* have two frame buffers full (if possible) before
-         proceeding.  after first pass and until eos, one will
-         always be full when we get here */
-
-      for(;state<2 && (frames<endframe || endframe<0);){
-        char c,frame[6];
-        int ret=fread(frame,1,6,video);
-
-        /* match and skip the frame header */
-        if(ret<6)break;
-        if(memcmp(frame,"FRAME",5)){
-          fprintf(stderr,"Loss of framing in YUV input data\n");
-          exit(1);
-        }
-        if(frame[5]!='\n'){
-          int j;
-          for(j=0;j<79;j++)
-            if(fread(&c,1,1,video)&&c=='\n')break;
-          if(j==79){
-            fprintf(stderr,"Error parsing YUV frame header\n");
-            exit(1);
-          }
-        }
-        /*Read the frame data that needs no conversion.*/
-        if(fread(yuvframe[state],1,y4m_dst_buf_read_sz,video)!=
-           y4m_dst_buf_read_sz){
-          fprintf(stderr,"Error reading YUV frame data.\n");
-          exit(1);
-        }
-        /*Read the frame data that does need conversion.*/
-        if(fread(yuvframe[2],1,y4m_aux_buf_read_sz,video)!=
-           y4m_aux_buf_read_sz){
-          fprintf(stderr,"Error reading YUV frame data.\n");
-          exit(1);
-        }
-        /*Now convert the just read frame.*/
-        (*y4m_convert)(yuvframe[state],yuvframe[2]);
-
-        frames++;
-        if(frames>=beginframe)
-          state++;
-
+  /* check to see if there are dupes to flush */
+  if(th_encode_packetout(td,frame_state<1,op)>0)return 1;
+  if(frame_state<1){
+    /* can't get here unless YUV4MPEG stream has no video */
+    fprintf(stderr,"Video input contains no frames.\n");
+    exit(1);
+  }
+  /* Theora is a one-frame-in,one-frame-out system; submit a frame
+     for compression and pull out the packet */
+  /* in two-pass mode's second pass, we need to submit first-pass data */
+  if(passno==2){
+    for(;;){
+      static unsigned char buffer[80];
+      static int buf_pos;
+      int bytes;
+      /*Ask the encoder how many bytes it would like.*/
+      bytes=th_encode_ctl(td,TH_ENCCTL_2PASS_IN,NULL,0);
+      if(bytes<0){
+        fprintf(stderr,"Error submitting pass data in second pass.\n");
+        exit(1);
       }
-
-      if(state<1){
-        /* can't get here unless YUV4MPEG stream has no video */
-        fprintf(stderr,"Video input contains no frames.\n");
+      /*If it's got enough, stop.*/
+      if(bytes==0)break;
+      /*Read in some more bytes, if necessary.*/
+      if(bytes>80-buf_pos)bytes=80-buf_pos;
+      if(bytes>0&&fread(buffer+buf_pos,1,bytes,twopass_file)<bytes){
+        fprintf(stderr,"Could not read frame data from two-pass data file!\n");
         exit(1);
       }
-
-      /* Theora is a one-frame-in,one-frame-out system; submit a frame
-         for compression and pull out the packet */
-
-      /*We submit the buffer to the library as if it were padded, but we do not
-         actually allocate space for the padding.
-        This is okay, because the library will never read data from the padded
-         region.
-        This is only currently true of the experimental encoder; do NOT do this
-         with the reference encoder.*/
-      ycbcr[0].width=frame_w;
-      ycbcr[0].height=frame_h;
-      ycbcr[0].stride=pic_w;
-      ycbcr[0].data=yuvframe[0]-pic_x-pic_y*pic_w;
-      ycbcr[1].width=frame_c_w;
-      ycbcr[1].height=frame_c_h;
-      ycbcr[1].stride=c_w;
-      ycbcr[1].data=yuvframe[0]+pic_sz-(pic_x/dst_c_dec_h)-
-       (pic_y/dst_c_dec_v)*c_w;
-      ycbcr[2].width=frame_c_w;
-      ycbcr[2].height=frame_c_h;
-      ycbcr[2].stride=c_w;
-      ycbcr[2].data=ycbcr[1].data+c_sz;
-
-      th_encode_ycbcr_in(td,ycbcr);
-
-      /* if there's only one frame, it's the last in the stream */
-      while(th_encode_packetout(td,state<2,&op)){
-        ogg_stream_packetin(to,&op);
+      /*And pass them off.*/
+      ret=th_encode_ctl(td,TH_ENCCTL_2PASS_IN,buffer,bytes);
+      if(ret<0){
+        fprintf(stderr,"Error submitting pass data in second pass.\n");
+        exit(1);
       }
+      /*If the encoder consumed the whole buffer, reset it.*/
+      if(ret>=bytes)buf_pos=0;
+      /*Otherwise remember how much it used.*/
+      else buf_pos+=ret;
+    }
+  }
+  /*We submit the buffer to the library as if it were padded, but we do not
+     actually allocate space for the padding.
+    This is okay, because the library will never read data from the padded
+     region.
+    This is only currently true of the experimental encoder; do NOT do this
+     with the reference encoder.*/
+  ycbcr[0].width=frame_w;
+  ycbcr[0].height=frame_h;
+  ycbcr[0].stride=pic_w;
+  ycbcr[0].data=yuvframe[0]-pic_x-pic_y*pic_w;
+  ycbcr[1].width=frame_c_w;
+  ycbcr[1].height=frame_c_h;
+  ycbcr[1].stride=c_w;
+  ycbcr[1].data=yuvframe[0]+pic_sz-(pic_x/dst_c_dec_h)-(pic_y/dst_c_dec_v)*c_w;
+  ycbcr[2].width=frame_c_w;
+  ycbcr[2].height=frame_c_h;
+  ycbcr[2].stride=c_w;
+  ycbcr[2].data=ycbcr[1].data+c_sz;
+  th_encode_ycbcr_in(td,ycbcr);
+  {
+    unsigned char *temp=yuvframe[0];
+    yuvframe[0]=yuvframe[1];
+    yuvframe[1]=temp;
+    frame_state--;
+  }
+  /* in two-pass mode's first pass we need to extract and save the pass data */
+  if(passno==1){
+    unsigned char *buffer;
+    int bytes = th_encode_ctl(td, TH_ENCCTL_2PASS_OUT, &buffer, sizeof(buffer));
+    if(bytes<0){
+      fprintf(stderr,"Could not read two-pass data from encoder.\n");
+      exit(1);
+    }
+    if(fwrite(buffer,1,bytes,twopass_file)<bytes){
+      fprintf(stderr,"Unable to write to two-pass data file.\n");
+      exit(1);
+    }
+    fflush(twopass_file);
+  }
+  /* if there was only one frame, it's the last in the stream */
+  ret = th_encode_packetout(td,frame_state<1,op);
+  if(passno==1 && frame_state<1){
+    /* need to read the final (summary) packet */
+    unsigned char *buffer;
+    int bytes = th_encode_ctl(td, TH_ENCCTL_2PASS_OUT, &buffer, sizeof(buffer));
+    if(bytes<0){
+      fprintf(stderr,"Could not read two-pass summary data from encoder.\n");
+      exit(1);
+    }
+    if(fseek(twopass_file,0,SEEK_SET)<0){
+      fprintf(stderr,"Unable to seek in two-pass data file.\n");
+      exit(1);
+    }
+    if(fwrite(buffer,1,bytes,twopass_file)<bytes){
+      fprintf(stderr,"Unable to write to two-pass data file.\n");
+      exit(1);
+    }
+    fflush(twopass_file);
+  }
+  return ret;
+}
 
-      {
-        unsigned char *temp=yuvframe[0];
-        yuvframe[0]=yuvframe[1];
-        yuvframe[1]=temp;
-        state--;
-      }
 
-    }
+int fetch_and_process_video(FILE *video,ogg_page *videopage,
+ ogg_stream_state *to,th_enc_ctx *td,FILE *twopass_file,int passno,
+ int videoflag){
+  ogg_packet op;
+  int ret;
+  /* is there a video page flushed?  If not, work until there is. */
+  while(!videoflag){
+    if(ogg_stream_pageout(to,videopage)>0) return 1;
+    if(ogg_stream_eos(to)) return 0;
+    ret=fetch_and_process_video_packet(video,twopass_file,passno,td,&op);
+    if(ret<=0)return 0;
+    ogg_stream_packetin(to,&op);
   }
   return videoflag;
 }
@@ -1137,6 +1223,7 @@
   int videoflag=0;
   int akbps=0;
   int vkbps=0;
+  int soft_target=0;
 
   ogg_int64_t audio_bytesout=0;
   ogg_int64_t video_bytesout=0;
@@ -1144,6 +1231,11 @@
 
   FILE *outfile = stdout;
 
+  FILE *twopass_file = NULL;
+  fpos_t video_rewind_pos;
+  int twopass=0;
+  int passno;
+
 #ifdef _WIN32 /* We need to set stdin/stdout to binary mode. Damn windows. */
   /* if we were reading/writing a file, it would also need to in
      binary mode, eg, fopen("file.wav","wb"); */
@@ -1178,7 +1270,6 @@
         fprintf(stderr,"Illegal video quality (choose 0 through 10)\n");
         exit(1);
       }
-      video_r=0;
       break;
 
     case 'A':
@@ -1196,9 +1287,12 @@
         fprintf(stderr,"Illegal video bitrate (choose > 0 please)\n");
         exit(1);
       }
-      video_q=0;
      break;
 
+    case '\1':
+      soft_target=1;
+      break;
+
     case 's':
       video_par_n=(int)rint(atof(optarg));
       break;
@@ -1227,6 +1321,14 @@
       }
       break;
 
+    case 'd':
+      buf_delay=atoi(optarg);
+      if(buf_delay<=0){
+        fprintf(stderr,"Illegal buffer delay\n");
+        exit(1);
+      }
+      break;
+
     case 'b':
       {
         char *pos=strchr(optarg,':');
@@ -1277,95 +1379,92 @@
         }
       }
       break;
+    case '\2':
+      twopass=3; /* perform both passes */
+      twopass_file=tmpfile();
+      if(!twopass_file){
+        fprintf(stderr,"Unable to open temporary file for twopass data\n");
+        exit(1);
+      }
+      break;
+    case '\3':
+      twopass=1; /* perform first pass */
+      twopass_file=fopen(optarg,"wb");
+      if(!twopass_file){
+        fprintf(stderr,"Unable to open \'%s\' for twopass data\n",optarg);
+        exit(1);
+      }
+      break;
+    case '\4':
+      twopass=2; /* perform second pass */
+      twopass_file=fopen(optarg,"rb");
+      if(!twopass_file){
+        fprintf(stderr,"Unable to open twopass data file \'%s\'",optarg);
+        exit(1);
+      }
+      break;
 
     default:
       usage();
     }
   }
 
+  if(soft_target){
+    if(video_r<=0){
+      fprintf(stderr,"Soft rate target (--soft-target) requested without a bitrate (-V).\n");
+      exit(1);
+    }
+    if(video_q==-1)
+      video_q=0;
+  }else{
+    if(video_r>0)
+      video_q=0;
+    if(video_q==-1)
+      video_q=48;
+  }
+
+  if(keyframe_frequency<=0){
+    /*Use a default keyframe frequency of 64 for 1-pass (streaming) mode, and
+       256 for two-pass mode.*/
+    keyframe_frequency=twopass?256:64;
+  }
+
   while(optind<argc){
     /* assume that anything following the options must be a filename */
     id_file(argv[optind]);
     optind++;
   }
 
-  /* yayness.  Set up Ogg output stream */
+  if(twopass==3){
+    /* verify that the input is seekable! */
+    if(video){
+      if(fseek(video,0,SEEK_CUR)){
+        fprintf(stderr,"--two-pass (automatic two-pass) requires the video input\n"
+                "to be seekable.  For non-seekable input, encoder_example\n"
+                "must be run twice, first with the --first-pass option, then\n"
+                "with the --second-pass option.\n\n");
+        exit(1);
+      }
+      if(fgetpos(video,&video_rewind_pos)<0){
+        fprintf(stderr,"Unable to determine start position of video data.\n");
+        exit(1);
+      }
+    }
+  }
+
+  /* Set up Ogg output stream */
   srand(time(NULL));
   if(audio)ogg_stream_init(&vo,rand());
   ogg_stream_init(&to,rand()); /* oops, add one ot the above */
 
-  /* Set up Theora encoder */
-  if(!video){
-    fprintf(stderr,"No video files submitted for compression?\n");
-    exit(1);
-  }
-  /* Theora has a divisible-by-sixteen restriction for the encoded frame size */
-  /* scale the picture size up to the nearest /16 and calculate offsets */
-  frame_w=pic_w+15&~0xF;
-  frame_h=pic_h+15&~0xF;
-  /*Force the offsets to be even so that chroma samples line up like we
-     expect.*/
-  pic_x=frame_w-pic_w>>1&~1;
-  pic_y=frame_h-pic_h>>1&~1;
-
-  th_info_init(&ti);
-  ti.frame_width=frame_w;
-  ti.frame_height=frame_h;
-  ti.pic_width=pic_w;
-  ti.pic_height=pic_h;
-  ti.pic_x=pic_x;
-  ti.pic_y=pic_y;
-  ti.fps_numerator=video_fps_n;
-  ti.fps_denominator=video_fps_d;
-  ti.aspect_numerator=video_par_n;
-  ti.aspect_denominator=video_par_d;
-  ti.colorspace=TH_CS_UNSPECIFIED;
-  /*Account for the Ogg page overhead.
-    This is 1 byte per 255 for lacing values, plus 26 bytes per 4096 bytes for
-     the page header, plus approximately 1/2 byte per packet (not accounted for
-     here).*/
-  ti.target_bitrate=(int)(64870*(ogg_int64_t)video_r>>16);
-  ti.quality=video_q;
-  ti.keyframe_granule_shift=ilog(keyframe_frequency-1);
-
-  if(dst_c_dec_h==2){
-    if(dst_c_dec_v==2)ti.pixel_fmt=TH_PF_420;
-    else ti.pixel_fmt=TH_PF_422;
-  }
-  else ti.pixel_fmt=TH_PF_444;
-
-  td=th_encode_alloc(&ti);
-  th_info_clear(&ti);
-
-  /* setting just the granule shift only allows power-of-two keyframe
-     spacing.  Set the actual requested spacing. */
-  ret=th_encode_ctl(td,TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,&keyframe_frequency,
-                    sizeof(keyframe_frequency-1));
-  if(ret<0){
-    fprintf(stderr,"Could not set keyframe interval to %d.\n",(int)keyframe_frequency);
-  }
-
-  if(vp3_compatible){
-    ret=th_encode_ctl(td,TH_ENCCTL_SET_VP3_COMPATIBLE,&vp3_compatible,
-     sizeof(vp3_compatible));
-    if(ret<0||!vp3_compatible){
-      fprintf(stderr,"Could not enable strict VP3 compatibility.\n");
-      if(ret>=0){
-        fprintf(stderr,"Ensure your source format is supported by VP3.\n");
-        fprintf(stderr,
-         "(4:2:0 pixel format, width and height multiples of 16).\n");
-      }
-    }
-  }
-
-  /* initialize Vorbis too, assuming we have audio to compress. */
-  if(audio){
+  /* initialize Vorbis assuming we have audio to compress. */
+  if(audio && twopass!=1){
     vorbis_info_init(&vi);
     if(audio_q>-99)
       ret = vorbis_encode_init_vbr(&vi,audio_ch,audio_hz,audio_q);
     else
       ret = vorbis_encode_init(&vi,audio_ch,audio_hz,-1,
-       (int)(64870*(ogg_int64_t)audio_r>>16),-1);
+                               (int)(64870*(ogg_int64_t)audio_r>>16),-1);
     if(ret){
       fprintf(stderr,"The Vorbis encoder could not set up a mode according to\n"
               "the requested quality or bitrate.\n\n");
@@ -1377,153 +1476,279 @@
     vorbis_block_init(&vd,&vb);
   }
 
-  /* write the bitstream header packets with proper page interleave */
-
-  th_comment_init(&tc);
-
-  /* first packet will get its own page automatically */
-  if(th_encode_flushheader(td,&tc,&op)<=0){
-    fprintf(stderr,"Internal Theora library error.\n");
-    exit(1);
-  }
-  ogg_stream_packetin(&to,&op);
-  if(ogg_stream_pageout(&to,&og)!=1){
-    fprintf(stderr,"Internal Ogg library error.\n");
-    exit(1);
-  }
-  fwrite(og.header,1,og.header_len,outfile);
-  fwrite(og.body,1,og.body_len,outfile);
-
-  /* create the remaining theora headers */
-  for(;;){
-    ret=th_encode_flushheader(td,&tc,&op);
+  for(passno=(twopass==3?1:twopass);passno<=(twopass==3?2:twopass);passno++){
+    /* Set up Theora encoder */
+    if(!video){
+      fprintf(stderr,"No video files submitted for compression?\n");
+      exit(1);
+    }
+    /* Theora has a divisible-by-sixteen restriction for the encoded frame size */
+    /* scale the picture size up to the nearest /16 and calculate offsets */
+    frame_w=pic_w+15&~0xF;
+    frame_h=pic_h+15&~0xF;
+    /*Force the offsets to be even so that chroma samples line up like we
+       expect.*/
+    pic_x=frame_w-pic_w>>1&~1;
+    pic_y=frame_h-pic_h>>1&~1;
+    th_info_init(&ti);
+    ti.frame_width=frame_w;
+    ti.frame_height=frame_h;
+    ti.pic_width=pic_w;
+    ti.pic_height=pic_h;
+    ti.pic_x=pic_x;
+    ti.pic_y=pic_y;
+    ti.fps_numerator=video_fps_n;
+    ti.fps_denominator=video_fps_d;
+    ti.aspect_numerator=video_par_n;
+    ti.aspect_denominator=video_par_d;
+    ti.colorspace=TH_CS_UNSPECIFIED;
+    /*Account for the Ogg page overhead.
+      This is 1 byte per 255 for lacing values, plus 26 bytes per 4096 bytes for
+       the page header, plus approximately 1/2 byte per packet (not accounted for
+       here).*/
+    ti.target_bitrate=(int)(64870*(ogg_int64_t)video_r>>16);
+    ti.quality=video_q;
+    ti.keyframe_granule_shift=ilog(keyframe_frequency-1);
+    if(dst_c_dec_h==2){
+      if(dst_c_dec_v==2)ti.pixel_fmt=TH_PF_420;
+      else ti.pixel_fmt=TH_PF_422;
+    }
+    else ti.pixel_fmt=TH_PF_444;
+    td=th_encode_alloc(&ti);
+    th_info_clear(&ti);
+    /* setting just the granule shift only allows power-of-two keyframe
+       spacing.  Set the actual requested spacing. */
+    ret=th_encode_ctl(td,TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
+     &keyframe_frequency,sizeof(keyframe_frequency-1));
     if(ret<0){
+      fprintf(stderr,"Could not set keyframe interval to %d.\n",(int)keyframe_frequency);
+    }
+    if(vp3_compatible){
+      ret=th_encode_ctl(td,TH_ENCCTL_SET_VP3_COMPATIBLE,&vp3_compatible,
+       sizeof(vp3_compatible));
+      if(ret<0||!vp3_compatible){
+        fprintf(stderr,"Could not enable strict VP3 compatibility.\n");
+        if(ret>=0){
+          fprintf(stderr,"Ensure your source format is supported by VP3.\n");
+          fprintf(stderr,
+           "(4:2:0 pixel format, width and height multiples of 16).\n");
+        }
+      }
+    }
+    if(soft_target){
+      /* reverse the rate control flags to favor a 'long time' strategy */
+      int arg = TH_RATECTL_CAP_UNDERFLOW;
+      ret=th_encode_ctl(td,TH_ENCCTL_SET_RATE_FLAGS,&arg,sizeof(arg));
+      if(ret<0)
+        fprintf(stderr,"Could not set encoder flags for --soft-target\n");
+      /* Default buffer control is overridden on two-pass */
+      if(!twopass&&buf_delay<0){
+        if((keyframe_frequency*7>>1) > 5*video_fps_n/video_fps_d)
+          arg=keyframe_frequency*7>>1;
+        else
+          arg=30*video_fps_n/video_fps_d;
+        ret=th_encode_ctl(td,TH_ENCCTL_SET_RATE_BUFFER,&arg,sizeof(arg));
+        if(ret<0)
+          fprintf(stderr,"Could not set rate control buffer for --soft-target\n");
+      }
+    }
+    /* set up two-pass if needed */
+    if(passno==1){
+      unsigned char *buffer;
+      int bytes;
+      bytes=th_encode_ctl(td,TH_ENCCTL_2PASS_OUT,&buffer,sizeof(buffer));
+      if(bytes<0){
+        fprintf(stderr,"Could not set up the first pass of two-pass mode.\n");
+        fprintf(stderr,"Did you remember to specify an estimated bitrate?\n");
+        exit(1);
+      }
+      /*Perform a seek test to ensure we can overwrite this placeholder data at
+         the end; this is better than letting the user sit through a whole
+         encode only to find out their pass 1 file is useless at the end.*/
+      if(fseek(twopass_file,0,SEEK_SET)<0){
+        fprintf(stderr,"Unable to seek in two-pass data file.\n");
+        exit(1);
+      }
+      if(fwrite(buffer,1,bytes,twopass_file)<bytes){
+        fprintf(stderr,"Unable to write to two-pass data file.\n");
+        exit(1);
+      }
+      fflush(twopass_file);
+    }
+    if(passno==2){
+      /* enable second pass here, actual data feeding comes later */
+      if(th_encode_ctl(td,TH_ENCCTL_2PASS_IN,NULL,0)<0){
+        fprintf(stderr,"Could not set up the second pass of two-pass mode.\n");
+        exit(1);
+      }
+      if(twopass==3){
+        /* 'automatic' second pass */
+        if(fsetpos(video,&video_rewind_pos)<0){
+          fprintf(stderr,"Could not rewind video input file for second pass!\n");
+          exit(1);
+        }
+        if(fseek(twopass_file,0,SEEK_SET)<0){
+          fprintf(stderr,"Unable to seek in two-pass data file.\n");
+          exit(1);
+        }
+        frame_state=0;
+        frames=0;
+      }
+    }
+    if(passno!=1&&buf_delay>=0){
+      ret=th_encode_ctl(td,TH_ENCCTL_SET_RATE_BUFFER,
+       &buf_delay,sizeof(buf_delay));
+      if(ret<0){
+        fprintf(stderr,"Warning: could not set desired buffer delay.\n");
+      }
+    }
+    /* write the bitstream header packets with proper page interleave */
+    th_comment_init(&tc);
+    /* first packet will get its own page automatically */
+    if(th_encode_flushheader(td,&tc,&op)<=0){
       fprintf(stderr,"Internal Theora library error.\n");
       exit(1);
     }
-    else if(!ret)break;
-    ogg_stream_packetin(&to,&op);
-  }
-
-  if(audio){
-    ogg_packet header;
-    ogg_packet header_comm;
-    ogg_packet header_code;
-
-    vorbis_analysis_headerout(&vd,&vc,&header,&header_comm,&header_code);
-    ogg_stream_packetin(&vo,&header); /* automatically placed in its own
-                                         page */
-    if(ogg_stream_pageout(&vo,&og)!=1){
-      fprintf(stderr,"Internal Ogg library error.\n");
-      exit(1);
-    }
-    fwrite(og.header,1,og.header_len,outfile);
-    fwrite(og.body,1,og.body_len,outfile);
-
-    /* remaining vorbis header packets */
-    ogg_stream_packetin(&vo,&header_comm);
-    ogg_stream_packetin(&vo,&header_code);
-  }
-
-  /* Flush the rest of our headers. This ensures
-     the actual data in each stream will start
-     on a new page, as per spec. */
-  for(;;){
-    int result = ogg_stream_flush(&to,&og);
-      if(result<0){
-        /* can't get here */
+    if(passno!=1){
+      ogg_stream_packetin(&to,&op);
+      if(ogg_stream_pageout(&to,&og)!=1){
         fprintf(stderr,"Internal Ogg library error.\n");
         exit(1);
       }
-    if(result==0)break;
-    fwrite(og.header,1,og.header_len,outfile);
-    fwrite(og.body,1,og.body_len,outfile);
-  }
-  if(audio){
+      fwrite(og.header,1,og.header_len,outfile);
+      fwrite(og.body,1,og.body_len,outfile);
+    }
+    /* create the remaining theora headers */
     for(;;){
-      int result=ogg_stream_flush(&vo,&og);
-      if(result<0){
-        /* can't get here */
+      ret=th_encode_flushheader(td,&tc,&op);
+      if(ret<0){
+        fprintf(stderr,"Internal Theora library error.\n");
+        exit(1);
+      }
+      else if(!ret)break;
+      if(passno!=1)ogg_stream_packetin(&to,&op);
+    }
+    if(audio && passno!=1){
+      ogg_packet header;
+      ogg_packet header_comm;
+      ogg_packet header_code;
+      vorbis_analysis_headerout(&vd,&vc,&header,&header_comm,&header_code);
+      ogg_stream_packetin(&vo,&header); /* automatically placed in its own
+                                           page */
+      if(ogg_stream_pageout(&vo,&og)!=1){
         fprintf(stderr,"Internal Ogg library error.\n");
         exit(1);
       }
-      if(result==0)break;
       fwrite(og.header,1,og.header_len,outfile);
       fwrite(og.body,1,og.body_len,outfile);
+      /* remaining vorbis header packets */
+      ogg_stream_packetin(&vo,&header_comm);
+      ogg_stream_packetin(&vo,&header_code);
     }
-  }
-
-  /* setup complete.  Raw processing loop */
-  fprintf(stderr,"Compressing....\n");
-  for(;;){
-    ogg_page audiopage;
-    ogg_page videopage;
-
-    /* is there an audio page flushed?  If not, fetch one if possible */
-    audioflag=fetch_and_process_audio(audio,&audiopage,&vo,&vd,&vb,audioflag);
-
-    /* is there a video page flushed?  If not, fetch one if possible */
-    videoflag=fetch_and_process_video(video,&videopage,&to,td,videoflag);
-
-    /* no pages of either?  Must be end of stream. */
-    if(!audioflag && !videoflag)break;
-
-    /* which is earlier; the end of the audio page or the end of the
-       video page? Flush the earlier to stream */
-    {
+    /* Flush the rest of our headers. This ensures
+       the actual data in each stream will start
+       on a new page, as per spec. */
+    if(passno!=1){
+      for(;;){
+        int result = ogg_stream_flush(&to,&og);
+        if(result<0){
+          /* can't get here */
+          fprintf(stderr,"Internal Ogg library error.\n");
+          exit(1);
+        }
+        if(result==0)break;
+        fwrite(og.header,1,og.header_len,outfile);
+        fwrite(og.body,1,og.body_len,outfile);
+      }
+    }
+    if(audio && passno!=1){
+      for(;;){
+        int result=ogg_stream_flush(&vo,&og);
+        if(result<0){
+          /* can't get here */
+          fprintf(stderr,"Internal Ogg library error.\n");
+          exit(1);
+        }
+        if(result==0)break;
+        fwrite(og.header,1,og.header_len,outfile);
+        fwrite(og.body,1,og.body_len,outfile);
+      }
+    }
+    /* setup complete.  Raw processing loop */
+      switch(passno){
+      case 0: case 2:
+        fprintf(stderr,"\rCompressing....                                          \n");
+        break;
+      case 1:
+        fprintf(stderr,"\rScanning first pass....                                  \n");
+        break;
+      }
+    for(;;){
       int audio_or_video=-1;
-      double audiotime=
+      if(passno==1){
+        ogg_packet op;
+        int ret=fetch_and_process_video_packet(video,twopass_file,passno,td,&op);
+        if(ret<0)break;
+        if(op.e_o_s)break; /* end of stream */
+        timebase=th_granule_time(td,op.granulepos);
+        audio_or_video=1;
+      }else{
+        double audiotime;
+        double videotime;
+        ogg_page audiopage;
+        ogg_page videopage;
+        /* is there an audio page flushed?  If not, fetch one if possible */
+        audioflag=fetch_and_process_audio(audio,&audiopage,&vo,&vd,&vb,audioflag);
+        /* is there a video page flushed?  If not, fetch one if possible */
+        videoflag=fetch_and_process_video(video,&videopage,&to,td,twopass_file,passno,videoflag);
+        /* no pages of either?  Must be end of stream. */
+        if(!audioflag && !videoflag)break;
+        /* which is earlier; the end of the audio page or the end of the
+           video page? Flush the earlier to stream */
+        audiotime=
         audioflag?vorbis_granule_time(&vd,ogg_page_granulepos(&audiopage)):-1;
-      double videotime=
+        videotime=
         videoflag?th_granule_time(td,ogg_page_granulepos(&videopage)):-1;
-
-      if(!audioflag){
-        audio_or_video=1;
-      } else if(!videoflag) {
-        audio_or_video=0;
-      } else {
-        if(audiotime<videotime)
+        if(!audioflag){
+          audio_or_video=1;
+        } else if(!videoflag) {
           audio_or_video=0;
-        else
-          audio_or_video=1;
+        } else {
+          if(audiotime<videotime)
+            audio_or_video=0;
+          else
+            audio_or_video=1;
+        }
+        if(audio_or_video==1){
+          /* flush a video page */
+          video_bytesout+=fwrite(videopage.header,1,videopage.header_len,outfile);
+          video_bytesout+=fwrite(videopage.body,1,videopage.body_len,outfile);
+          videoflag=0;
+          timebase=videotime;
+        }else{
+          /* flush an audio page */
+          audio_bytesout+=fwrite(audiopage.header,1,audiopage.header_len,outfile);
+          audio_bytesout+=fwrite(audiopage.body,1,audiopage.body_len,outfile);
+          audioflag=0;
+          timebase=audiotime;
+        }
       }
-
-      if(audio_or_video==1){
-        /* flush a video page */
-        video_bytesout+=fwrite(videopage.header,1,videopage.header_len,outfile);
-        video_bytesout+=fwrite(videopage.body,1,videopage.body_len,outfile);
-        videoflag=0;
-        timebase=videotime;
-
-      }else{
-        /* flush an audio page */
-        audio_bytesout+=fwrite(audiopage.header,1,audiopage.header_len,outfile);
-        audio_bytesout+=fwrite(audiopage.body,1,audiopage.body_len,outfile);
-        audioflag=0;
-        timebase=audiotime;
-      }
-      if(timebase > 0)
-      {
+      if(timebase > 0){
         int hundredths=(int)(timebase*100-(long)timebase*100);
         int seconds=(long)timebase%60;
         int minutes=((long)timebase/60)%60;
         int hours=(long)timebase/3600;
-
-        if(audio_or_video)
-          vkbps=(int)rint(video_bytesout*8./timebase*.001);
-        else
-          akbps=(int)rint(audio_bytesout*8./timebase*.001);
-
+        if(audio_or_video)vkbps=(int)rint(video_bytesout*8./timebase*.001);
+        else akbps=(int)rint(audio_bytesout*8./timebase*.001);
         fprintf(stderr,
                 "\r      %d:%02d:%02d.%02d audio: %dkbps video: %dkbps                 ",
                 hours,minutes,seconds,hundredths,akbps,vkbps);
       }
     }
-
+    if(video)th_encode_free(td);
   }
 
   /* clear out state */
-
   if(audio){
     ogg_stream_clear(&vo);
     vorbis_block_clear(&vb);
@@ -1534,12 +1759,12 @@
   }
   if(video){
     ogg_stream_clear(&to);
-    th_encode_free(td);
     th_comment_clear(&tc);
     if(video!=stdin)fclose(video);
   }
 
   if(outfile && outfile!=stdout)fclose(outfile);
+  if(twopass_file)fclose(twopass_file);
 
   fprintf(stderr,"\r   \ndone.\n\n");
 

Modified: branches/theora-gumboot/include/theora/codec.h
===================================================================
--- branches/theora-gumboot/include/theora/codec.h	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/include/theora/codec.h	2009-07-29 14:50:10 UTC (rev 16361)
@@ -259,9 +259,6 @@
   /**The target bit-rate in bits per second.
      If initializing an encoder with this struct, set this field to a non-zero
       value to activate CBR encoding by default.*/
-  /*TODO: Current encoder does not support CBR mode, or anything like it.
-    We also don't really know what nominal rate each quality level
-     corresponds to yet.*/
   int           target_bitrate;
   /**The target quality level.
      Valid values range from 0 to 63, inclusive, with higher values giving

Modified: branches/theora-gumboot/include/theora/theoraenc.h
===================================================================
--- branches/theora-gumboot/include/theora/theoraenc.h	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/include/theora/theoraenc.h	2009-07-29 14:50:10 UTC (rev 16361)
@@ -85,8 +85,8 @@
 #define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
 /**Disables any encoder features that would prevent lossless transcoding back
  *  to VP3.
- * This primarily means disabling block-level QI values and not using 4MV mode
- *  when any of the luma blocks in a macro block are not coded.
+ * This primarily means disabling block-adaptive quantization and always coding
+ *  all four luma blocks in a macro block when 4MV is used.
  * It also includes using the VP3 quantization tables and Huffman codes; if you
  *  set them explicitly after calling this function, the resulting stream will
  *  not be VP3-compatible.
@@ -117,9 +117,9 @@
  *  may actually improve, but in this case bitrate will also likely increase.
  * In any case, overall rate/distortion performance will probably decrease.
  * The maximum value, and the meaning of each value, may change depending on
- *  the current encoding mode (VBR vs. CQI, etc.).
+ *  the current encoding mode (VBR vs. constant quality, etc.).
  *
- * \param[out] _buf int: The maximum encoding speed level.
+ * \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
  * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_IMPL   Not supported by this implementation in the current
@@ -128,8 +128,8 @@
 /**Sets the speed level.
  * By default, the slowest speed (0) is used.
  *
- * \param[in] _buf int: The new encoding speed level.
- *                      0 is slowest, larger values use less CPU.
+ * \param[in] _buf <tt>int</tt>: The new encoding speed level.
+ *                 0 is slowest, larger values use less CPU.
  * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
  *                    encoding speed level is out of bounds.
@@ -146,14 +146,14 @@
  * This control code tells the encoder to produce the specified number of extra
  *  duplicates of the next frame.
  * This allows the encoder to make smarter keyframe placement decisions and
- *  rate control decisions, as well as reduces CPU usage, when compared to just
- *  submitting the same frame for encoding multiple times.
+ *  rate control decisions, and reduces CPU usage as well, when compared to
+ *  just submitting the same frame for encoding multiple times.
  * This setting only applies to the next frame submitted for encoding.
  * You MUST call th_encode_packetout() repeatedly until it returns 0, or the
  *  extra duplicate frames will be lost.
  *
- * \param[in] _buf int: The number of duplicates to produce.
- *                      Unless this is positive, no duplicates will be produced.
+ * \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
+ *                 If this is negative or zero, no duplicates will be produced.
  * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
  *                    number of duplicates is greater than or equal to the
@@ -165,10 +165,156 @@
  * \retval TH_IMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_SET_DUP_COUNT (18)
+/**Modifies the default bitrate management behavior.
+ * Use to allow or disallow frame dropping, and to enable or disable capping
+ *  bit reservoir overflows and underflows.
+ * See \ref ratectlflags "the list of available flags".
+ * The flags are set by default to
+ *  <tt>#TH_RATECTL_DROP_FRAMES|#TH_RATECTL_CAP_OVERFLOW</tt>.
+ *
+ * \param[in] _buf <tt>int</tt>: Any combination of
+ *                  \ref ratectlflags "the available flags":
+ *                 - #TH_RATECTL_DROP_FRAMES: Enable frame dropping.
+ *                 - #TH_RATECTL_CAP_OVERFLOW: Don't bank excess bits for later
+ *                    use.
+ *                 - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
+ *                    later.
+ * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
+ *                    is not enabled.
+ * \retval TH_IMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_RATE_FLAGS (20)
+/**Sets the size of the bitrate management bit reservoir as a function
+ *  of number of frames.
+ * The reservoir size affects how quickly bitrate management reacts to
+ *  instantaneous changes in the video complexity.
+ * Larger reservoirs react more slowly, and provide better overall quality, but
+ *  require more buffering by a client, adding more latency to live streams.
+ * By default, libtheora sets the reservoir to the maximum distance between
+ *  keyframes, subject to a minimum and maximum limit.
+ * This call may be used to increase or decrease the reservoir, increasing or
+ *  decreasing the allowed temporary variance in bitrate.
+ * An implementation may impose some limits on the size of a reservoir it can
+ *  handle, in which case the actual reservoir size may not be exactly what was
+ *  requested.
+ * The actual value set will be returned.
+ *
+ * \param[in]  _buf <tt>int</tt>: Requested size of the reservoir measured in
+ *                   frames.
+ * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
+ * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
+ *                    is not enabled.  The buffer has an implementation
+ *                    defined minimum and maximum size and the value in _buf
+ *                    will be adjusted to match the actual value set.
+ * \retval TH_IMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_RATE_BUFFER (22)
+/**Enable pass 1 of two-pass encoding mode and retrieve the first pass metrics.
+ * Pass 1 mode must be enabled before the first frame is encoded, and a target
+ *  bitrate must have already been specified to the encoder.
+ * Although this does not have to be the exact rate that will be used in the
+ *  second pass, closer values may produce better results.
+ * The first call returns the size of the two-pass header data, along with some
+ *  placeholder content.
+ * This first call also sets the encoder to pass 1 mode implicitly.
+ * Then, a subsequent call must be made after each call to
+ *  th_encode_ycbcr_in() to retrieve the metrics for that frame.
+ * An additional, final call must be made to retrieve the summary data,
+ *  containing such information as the total number of frames, etc.
+ * This must be stored in place of the placeholder data that was returned
+ *  in the first call, before the frame metrics data.
+ * All of this data must be presented back to the encoder during pass 2 using
+ *  #TH_ENCCTL_2PASS_IN.
+ *
+ * \param[out] _buf <tt>char *</tt>: Returns a pointer to internal storage
+ *              containing the two pass metrics data.
+ *             This storage is only valid until the next call, or until the
+ *              encoder context is freed, and must be copied by the
+ *              application.
+ * \retval >=0       The number of bytes of metric data available in the
+ *                    returned buffer.
+ * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
+ *                    bitrate has been set, or the first call was made after
+ *                    the first frame was submitted for encoding.
+ * \retval TH_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_2PASS_OUT (24)
+/**Submits two-pass encoding metric data collected during the first encoding
+ *  pass to the second pass.
+ * The first call must be made before the first frame is encoded, and sets the
+ *  encoder to pass 2 mode implicitly.
+ * The encoder may require reading data from some or all of the frames in
+ *  advance, depending on, e.g., the reservoir size used in the second pass.
+ * You must call this function repeatedly before each frame to provide data
+ *  until either a) it fails to consume all of the data presented or b) all of
+ *  the pass 1 data has been consumed.
+ * In the first case, you must save the remaining data to be presented after
+ *  the next frame.
+ * You can call this function with a NULL argument to get an upper bound on
+ *  the number of bytes that will be required before the next frame.
+ *
+ * When pass 2 is first enabled, the default bit reservoir is set to the entire
+ *  file; this gives maximum flexibility but can lead to very high peak rates.
+ * You can subsequently set it to another value with #TH_ENCCTL_SET_RATE_BUFFER
+ *  (e.g., to set it to the keyframe interval for non-live streaming), however,
+ *  you may then need to provide more data before the next frame.
+ *
+ * \param[in] _buf <tt>char[]</tt>: A buffer containing the data returned by
+ *                  #TH_ENCCTL_2PASS_OUT in pass 1.
+ *                 You may pass <tt>NULL</tt> for \a _buf to return an upper
+ *                  bound on the number of additional bytes needed before the
+ *                  next frame.
+ *                 The summary data returned at the end of pass 1 must be at
+ *                  the head of the buffer on the first call with a
+ *                  non-<tt>NULL</tt> \a _buf, and the placeholder data
+ *                  returned at the start of pass 1 should be omitted.
+ *                 After each call you should advance this buffer by the number
+ *                  of bytes consumed.
+ * \retval >0            The number of bytes of metric data required/consumed.
+ * \retval 0             No more data is required before the next frame.
+ * \retval TH_EFAULT     \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EINVAL     The first call was made after the first frame was
+ *                        submitted for encoding.
+ * \retval TH_ENOTFORMAT The data did not appear to be pass 1 data from a
+ *                        compatible implementation of this library.
+ * \retval TH_EBADHEADER The data was invalid; this may be returned when
+ *                        attempting to read an aborted pass 1 file that still
+ *                        has the placeholder data in place of the summary
+ *                        data.
+ * \retval TH_IMPL       Not supported by this implementation.*/
+#define TH_ENCCTL_2PASS_IN (26)
 /*@}*/
 
 
+/**\name TH_ENCCTL_SET_RATE_FLAGS flags
+ * \anchor ratectlflags
+ * These are the flags available for use with #TH_ENCCTL_SET_RATE_FLAGS.*/
+/*@{*/
+/**Drop frames to keep within bitrate buffer constraints.
+ * This can have a severe impact on quality, but is the only way to ensure that
+ *  bitrate targets are met at low rates during sudden bursts of activity.*/
+#define TH_RATECTL_DROP_FRAMES   (0x1)
+/**Ignore bitrate buffer overflows.
+ * If the encoder uses so few bits that the reservoir of available bits
+ *  overflows, ignore the excess.
+ * The encoder will not try to use these extra bits in future frames.
+ * At high rates this may cause the result to be undersized, but allows a
+ *  client to play the stream using a finite buffer; it should normally be
+ *  enabled.*/
+#define TH_RATECTL_CAP_OVERFLOW  (0x2)
+/**Ignore bitrate buffer underflows.
+ * If the encoder uses so many bits that the reservoir of available bits
+ *  underflows, ignore the deficit.
+ * The encoder will not try to make up these extra bits in future frames.
+ * At low rates this may cause the result to be oversized; it should normally
+ *  be disabled.*/
+#define TH_RATECTL_CAP_UNDERFLOW (0x4)
+/*@}*/
 
+
+
 /**The quantization parameters used by VP3.*/
 extern const th_quant_info TH_VP31_QUANT_INFO;
 

Modified: branches/theora-gumboot/lib/dec/decode.c
===================================================================
--- branches/theora-gumboot/lib/dec/decode.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/decode.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -1094,7 +1094,6 @@
       eobs=oc_dec_ac_coeff_unpack(_dec,zzi,huff_idxs,ntoks_left,eobs);
     }
   }
-
   /*TODO: eobs should be exactly zero, or 4096 or greater.
     The second case occurs when an EOB run of size zero is encountered, which
      gets treated as an infinite EOB run (where infinity is PTRDIFF_MAX).
@@ -1306,12 +1305,88 @@
   ncoded_fragis=0;
   fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
   for(fragy=fragy0;fragy<fragy_end;fragy++){
-    for(fragx=0;fragx<nhfrags;fragx++,fragi++){
-      if(!frags[fragi].coded)continue;
-      pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc+=
-       oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
-      ncoded_fragis++;
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          pred_last[ref]=frags[fragi].dc+=pred_last[ref];
+          ncoded_fragis++;
+        }
+      }
     }
+    else{
+      oc_fragment *u_frags;
+      int          l_ref;
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
+        if(frags[fragi].coded){
+          int pred;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0;
+              int p1;
+              int p2;
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          pred_last[ref]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=ref;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
   }
   _pipe->ncoded_fragis[_pli]=ncoded_fragis;
   /*Also save the number of uncoded fragments so we know how many to copy.*/

Modified: branches/theora-gumboot/lib/dec/fragment.c
===================================================================
--- branches/theora-gumboot/lib/dec/fragment.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/fragment.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -85,121 +85,3 @@
 }
 
 void oc_restore_fpu_c(void){}
-
-
-/*Computes the predicted DC value for the given fragment.
-  This requires that the fully decoded DC values be available for the left,
-   upper-left, upper, and upper-right fragments (if they exist).
-  _frag:      The fragment to predict the DC value for.
-  _fplane:    The fragment plane the fragment belongs to.
-  _x:         The x-coordinate of the fragment.
-  _y:         The y-coordinate of the fragment.
-  _pred_last: The last fully-decoded DC value for each predictor frame
-               (OC_FRAME_GOLD, OC_FRAME_PREV and OC_FRAME_SELF).
-              This should be initialized to 0's for the first fragment in each
-               color plane.
-  Return: The predicted DC value for this fragment.*/
-int oc_frag_pred_dc(const oc_fragment *_frag,
- const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]){
-  static const signed char   PRED_SCALE[16][4]={
-    /*0*/
-    {0,0,0,0},
-    /*OC_PL*/
-    {1,0,0,0},
-    /*OC_PUL*/
-    {1,0,0,0},
-    /*OC_PL|OC_PUL*/
-    {1,0,0,0},
-    /*OC_PU*/
-    {1,0,0,0},
-    /*OC_PL|OC_PU*/
-    {1,1,0,0},
-    /*OC_PUL|OC_PU*/
-    {0,1,0,0},
-    /*OC_PL|OC_PUL|PC_PU*/
-    {29,-26,29,0},
-    /*OC_PUR*/
-    {1,0,0,0},
-    /*OC_PL|OC_PUR*/
-    {75,53,0,0},
-    /*OC_PUL|OC_PUR*/
-    {1,1,0,0},
-    /*OC_PL|OC_PUL|OC_PUR*/
-    {75,0,53,0},
-    /*OC_PU|OC_PUR*/
-    {1,0,0,0},
-    /*OC_PL|OC_PU|OC_PUR*/
-    {75,0,53,0},
-    /*OC_PUL|OC_PU|OC_PUR*/
-    {3,10,3,0},
-    /*OC_PL|OC_PUL|OC_PU|OC_PUR*/
-    {29,-26,29,0}
-  };
-  static const unsigned char PRED_SHIFT[16]={0,0,0,0,0,1,0,5,0,7,1,7,0,7,4,5};
-  static const unsigned char PRED_RMASK[16]={
-    0,0,0,0,0,1,0,31,0,127,1,127,0,127,15,31
-  };
-  static const unsigned char BC_MASK[8]={
-    /*No boundary condition.*/
-    OC_PL|OC_PUL|OC_PU|OC_PUR,
-    /*Left column.*/
-    OC_PU|OC_PUR,
-    /*Top row.*/
-    OC_PL,
-    /*Top row, left column.*/
-    0,
-    /*Right column.*/
-    OC_PL|OC_PUL|OC_PU,
-    /*Right and left column.*/
-    OC_PU,
-    /*Top row, right column.*/
-    OC_PL,
-    /*Top row, right and left column.*/
-    0
-  };
-  /*Predictor fragments, left, up-left, up, up-right.*/
-  const oc_fragment *predfr[4];
-  /*The frame used for prediction for this fragment.*/
-  int                pred_frame;
-  /*The boundary condition flags.*/
-  int                bc;
-  /*DC predictor values: left, up-left, up, up-right, missing values skipped.*/
-  int                p[4];
-  /*Predictor count.*/
-  int                np;
-  /*Which predictor constants to use.*/
-  int                pflags;
-  /*The predicted DC value.*/
-  int                ret;
-  int                i;
-  pred_frame=OC_FRAME_FOR_MODE[_frag->mb_mode];
-  bc=(_x==0)+((_y==0)<<1)+((_x+1==_fplane->nhfrags)<<2);
-  predfr[0]=_frag-1;
-  predfr[1]=_frag-_fplane->nhfrags-1;
-  predfr[2]=predfr[1]+1;
-  predfr[3]=predfr[2]+1;
-  np=0;
-  pflags=0;
-  for(i=0;i<4;i++){
-    int pflag;
-    pflag=1<<i;
-    if((BC_MASK[bc]&pflag)&&predfr[i]->coded&&
-     OC_FRAME_FOR_MODE[predfr[i]->mb_mode]==pred_frame){
-      p[np++]=predfr[i]->dc;
-      pflags|=pflag;
-    }
-  }
-  if(pflags==0)return _pred_last[pred_frame];
-  else{
-    ret=PRED_SCALE[pflags][0]*p[0];
-    /*LOOP VECTORIZES.*/
-    for(i=1;i<np;i++)ret+=PRED_SCALE[pflags][i]*p[i];
-    ret=OC_DIV_POW2(ret,PRED_SHIFT[pflags],PRED_RMASK[pflags]);
-  }
-  if((pflags&(OC_PL|OC_PUL|OC_PU))==(OC_PL|OC_PUL|OC_PU)){
-    if(abs(ret-p[2])>128)ret=p[2];
-    else if(abs(ret-p[0])>128)ret=p[0];
-    else if(abs(ret-p[1])>128)ret=p[1];
-  }
-  return ret;
-}

Modified: branches/theora-gumboot/lib/dec/idct.c
===================================================================
--- branches/theora-gumboot/lib/dec/idct.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/idct.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -329,10 +329,7 @@
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Then perform the iDCT.*/
-  if(_last_zzi<3)
-    oc_idct8x8_3(_y,_y);
-  else if(_last_zzi<10)
-    oc_idct8x8_10(_y,_y);
-  else
-    oc_idct8x8_slow(_y,_y);
+  if(_last_zzi<3)oc_idct8x8_3(_y,_y);
+  else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
+  else oc_idct8x8_slow(_y,_y);
 }

Modified: branches/theora-gumboot/lib/dec/internal.c
===================================================================
--- branches/theora-gumboot/lib/dec/internal.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/internal.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -58,26 +58,6 @@
   35,36,48,49,57,58,62,63
 };
 
-/*The predictor frame to use for each macro block mode.*/
-const unsigned char OC_FRAME_FOR_MODE[8]={
-  /*OC_MODE_INTER_NOMV*/
-  OC_FRAME_PREV,
-  /*OC_MODE_INTRA*/
-  OC_FRAME_SELF,
-  /*OC_MODE_INTER_MV*/
-  OC_FRAME_PREV,
-  /*OC_MODE_INTER_MV_LAST*/
-  OC_FRAME_PREV,
-  /*OC_MODE_INTER_MV_LAST2*/
-  OC_FRAME_PREV,
-  /*OC_MODE_GOLDEN*/
-  OC_FRAME_GOLD,
-  /*OC_MODE_GOLDEN_MV*/
-  OC_FRAME_GOLD,
-  /*OC_MODE_INTER_MV_FOUR*/
-  OC_FRAME_PREV,
-};
-
 /*A map from physical macro block ordering to bitstream macro block
    ordering within a super block.*/
 const unsigned char OC_MB_MAP[2][2]={{0,3},{1,2}};

Modified: branches/theora-gumboot/lib/dec/state.c
===================================================================
--- branches/theora-gumboot/lib/dec/state.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/state.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -880,7 +880,8 @@
     for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
   }
   else{
-    _dct_coeffs[0]*=_dc_quant;
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
     oc_idct8x8(_state,_dct_coeffs,_last_zzi,_ncoefs);
   }
   /*Fill in the target buffer.*/
@@ -893,7 +894,7 @@
     const unsigned char *ref;
     int                  mvoffsets[2];
     ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){

Modified: branches/theora-gumboot/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxstate.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/x86/mmxstate.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -24,6 +24,19 @@
 
 #if defined(OC_X86_ASM)
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[64]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant){
@@ -70,7 +83,8 @@
     );
   }
   else{
-    _dct_coeffs[0]*=_dc_quant;
+    /*Dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
     oc_idct8x8_mmx(_dct_coeffs,_last_zzi,_ncoefs);
   }
   /*Fill in the target buffer.*/
@@ -83,7 +97,7 @@
     const unsigned char *ref;
     int                  mvoffsets[2];
     ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){

Modified: branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -24,6 +24,19 @@
 
 #if defined(OC_X86_ASM)
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[64]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant){
@@ -73,7 +86,8 @@
     }
   }
   else{
-    _dct_coeffs[0]*=_dc_quant;
+    /*Dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
     oc_idct8x8_mmx(_dct_coeffs,_last_zzi,_ncoefs);
   }
   /*Fill in the target buffer.*/
@@ -86,7 +100,7 @@
     const unsigned char *ref;
     int                  mvoffsets[2];
     ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){

Modified: branches/theora-gumboot/lib/enc/analyze.c
===================================================================
--- branches/theora-gumboot/lib/enc/analyze.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/enc/analyze.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function: mode selection code
-  last mod: $Id$
+  last mod: $Id:$
 
  ********************************************************************/
 #include <limits.h>
@@ -687,7 +687,7 @@
   }
   mb_mode=frags[_fragi].mb_mode;
   ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]+frag_offs;
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
   dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
    +frag_offs;
   /*Motion compensation:*/
@@ -742,7 +742,6 @@
   val+=dc_dequant+s^s;
   val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
   dc=OC_CLAMPI(-580,val,580);
-  data[0]=dc;
   nonzero=0;
   /*Quantize the AC coefficients:*/
   dequant=_pipe->dequant[_pli][qii][qti];
@@ -774,7 +773,7 @@
     TODO: nonzero may need to be adjusted after tokenization.*/
   if(nonzero==0){
     ogg_int16_t p;
-    int ci;
+    int         ci;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
     p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
@@ -1226,7 +1225,7 @@
 }
 
 /*Analysis stage for an INTRA frame.*/
-int oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
   oc_enc_pipeline_state   pipe;
   const unsigned char    *map_idxs;
   int                     nmap_idxs;
@@ -1304,7 +1303,6 @@
   refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
   for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
   _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
-  return 0;
 }
 
 
@@ -1668,7 +1666,7 @@
   ptrdiff_t              frag_offs;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[_mb_mode]]];
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
@@ -1851,7 +1849,7 @@
   oc_mode_set_cost(_modec,_enc->lambda);
 }
 
-int oc_enc_analyze(oc_enc_ctx *_enc,int _frame_type,int _recode){
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
   oc_set_chroma_mvs_func  set_chroma_mvs;
   oc_enc_pipeline_state   pipe;
   oc_qii_state            intra_luma_qs;
@@ -1882,13 +1880,12 @@
   unsigned                sbi_end;
   int                     refi;
   int                     pli;
-  if(_frame_type==OC_INTRA_FRAME)return oc_enc_analyze_intra(_enc,_recode);
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
-  _enc->state.frame_type=_frame_type;
+  _enc->state.frame_type=OC_INTER_FRAME;
   oc_mode_scheme_chooser_reset(&_enc->chooser);
   oc_enc_tokenize_start(_enc);
   oc_enc_pipeline_init(_enc,&pipe);
-  oc_qii_state_init(&intra_luma_qs);
+  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
   _enc->mv_bits[0]=_enc->mv_bits[1]=0;
   interbits=intrabits=0;
   last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
@@ -1956,12 +1953,14 @@
         }
         oc_mb_intra_satd(_enc,mbi,intra_satd);
         /*Estimate the cost of coding this MB in a keyframe.*/
-        oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
-         pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP);
-        intrabits+=modes[OC_MODE_INTRA].rate;
-        for(bi=0;bi<4;bi++){
-          oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
-           modes[OC_MODE_INTRA].qii[bi]);
+        if(_allow_keyframe){
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP);
+          intrabits+=modes[OC_MODE_INTRA].rate;
+          for(bi=0;bi<4;bi++){
+            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
+             modes[OC_MODE_INTRA].qii[bi]);
+          }
         }
         /*Estimate the cost in a delta frame for various modes.*/
         oc_skip_cost(_enc,&pipe,mbi,skip_ssd);
@@ -2174,7 +2173,7 @@
   for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
   /*Finish adding flagging overhead costs to inter bit counts to determine if
      we should have coded a key frame instead.*/
-  if(_enc->state.frame_type!=OC_INTRA_FRAME){
+  if(_allow_keyframe){
     if(interbits>intrabits)return 1;
     /*Technically the chroma plane counts are over-estimations, because they
        don't account for continuing runs from the luma planes, but the

Modified: branches/theora-gumboot/lib/enc/encint.h
===================================================================
--- branches/theora-gumboot/lib/enc/encint.h	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/enc/encint.h	2009-07-29 14:50:10 UTC (rev 16361)
@@ -34,6 +34,9 @@
 typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
 typedef struct oc_mb_enc_info         oc_mb_enc_info;
 typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_iir_filter          oc_iir_filter;
+typedef struct oc_log_linear_fit      oc_log_linear_fit;
+typedef struct oc_frame_metrics       oc_frame_metrics;
 typedef struct oc_rc_state            oc_rc_state;
 typedef struct th_enc_ctx             oc_enc_ctx;
 typedef struct oc_token_checkpoint    oc_token_checkpoint;
@@ -160,43 +163,129 @@
 
 
 
+/*A 2nd order low-pass Bessel follower.
+  We use this for rate control because it has fast reaction time, but is
+   critically damped.*/
+struct oc_iir_filter{
+  ogg_int32_t c[2];
+  ogg_int64_t g;
+  ogg_int32_t x[2];
+  ogg_int32_t y[2];
+};
+
+
+
+/*A linear fit for the log-domain scale factors used in 2-pass.*/
+struct oc_log_linear_fit{
+  ogg_int64_t  x;
+  ogg_int64_t  y;
+  ogg_int64_t  x2;
+  ogg_int64_t  xy;
+  ogg_uint32_t n;
+};
+
+
+
+/*The 2-pass metrics associated with a single frame.*/
+struct oc_frame_metrics{
+  ogg_int32_t   scale;
+  unsigned      dup_count:31;
+  unsigned      frame_type:1;
+};
+
+
+
 /*Rate control state information.*/
 struct oc_rc_state{
   /*The target average bits per frame.*/
-  ogg_int64_t  bits_per_frame;
+  ogg_int64_t        bits_per_frame;
   /*The current buffer fullness (bits available to be used).*/
-  ogg_int64_t  fullness;
+  ogg_int64_t        fullness;
   /*The target buffer fullness.
     This is where we'd like to be by the last keyframe the appears in the next
      buf_delay frames.*/
-  ogg_int64_t  target;
+  ogg_int64_t        target;
   /*The maximum buffer fullness (total size of the buffer).*/
-  ogg_int64_t  max;
+  ogg_int64_t        max;
   /*The log of the number of pixels in a frame in Q57 format.*/
-  ogg_int64_t  log_npixels;
+  ogg_int64_t        log_npixels;
   /*The exponent used in the rate model in Q8 format.*/
-  unsigned     exp[2];
+  unsigned           exp[2];
   /*The number of frames to distribute the buffer usage over.*/
-  int          buf_delay;
+  int                buf_delay;
   /*The total drop count from the previous frame.
     This includes duplicates explicitly requested via the
      TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
-  ogg_uint32_t prev_drop_count;
+  ogg_uint32_t       prev_drop_count;
   /*The log of an estimated scale factor used to obtain the real framerate, for
      VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
-  ogg_int64_t  log_drop_scale;
+  ogg_int64_t        log_drop_scale;
   /*The log of estimated scale factor for the rate model in Q57 format.*/
-  ogg_int64_t  log_scale[2];
+  ogg_int64_t        log_scale[2];
   /*The log of the target quantizer level in Q57 format.*/
-  ogg_int64_t  log_qtarget;
+  ogg_int64_t        log_qtarget;
+  /*Will we drop frames to meet bitrate target?*/
+  unsigned char      drop_frames;
+  /*Do we respect the maximum buffer fullness?*/
+  unsigned char      cap_overflow;
+  /*Can the reservoir go negative?*/
+  unsigned char      cap_underflow;
+  /*Second-order lowpass filters to track scale and VFR.*/
+  oc_iir_filter      scalefilter[2];
+  oc_iir_filter      vfrfilter;
+  /*Two-pass mode state.
+    0 => 1-pass encoding.
+    1 => 1st pass of 2-pass encoding.
+    2 => 2nd pass of 2-pass encoding.*/
+  int                twopass;
+  /*Buffer for current frame metrics.*/
+  unsigned char      twopass_buffer[48];
+  /*The number of bytes in the frame metrics buffer.
+    When 2-pass encoding is enabled, this is set to 0 after each frame is
+     submitted, and must be non-zero before the next frame will be accepted.*/
+  int                twopass_buffer_bytes;
+  int                twopass_buffer_fill;
+  /*Whether or not to force the next frame to be a keyframe.*/
+  unsigned char      twopass_force_kf;
+  /*The metrics for the previous frame.*/
+  oc_frame_metrics   prev_metrics;
+  /*The metrics for the current frame.*/
+  oc_frame_metrics   cur_metrics;
+  /*The buffered metrics for future frames.*/
+  oc_frame_metrics  *frame_metrics;
+  int                nframe_metrics;
+  int                cframe_metrics;
+  /*The index of the current frame in the circular metric buffer.*/
+  int                frame_metrics_head;
+  /*The frame count of each type (keyframes, delta frames, and dup frames);
+     32 bits limits us to 2.268 years at 60 fps.*/
+  ogg_uint32_t       frames_total[3];
+  /*The number of frames of each type yet to be processed.*/
+  ogg_uint32_t       frames_left[3];
+  /*The sum of the scale values for each frame type.*/
+  ogg_int64_t        scale_sum[2];
+  /*The start of the window over which the current scale sums are taken.*/
+  int                scale_window0;
+  /*The end of the window over which the current scale sums are taken.*/
+  int                scale_window_end;
+  /*The frame count of each type in the current 2-pass window; this does not
+     include dup frames.*/
+  int                nframes[3];
+  /*Bias correction fits for the 1st-pass scale factors.*/
+  oc_log_linear_fit  corr[2];
 };
 
 
+void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
+void oc_rc_state_clear(oc_rc_state *_rc);
+
+void oc_enc_rc_resize(oc_enc_ctx *_enc);
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
 void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
-void oc_rc_state_init(oc_rc_state *_rc,const oc_enc_ctx *_enc);
 int oc_enc_update_rc_state(oc_enc_ctx *_enc,
  long _bits,int _qti,int _qi,int _trial,int _droppable);
-int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
+int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
+int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
 
 
 
@@ -286,7 +375,8 @@
 };
 
 
-int oc_enc_analyze(oc_enc_ctx *_enc,int _frame_type,int _recode);
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
 #if defined(OC_COLLECT_METRICS)
 void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
 void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);

Modified: branches/theora-gumboot/lib/enc/encode.c
===================================================================
--- branches/theora-gumboot/lib/enc/encode.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/enc/encode.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -1099,12 +1099,13 @@
   oc_enc_set_quant_params(_enc,NULL);
   _enc->state.qis[0]=_enc->state.info.quality;
   _enc->state.nqis=1;
-  if(_enc->state.info.target_bitrate>0)oc_rc_state_init(&_enc->rc,_enc);
+  oc_rc_state_init(&_enc->rc,_enc);
   return 0;
 }
 
 static void oc_enc_clear(th_enc_ctx *_enc){
   int pli;
+  oc_rc_state_clear(&_enc->rc);
 #if defined(OC_COLLECT_METRICS)
   oc_enc_mode_metrics_dump(_enc);
 #endif
@@ -1141,7 +1142,7 @@
     _enc->state.nqis=1;
   }
   oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
-  oc_enc_analyze(_enc,OC_INTRA_FRAME,_recode);
+  oc_enc_analyze_intra(_enc,_recode);
   oc_enc_frame_pack(_enc);
   /*On the first frame, the previous call was an initial dry-run to prime
      feed-forward statistics.*/
@@ -1160,7 +1161,7 @@
     _enc->state.nqis=1;
   }
   oc_enc_calc_lambda(_enc,OC_INTER_FRAME);
-  if(oc_enc_analyze(_enc,OC_INTER_FRAME,_recode)){
+  if(oc_enc_analyze_inter(_enc,_enc->rc.twopass!=2,_recode)){
     /*Mode analysis thinks this should have been a keyframe; start over.*/
     oc_enc_compress_keyframe(_enc,1);
   }
@@ -1315,6 +1316,45 @@
       _enc->dup_count=OC_MAXI(dup_count,0);
       return 0;
     }break;
+    case TH_ENCCTL_SET_RATE_FLAGS:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.drop_frames=set&TH_RATECTL_DROP_FRAMES;
+      _enc->rc.cap_overflow=set&TH_RATECTL_CAP_OVERFLOW;
+      _enc->rc.cap_underflow=set&TH_RATECTL_CAP_UNDERFLOW;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_BUFFER:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.buf_delay=set;
+      oc_enc_rc_resize(_enc);
+      *(int *)_buf=_enc->rc.buf_delay;
+      return 0;
+    }break;
+    case TH_ENCCTL_2PASS_OUT:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=1||
+       _buf_sz!=sizeof(unsigned char *)){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_out(_enc,(unsigned char **)_buf);
+    }break;
+    case TH_ENCCTL_2PASS_IN:{
+      if(_enc==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=2){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
+    }break;
     default:return TH_EIMPL;
   }
 }
@@ -1404,7 +1444,6 @@
   }
 }
 
-#include<stdio.h>
 int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
   th_ycbcr_buffer img;
   int             cframe_width;
@@ -1421,6 +1460,7 @@
   /*Step 1: validate parameters.*/
   if(_enc==NULL||_img==NULL)return TH_EFAULT;
   if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
+  if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
   if((ogg_uint32_t)_img[0].width!=_enc->state.info.frame_width||
    (ogg_uint32_t)_img[0].height!=_enc->state.info.frame_height){
     return TH_EINVAL;
@@ -1472,7 +1512,7 @@
   /*Step 4: Compress the frame.*/
   /*Start with a keyframe, and don't allow the generation of invalid files that
      overflow the keyframe_granule_shift.*/
-  if(_enc->state.curframe_num==0||
+  if(_enc->rc.twopass_force_kf||_enc->state.curframe_num==0||
    _enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
    _enc->keyframe_frequency_force){
     oc_enc_compress_keyframe(_enc,0);
@@ -1507,8 +1547,15 @@
   if(_enc==NULL||_op==NULL)return TH_EFAULT;
   if(_enc->packet_state==OC_PACKET_READY){
     _enc->packet_state=OC_PACKET_EMPTY;
-    _op->packet=oggpackB_get_buffer(&_enc->opb);
-    _op->bytes=oggpackB_bytes(&_enc->opb);
+    /*For the first pass in 2-pass mode, don't emit any packet data.*/
+    if(_enc->rc.twopass==1){
+      _op->packet=NULL;
+      _op->bytes=0;
+    }
+    else{
+      _op->packet=oggpackB_get_buffer(&_enc->opb);
+      _op->bytes=oggpackB_bytes(&_enc->opb);
+    }
   }
   else if(_enc->packet_state==OC_PACKET_EMPTY){
     if(_enc->nqueued_dups>0){
@@ -1528,5 +1575,6 @@
   oc_enc_set_granpos(_enc);
   _op->packetno=th_granule_frame(_enc,_enc->state.granpos)+3;
   _op->granulepos=_enc->state.granpos;
+  if(_last_p)_enc->packet_state=OC_PACKET_DONE;
   return 1+_enc->nqueued_dups;
 }

Modified: branches/theora-gumboot/lib/enc/rate.c
===================================================================
--- branches/theora-gumboot/lib/enc/rate.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/enc/rate.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -18,11 +18,104 @@
 #include <string.h>
 #include "encint.h"
 
+/*A rough lookup table for tan(x), 0<=x<pi/2.
+  The values are Q12 fixed-point and spaced at 5 degree intervals.
+  These decisions are somewhat arbitrary, but sufficient for the 2nd order
+   Bessel follower below.
+  Values of x larger than 85 degrees are extrapolated from the last interval,
+   which is way off, but "good enough".*/
+static unsigned short OC_ROUGH_TAN_LOOKUP[18]={
+      0,  358,  722, 1098, 1491, 1910,
+   2365, 2868, 3437, 4096, 4881, 5850,
+   7094, 8784,11254,15286,23230,46817
+};
 
+/*_alpha is Q24 in the range [0,0.5).
+  The return value is 5.12.*/
+static int oc_warp_alpha(int _alpha){
+  int i;
+  int d;
+  int t0;
+  int t1;
+  i=_alpha*36>>24;
+  if(i>=17)i=16;
+  t0=OC_ROUGH_TAN_LOOKUP[i];
+  t1=OC_ROUGH_TAN_LOOKUP[i+1];
+  d=_alpha*36-(i<<24);
+  return (int)(((ogg_int64_t)t0<<32)+(t1-t0<<8)*(ogg_int64_t)d>>32);
+}
+
+/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay
+   and initial value.
+  _value is Q24.*/
+void oc_iir_filter_init(oc_iir_filter *_f,int _delay,ogg_int32_t _value){
+  int         alpha;
+  ogg_int64_t one48;
+  ogg_int64_t warp;
+  ogg_int64_t k1;
+  ogg_int64_t k2;
+  ogg_int64_t d;
+  ogg_int64_t a;
+  ogg_int64_t ik2;
+  ogg_int64_t b1;
+  ogg_int64_t b2;
+  /*This borrows some code from an unreleased version of Postfish.*/
+  /*alpha is Q24*/
+  alpha=(1<<24)/_delay;
+  one48=(ogg_int64_t)1<<48;
+  /*warp is 5.12*/
+  warp=oc_warp_alpha(alpha);
+  /*k1 is 6.12*/
+  k1=3*warp;
+  /*k2 is 10.24.*/
+  k2=k1*warp;
+  /*d is 11.24.*/
+  d=((1<<12)+k1<<12)+k2;
+  /*a is 34.24.*/
+  a=(k2<<24)/d;
+  /*ik2 is 25.24.*/
+  ik2=one48/k2;
+  /*b1 is Q48; in practice, the integer part is limited.*/
+  b1=2*a*(ik2-(1<<24));
+  /*b2 is Q48; in practice, the integer part is limited.*/
+  b2=one48-(4*a<<24)-b1;
+  /*All of the filter parameters are Q24.*/
+  _f->c[0]=(ogg_int32_t)(b1+(1<<23)>>24);
+  _f->c[1]=(ogg_int32_t)(b2+(1<<23)>>24);
+  _f->g=(ogg_int32_t)a;
+  _f->y[1]=_f->y[0]=_f->x[1]=_f->x[0]=_value;
+}
+
+static ogg_int64_t oc_iir_filter_update(oc_iir_filter *_f,int _x){
+  ogg_int64_t c0;
+  ogg_int64_t c1;
+  ogg_int64_t g;
+  ogg_int64_t x0;
+  ogg_int64_t x1;
+  ogg_int64_t y0;
+  ogg_int64_t y1;
+  ogg_int64_t ya;
+  c0=_f->c[0];
+  c1=_f->c[1];
+  g=_f->g;
+  x0=_f->x[0];
+  x1=_f->x[1];
+  y0=_f->y[0];
+  y1=_f->y[1];
+  ya=(_x+x0*2+x1)*g+y0*c0+y1*c1+(1<<23)>>24;
+  _f->x[1]=(ogg_int32_t)x0;
+  _f->x[0]=_x;
+  _f->y[1]=(ogg_int32_t)y0;
+  _f->y[0]=(ogg_int32_t)ya;
+  return ya;
+}
+
+
+
 /*Search for the quantizer that matches the target most closely.
   We don't assume a linear ordering, but when there are ties we pick the
    quantizer closest to the old one.*/
-int oc_enc_find_qi_for_target(oc_enc_ctx *_enc,int _qti,int _qi_old,
+static int oc_enc_find_qi_for_target(oc_enc_ctx *_enc,int _qti,int _qi_old,
  int _qi_min,ogg_int64_t _log_qtarget){
   ogg_int64_t best_qdiff;
   int         best_qi;
@@ -61,8 +154,11 @@
   /*If rate control is active, use the lambda for the _target_ quantizer.
     This allows us to scale to rates slightly lower than we'd normally be able
      to reach, and give the rate control a semblance of "fractional qi"
-     precision.*/
-  if(_enc->state.info.target_bitrate>0)lq=_enc->rc.log_qtarget;
+     precision.
+    TODO: Add API for changing QI, and allow extra precision.*/
+  if(_enc->state.info.target_bitrate>0&&_enc->rc.twopass!=1){
+    lq=_enc->rc.log_qtarget;
+  }
   else lq=_enc->log_qavg[_qti][qi];
   /*The resulting lambda value is less than 0x500000.*/
   _enc->lambda=(int)oc_bexp64(2*lq-0x4780BD468D6B62BLL);
@@ -91,141 +187,191 @@
      lq-(OC_Q57(6)+5)/10);
     if(qi1!=qi&&qi1!=_enc->state.qis[nqis-1])_enc->state.qis[nqis++]=qi1;
   }
-  /*printf("%i %.3f:",_qti,oc_bexp64(lq+OC_Q57(3))*0.125);
-  for(qi=0;qi<nqis;qi++)printf(" %2i",_enc->state.qis[qi]);
-  printf("\n");*/
   _enc->state.nqis=nqis;
 }
 
+/*Binary exponential of _log_scale with 24-bit fractional precision and
+   saturation.
+  _log_scale: A binary logarithm in Q57 format.
+  Return: The binary exponential in Q24 format, saturated to 2**31-1 if
+   _log_scale was too large.*/
+static ogg_int32_t oc_bexp_q24(ogg_int64_t _log_scale){
+  if(_log_scale<OC_Q57(8)){
+    ogg_int64_t ret;
+    ret=oc_bexp64(_log_scale+OC_Q57(24));
+    return ret<0x7FFFFFFF?(ogg_int32_t)ret:0x7FFFFFFF;
+  }
+  return 0x7FFFFFFF;
+}
 
 
-void oc_rc_state_init(oc_rc_state *_rc,const oc_enc_ctx *_enc){
+
+static void oc_enc_rc_reset(oc_enc_ctx *_enc){
   ogg_int64_t npixels;
   ogg_int64_t ibpp;
   /*TODO: These parameters should be exposed in a th_encode_ctl() API.*/
-  _rc->bits_per_frame=(_enc->state.info.target_bitrate*
+  _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
    (ogg_int64_t)_enc->state.info.fps_denominator)/
    _enc->state.info.fps_numerator;
   /*Insane framerates or frame sizes mean insane bitrates.
     Let's not get carried away.*/
-  if(_rc->bits_per_frame>0x400000000000LL){
-    _rc->bits_per_frame=(ogg_int64_t)0x400000000000LL;
+  if(_enc->rc.bits_per_frame>0x400000000000LL){
+    _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
   }
-  else if(_rc->bits_per_frame<32)_rc->bits_per_frame=32;
-  /*The buffer size is set equal to the keyframe interval, clamped to the range
-     [12,256] frames.
-    The 12 frame minimum gives us some chance to distribute bit estimation
-     errors.
-    The 256 frame maximum means we'll require 8-10 seconds of pre-buffering at
-     24-30 fps, which is not unreasonable.*/
-  _rc->buf_delay=_enc->keyframe_frequency_force>256?
-   256:_enc->keyframe_frequency_force;
-  _rc->buf_delay=OC_MAXI(_rc->buf_delay,12);
-  _rc->max=_rc->bits_per_frame*_rc->buf_delay;
-  /*Start with a buffer fullness of 75%.
-    We can require fully half the buffer for a keyframe, and so this initial
-     level gives us maximum flexibility for over/under-shooting in subsequent
-     frames.*/
-  _rc->target=_rc->fullness=(_rc->max+1>>1)+(_rc->max+2>>2);
+  else if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
+  _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
+  _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
+  /*Start with a buffer fullness of 50% plus 25% of the amount we plan to spend
+     on a single keyframe interval.
+    We can require fully half the bits in an interval for a keyframe, so this
+     initial level gives us maximum flexibility for over/under-shooting in
+     subsequent frames.*/
+  _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
+   OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
+  _enc->rc.fullness=_enc->rc.target;
   /*Pick exponents and initial scales for quantizer selection.*/
   npixels=_enc->state.info.frame_width*
    (ogg_int64_t)_enc->state.info.frame_height;
-  _rc->log_npixels=oc_blog64(npixels);
-  ibpp=npixels/_rc->bits_per_frame;
+  _enc->rc.log_npixels=oc_blog64(npixels);
+  ibpp=npixels/_enc->rc.bits_per_frame;
   if(ibpp<1){
-    _rc->exp[0]=59;
-    _rc->log_scale[0]=oc_blog64(1997)-OC_Q57(8);
+    _enc->rc.exp[0]=59;
+    _enc->rc.log_scale[0]=oc_blog64(1997)-OC_Q57(8);
   }
   else if(ibpp<2){
-    _rc->exp[0]=55;
-    _rc->log_scale[0]=oc_blog64(1604)-OC_Q57(8);
+    _enc->rc.exp[0]=55;
+    _enc->rc.log_scale[0]=oc_blog64(1604)-OC_Q57(8);
   }
   else{
-    _rc->exp[0]=48;
-    _rc->log_scale[0]=oc_blog64(834)-OC_Q57(8);
+    _enc->rc.exp[0]=48;
+    _enc->rc.log_scale[0]=oc_blog64(834)-OC_Q57(8);
   }
   if(ibpp<4){
-    _rc->exp[1]=100;
-    _rc->log_scale[1]=oc_blog64(2249)-OC_Q57(8);
+    _enc->rc.exp[1]=100;
+    _enc->rc.log_scale[1]=oc_blog64(2249)-OC_Q57(8);
   }
   else if(ibpp<8){
-    _rc->exp[1]=95;
-    _rc->log_scale[1]=oc_blog64(1751)-OC_Q57(8);
+    _enc->rc.exp[1]=95;
+    _enc->rc.log_scale[1]=oc_blog64(1751)-OC_Q57(8);
   }
   else{
-    _rc->exp[1]=73;
-    _rc->log_scale[1]=oc_blog64(1260)-OC_Q57(8);
+    _enc->rc.exp[1]=73;
+    _enc->rc.log_scale[1]=oc_blog64(1260)-OC_Q57(8);
   }
-  _rc->prev_drop_count=0;
-  _rc->log_drop_scale=OC_Q57(0);
+  _enc->rc.prev_drop_count=0;
+  _enc->rc.log_drop_scale=OC_Q57(0);
+  /*Set up second order followers, initialized according to corresponding
+     time constants.*/
+  oc_iir_filter_init(&_enc->rc.scalefilter[0],2,oc_bexp_q24(_enc->rc.log_scale[0]));
+  oc_iir_filter_init(&_enc->rc.scalefilter[1],_enc->rc.buf_delay>>1,
+   oc_bexp_q24(_enc->rc.log_scale[1]));
+  oc_iir_filter_init(&_enc->rc.vfrfilter,2,oc_bexp_q24(_enc->rc.log_drop_scale));
 }
 
-int oc_enc_update_rc_state(oc_enc_ctx *_enc,
- long _bits,int _qti,int _qi,int _trial,int _droppable){
-  /*Note, setting OC_SCALE_SMOOTHING[1] to 0x80 (0.5), which one might expect
-     to be a reasonable value, actually causes a feedback loop with, e.g., 12
-     fps content encoded at 24 fps; use values near 0 or near 1 for now.
-    TODO: Should probably revisit using an exponential moving average in the
-     first place at some point; dup tracking should help as well.*/
-  static const unsigned OC_SCALE_SMOOTHING[2]={0x13,0x00};
-  ogg_int64_t buf_delta;
-  int         dropped;
-  dropped=0;
-  buf_delta=_enc->rc.bits_per_frame*(1+_enc->dup_count);
-  if(_bits<=0){
-    /*We didn't code any blocks in this frame.
-      Add it to the previous frame's dup count.*/
-    _enc->rc.prev_drop_count+=1+_enc->dup_count;
-    /*If this was the first frame of this type, lower the expected scale, but
-       don't set it to zero outright.*/
-    if(_trial)_enc->rc.log_scale[_qti]>>=1;
-    _bits=0;
+void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc){
+  if(_enc->state.info.target_bitrate>0){
+    /*The buffer size is set equal to the keyframe interval, clamped to the
+       range [12,256] frames.
+      The 12 frame minimum gives us some chance to distribute bit estimation
+       errors.
+      The 256 frame maximum means we'll require 8-10 seconds of pre-buffering
+       at 24-30 fps, which is not unreasonable.*/
+    _rc->buf_delay=_enc->keyframe_frequency_force>256?
+     256:_enc->keyframe_frequency_force;
+    /*By default, enforce all buffer constraints.*/
+    _rc->drop_frames=1;
+    _rc->cap_overflow=1;
+    _rc->cap_underflow=0;
+    oc_enc_rc_reset(_enc);
   }
+  _rc->twopass=0;
+  _rc->twopass_buffer_bytes=0;
+  _rc->twopass_force_kf=0;
+  _rc->frame_metrics=NULL;
+}
+
+void oc_rc_state_clear(oc_rc_state *_rc){
+  _ogg_free(_rc->frame_metrics);
+}
+
+void oc_enc_rc_resize(oc_enc_ctx *_enc){
+  /*If encoding has not yet begun, reset the buffer state.*/
+  if(_enc->state.curframe_num<0)oc_enc_rc_reset(_enc);
   else{
-    ogg_int64_t log_scale;
-    ogg_int64_t log_bits;
-    ogg_int64_t log_qexp;
-    /*Compute the estimated scale factor for this frame type.*/
-    log_bits=oc_blog64(_bits);
-    log_qexp=_enc->log_qavg[_qti][_qi]-OC_Q57(2);
-    log_qexp=(log_qexp>>6)*(_enc->rc.exp[_qti]);
-    log_scale=OC_MINI(log_bits-_enc->rc.log_npixels+log_qexp,OC_Q57(16));
-    /*Use it to set that factor directly if this was a trial.*/
-    if(_trial)_enc->rc.log_scale[_qti]=log_scale;
-    else{
-      /*Otherwise update an exponential moving average for log_scale,
-         regardless of whether or not we dropped this frame.*/
-      _enc->rc.log_scale[_qti]=log_scale
-       +(_enc->rc.log_scale[_qti]-log_scale+128>>8)*OC_SCALE_SMOOTHING[_qti];
-      /*If this frame busts our budget, it must be dropped.*/
-      if(_droppable&&_enc->rc.fullness+buf_delta<_bits){
-        _enc->rc.prev_drop_count+=1+_enc->dup_count;
-        _bits=0;
-        dropped=1;
+    /*Otherwise, update the bounds on the buffer, but not the current
+       fullness.*/
+    _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
+     (ogg_int64_t)_enc->state.info.fps_denominator)/
+     _enc->state.info.fps_numerator;
+    /*Insane framerates or frame sizes mean insane bitrates.
+      Let's not get carried away.*/
+    if(_enc->rc.bits_per_frame>0x400000000000LL){
+      _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
+    }
+    else if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
+    _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
+    _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
+    _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
+     OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
+    oc_iir_filter_init(&_enc->rc.scalefilter[1],_enc->rc.buf_delay>>1,
+     oc_bexp_q24(_enc->rc.log_scale[1]));
+  }
+  /*If we're in pass-2 mode, make sure the frame metrics array is big enough
+     to hold frame statistics for the full buffer.*/
+  if(_enc->rc.twopass==2){
+    int cfm;
+    int buf_delay;
+    int reset_window;
+    reset_window=_enc->rc.frame_metrics==NULL;
+    cfm=_enc->rc.cframe_metrics;
+    buf_delay=_enc->rc.buf_delay;
+    if(cfm<buf_delay){
+      oc_frame_metrics *fm;
+      int               nfm;
+      int               fmh;
+      fm=_enc->rc.frame_metrics=(oc_frame_metrics *)_ogg_realloc(
+       _enc->rc.frame_metrics,buf_delay*sizeof(*_enc->rc.frame_metrics));
+      _enc->rc.cframe_metrics=buf_delay;
+      /*Re-organize the circular buffer.*/
+      fmh=_enc->rc.frame_metrics_head;
+      nfm=_enc->rc.nframe_metrics;
+      if(fmh+nfm>cfm){
+        int shift;
+        shift=OC_MINI(fmh+nfm-cfm,buf_delay-cfm);
+        memcpy(fm+cfm,fm,OC_MINI(fmh+nfm-cfm,buf_delay-cfm)*sizeof(*fm));
+        if(fmh+nfm>buf_delay)memmove(fm,fm+shift,fmh+nfm-buf_delay);
       }
-      else{
-        /*Update a simple exponential moving average to estimate the "real"
-           frame rate taking drops and duplicates into account.
-          This is only done if the frame is coded, as it needs the final count
-           of dropped frames.*/
-        _enc->rc.log_drop_scale=_enc->rc.log_drop_scale
-         +oc_blog64(_enc->rc.prev_drop_count+1)>>1;
-        _enc->rc.prev_drop_count=_enc->dup_count;
+    }
+    /*We were using whole-file buffering; now we're not.*/
+    if(reset_window){
+      _enc->rc.nframes[0]=_enc->rc.nframes[1]=_enc->rc.nframes[2]=0;
+      _enc->rc.scale_sum[0]=_enc->rc.scale_sum[1]=0;
+      _enc->rc.scale_window_end=_enc->rc.scale_window0=
+       _enc->state.curframe_num+_enc->prev_dup_count+1;
+      if(_enc->rc.twopass_buffer_bytes){
+        int qti;
+        /*We already read the metrics for the first frame in the window.*/
+        *(_enc->rc.frame_metrics)=*&_enc->rc.cur_metrics;
+        _enc->rc.nframe_metrics++;
+        qti=_enc->rc.cur_metrics.frame_type;
+        _enc->rc.nframes[qti]++;
+        _enc->rc.nframes[2]+=_enc->rc.cur_metrics.dup_count;
+        _enc->rc.scale_sum[qti]+=_enc->rc.cur_metrics.scale;
+        _enc->rc.scale_window_end+=_enc->rc.cur_metrics.dup_count+1;
+        if(_enc->rc.scale_window_end-_enc->rc.scale_window0<buf_delay){
+          /*We need more frame data.*/
+          _enc->rc.twopass_buffer_bytes=0;
+        }
       }
     }
+    /*Otherwise, we could shrink the size of the current window, if necessary,
+       but leaving it like it is lets us adapt to the new buffer size more
+       gracefully.*/
   }
-  if(!_trial){
-    /*And update the buffer fullness level.*/
-    _enc->rc.fullness+=buf_delta-_bits;
-    /*If we're too quick filling the buffer, that rate is lost forever.*/
-    if(_enc->rc.fullness>_enc->rc.max)_enc->rc.fullness=_enc->rc.max;
-  }
-  return dropped;
 }
 
 int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp){
   ogg_int64_t  rate_total;
-  ogg_uint32_t next_key_frame;
   int          nframes[2];
   int          buf_delay;
   ogg_int64_t  log_qtarget;
@@ -235,37 +381,166 @@
   /*Figure out how to re-distribute bits so that we hit our fullness target
      before the last keyframe in our current buffer window (after the current
      frame), or the end of the buffer window, whichever comes first.*/
-  next_key_frame=_qti?_enc->keyframe_frequency_force
-   -(_enc->state.curframe_num-_enc->state.keyframe_num):0;
-  nframes[0]=(_enc->rc.buf_delay-OC_MINI(next_key_frame,_enc->rc.buf_delay)
-   +_enc->keyframe_frequency_force-1)/_enc->keyframe_frequency_force;
-  if(nframes[0]+_qti>1){
-    nframes[0]--;
-    buf_delay=next_key_frame+nframes[0]*_enc->keyframe_frequency_force;
+  switch(_enc->rc.twopass){
+    default:{
+      ogg_uint32_t next_key_frame;
+      /*Single pass mode: assume only forced keyframes and attempt to estimate
+         the drop count for VFR content.*/
+      next_key_frame=_qti?_enc->keyframe_frequency_force
+       -(_enc->state.curframe_num-_enc->state.keyframe_num):0;
+      nframes[0]=(_enc->rc.buf_delay-OC_MINI(next_key_frame,_enc->rc.buf_delay)
+       +_enc->keyframe_frequency_force-1)/_enc->keyframe_frequency_force;
+      if(nframes[0]+_qti>1){
+        nframes[0]--;
+        buf_delay=next_key_frame+nframes[0]*_enc->keyframe_frequency_force;
+      }
+      else buf_delay=_enc->rc.buf_delay;
+      nframes[1]=buf_delay-nframes[0];
+      /*Downgrade the delta frame rate to correspond to the recent drop count
+         history.*/
+      if(_enc->rc.prev_drop_count>0||_enc->rc.log_drop_scale>OC_Q57(0)){
+        ogg_int64_t dup_scale;
+        dup_scale=oc_bexp64((_enc->rc.log_drop_scale
+         +oc_blog64(_enc->rc.prev_drop_count+1)>>1)+OC_Q57(8));
+        if(dup_scale<nframes[1]<<8){
+          int dup_scalei;
+          dup_scalei=(int)dup_scale;
+          if(dup_scalei>0)nframes[1]=((nframes[1]<<8)+dup_scalei-1)/dup_scalei;
+        }
+        else nframes[1]=!!nframes[1];
+      }
+    }break;
+    case 1:{
+      /*Pass 1 mode: use a fixed qi value.*/
+      return _enc->state.qis[0];
+    }break;
+    case 2:{
+      ogg_int64_t scale_sum[2];
+      int         qti;
+      int         buf_pad;
+      /*Pass 2 mode: we know exactly how many frames of each type there are
+         in the current buffer window, and have estimates for the scales.*/
+      nframes[0]=_enc->rc.nframes[0];
+      nframes[1]=_enc->rc.nframes[1];
+      scale_sum[0]=_enc->rc.scale_sum[0];
+      scale_sum[1]=_enc->rc.scale_sum[1];
+      /*The window size can be slightly larger than the buffer window for VFR
+         content; clamp it down, if appropriate (the excess will all be dup
+         frames).*/
+      buf_delay=OC_MINI(_enc->rc.scale_window_end-_enc->rc.scale_window0,
+       _enc->rc.buf_delay);
+      /*If we're approaching the end of the file, add some slack to keep us
+         from slamming into a rail.
+        Our rate accuracy goes down, but it keeps the result sensible.
+        We position the target where the first forced keyframe beyond the end
+         of the file would be (for consistency with 1-pass mode).
+        TODO: It may also be useful to track the measured scales with the IIR
+         filter and blend those into the pass-1 stats here at the end, to
+         reduce the noise from using just a few frames of pass-1 data.*/
+      buf_pad=OC_MINI(_enc->rc.buf_delay,_enc->state.keyframe_num
+       +_enc->keyframe_frequency_force-_enc->rc.scale_window0);
+      if(buf_delay<buf_pad)buf_pad-=buf_delay;
+      else{
+        /*Otherwise, search for the last keyframe in the buffer window and
+           target that.*/
+        buf_pad=0;
+        /*TODO: Currently we only do this when using a finite buffer; we could
+           save the position of the last keyframe in the summary data and do it
+           with a whole-file buffer as well, but it isn't likely to make a
+           difference.*/
+        if(_enc->rc.frame_metrics!=NULL){
+          int fmi;
+          int fm_tail;
+          fm_tail=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics;
+          if(fm_tail>=_enc->rc.cframe_metrics)fm_tail-=_enc->rc.cframe_metrics;
+          for(fmi=fm_tail;;){
+            oc_frame_metrics *m;
+            fmi--;
+            if(fmi<0)fmi+=_enc->rc.cframe_metrics;
+            /*Stop before we remove the first frame.*/
+            if(fmi==_enc->rc.frame_metrics_head)break;
+            m=_enc->rc.frame_metrics+fmi;
+            /*If we find a keyframe, remove it and everything past it.*/
+            if(m->frame_type==OC_INTRA_FRAME){
+              do{
+                qti=m->frame_type;
+                nframes[qti]--;
+                scale_sum[qti]-=m->scale;
+                buf_delay-=m->dup_count+1;
+                fmi++;
+                if(fmi>=_enc->rc.cframe_metrics)fmi=0;
+                m=_enc->rc.frame_metrics+fmi;
+              }
+              while(fmi!=fm_tail);
+              /*And stop scanning backwards.*/
+              break;
+            }
+          }
+        }
+      }
+      /*Compute corrected log_scale estimates for each frame type from the
+         pass-1 scales we measured in the current window.*/
+      for(qti=0;qti<2;qti++){
+        oc_log_linear_fit *fit;
+        ogg_int64_t        x;
+        x=nframes[qti]>0?
+         oc_blog64(scale_sum[qti])-oc_blog64(nframes[qti])-OC_Q57(24):
+         -_enc->rc.log_npixels;
+        fit=_enc->rc.corr+qti;
+        if(fit->n>0){
+          ogg_int64_t  var;
+          ogg_uint32_t n_2;
+          n_2=fit->n>>1;
+          var=fit->x2;
+          /*We expect the mean log_scale to match over the length of the
+             sequence, and thus the fit offset to be near zero.
+            Therefore we force the fit offset to zero, which gives less
+             quantizer fluctuation, and a (very) small increase in quality.
+            Uncomment this line and the two below to use a non-zero offset.*/
+          /*var-=(fit->x+2048>>12)*(((fit->x+2048>>12)+n_2)/fit->n);*/
+          if(var>fit->n){
+            ogg_int64_t cov;
+            ogg_int64_t beta;
+            ogg_int64_t alpha;
+            ogg_int64_t y;
+            cov=fit->xy;
+            /*cov-=(fit->y+2048>>12)*(((fit->x+2048>>12)+n_2)/fit->n);*/
+            /*beta is Q33.*/
+            beta=((cov+n_2)/fit->n<<33)/((var+n_2)/fit->n);
+            /*alpha is Q57.*/
+            alpha=0;
+            /*alpha=((fit->y+n_2)/fit->n<<33)-beta*((fit->x+n_2)/fit->n);*/
+            /*Predict the mean y from the mean x.
+              What we're really trying to compensate for is error in exp[], not
+               error in the scales, and hence we can apply the correction to
+               the mean scale instead of applying it to each pass-1 scale and
+               then taking the mean.*/
+            y=(x+((ogg_int64_t)1<<32)>>33)*beta+alpha;
+            /*If we have enough points for a good estimation, use the corrected
+               predictor value directly.*/
+            if(fit->n>=(128<<qti))x=y;
+            /*Otherwise interpolate between the two.*/
+            else x+=fit->n*(y-x>>7+qti);
+          }
+        }
+        _enc->rc.log_scale[qti]=x;
+      }
+      /*Add the padding values from above.
+        TODO: Technically this is wrong for VFR content; again, we could use
+         the IIR filter to estimate the real framerate at the end.*/
+      nframes[1]+=buf_pad;
+      buf_delay+=buf_pad;
+    }break;
   }
-  else buf_delay=_enc->rc.buf_delay;
-  nframes[1]=buf_delay-nframes[0];
+  /*rate_total is the total bits available over the next buf_delay frames.*/
   rate_total=_enc->rc.fullness-_enc->rc.target
    +buf_delay*_enc->rc.bits_per_frame;
-  /*Downgrade the delta frame rate to correspond to the recent drop count
-     history.*/
-  if(_enc->rc.prev_drop_count>0||_enc->rc.log_drop_scale>OC_Q57(0)){
-    ogg_int64_t dup_scale;
-    dup_scale=oc_bexp64((_enc->rc.log_drop_scale
-     +oc_blog64(_enc->rc.prev_drop_count+1)>>1)+OC_Q57(8));
-    if(dup_scale<nframes[1]<<8){
-      int dup_scalei;
-      dup_scalei=(int)dup_scale;
-      if(dup_scalei>0)nframes[1]=((nframes[1]<<8)+dup_scalei-1)/dup_scalei;
-    }
-    else nframes[1]=!!nframes[1];
-  }
   log_scale0=_enc->rc.log_scale[_qti]+_enc->rc.log_npixels;
   /*If there aren't enough bits to achieve our desired fullness level, use the
      minimum quality permitted.*/
   if(rate_total<=buf_delay)log_qtarget=OC_QUANT_MAX_LOG;
   else{
-    static const unsigned char KEY_RATIO[2]={32,17};
+    static const unsigned char KEY_RATIO[2]={32,20};
     ogg_int64_t   log_scale1;
     ogg_int64_t   prevr;
     ogg_int64_t   curr;
@@ -301,6 +576,33 @@
      _enc->rc.exp[_qti]<<6);
     log_qtarget=OC_MINI(log_qtarget,OC_QUANT_MAX_LOG);
   }
+  /*The above allocation looks only at the total rate we'll accumulate in the
+     next buf_delay frames.
+    However, we could overflow the buffer on the very next frame, so check for
+     that here, if we're not using a soft target.*/
+  if(_enc->rc.cap_overflow){
+    ogg_int64_t margin;
+    ogg_int64_t soft_limit;
+    ogg_int64_t log_soft_limit;
+    ogg_int64_t log_qexp;
+    int         exp0;
+    /*Allow 3% of the buffer for prediction error.
+      This should be plenty, and we don't mind if we go a bit over; we only
+       want to keep these bits from being completely wasted.*/
+    margin=_enc->rc.max+31>>5;
+    /*We want to use at least this many bits next frame.*/
+    soft_limit=_enc->rc.fullness+_enc->rc.bits_per_frame-(_enc->rc.max-margin);
+    log_soft_limit=oc_blog64(soft_limit);
+    /*If we're predicting we won't use that many...*/
+    exp0=_enc->rc.exp[_qti];
+    log_qexp=(log_qtarget-OC_Q57(2)>>6)*exp0;
+    if(log_scale0-log_qexp<log_soft_limit){
+      /*Scale the adjustment based on how far into the margin we are.*/
+      log_qexp+=(log_scale0-log_soft_limit-log_qexp>>32)*
+       ((OC_MINI(margin,soft_limit)<<32)/margin);
+      log_qtarget=((log_qexp+(exp0>>1))/exp0<<6)+OC_Q57(2);
+    }
+  }
   /*If this was not one of the initial frames, limit the change in quality.*/
   old_qi=_enc->state.qis[0];
   if(_clamp){
@@ -317,20 +619,23 @@
   /*The above allocation looks only at the total rate we'll accumulate in the
      next buf_delay frames.
     However, we could bust the budget on the very next frame, so check for that
-     here.*/
-  {
+     here, if we're not using a soft target.*/
+  if(!_enc->rc.cap_underflow||_enc->rc.drop_frames){
     ogg_int64_t log_hard_limit;
     ogg_int64_t log_qexp;
     int         exp0;
-    /*Allow 50% of the rate for a single frame for prediction error.
+    /*Compute the maximum number of bits we can use in the next frame.
+      Allow 50% of the rate for a single frame for prediction error.
       This may not be enough for keyframes or sudden changes in complexity.*/
     log_hard_limit=oc_blog64(_enc->rc.fullness+(_enc->rc.bits_per_frame>>1));
+    /*If we're predicting we'll use more than this...*/
     exp0=_enc->rc.exp[_qti];
-    log_qexp=log_qtarget-OC_Q57(2);
     log_qexp=(log_qtarget-OC_Q57(2)>>6)*exp0;
     if(log_scale0-log_qexp>log_hard_limit){
+      /*Force the target to hit our limit exactly.*/
       log_qexp=log_scale0-log_hard_limit;
       log_qtarget=((log_qexp+(exp0>>1))/exp0<<6)+OC_Q57(2);
+      /*If that target is unreasonable, oh well; we'll have to drop.*/
       log_qtarget=OC_MINI(log_qtarget,OC_QUANT_MAX_LOG);
     }
   }
@@ -340,3 +645,412 @@
   _enc->rc.log_qtarget=log_qtarget;
   return qi;
 }
+
+int oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial,int _droppable){
+  ogg_int64_t buf_delta;
+  ogg_int64_t log_scale;
+  int         dropped;
+  dropped=0;
+  if(!_enc->rc.drop_frames)_droppable=0;
+  buf_delta=_enc->rc.bits_per_frame*(1+_enc->dup_count);
+  if(_bits<=0){
+    /*We didn't code any blocks in this frame.*/
+    _bits=0;
+    log_scale=OC_Q57(-64);
+  }
+  else{
+    ogg_int64_t log_bits;
+    ogg_int64_t log_qexp;
+    /*Compute the estimated scale factor for this frame type.*/
+    log_bits=oc_blog64(_bits);
+    log_qexp=_enc->log_qavg[_qti][_qi]-OC_Q57(2);
+    log_qexp=(log_qexp>>6)*(_enc->rc.exp[_qti]);
+    log_scale=OC_MINI(log_bits-_enc->rc.log_npixels+log_qexp,OC_Q57(16));
+  }
+  switch(_enc->rc.twopass){
+    default:{
+      /*Single pass mode:*/
+      if(_bits>0){
+        /*Use the estimated scale factor directly if this was a
+           trial.*/
+        if(_trial)_enc->rc.log_scale[_qti]=log_scale;
+        else{
+          /*Otherwise update the low-pass scale filter for this frame type,
+             regardless of whether or not we dropped this frame.*/
+          _enc->rc.log_scale[_qti]=oc_blog64(oc_iir_filter_update(
+           _enc->rc.scalefilter+_qti,oc_bexp_q24(log_scale)))-OC_Q57(24);
+          /*If this frame busts our budget, it must be dropped.*/
+          if(_droppable&&_enc->rc.fullness+buf_delta<_bits){
+            _enc->rc.prev_drop_count+=1+_enc->dup_count;
+            _bits=0;
+            dropped=1;
+          }
+          else{
+            ogg_uint32_t drop_count;
+            /*Update a low-pass filter to estimate the "real" frame rate taking
+               drops and duplicates into account.
+              This is only done if the frame is coded, as it needs the final
+               count of dropped frames.*/
+            drop_count=_enc->rc.prev_drop_count+1;
+            if(drop_count>0x7F)drop_count=0x7FFFFFFF;
+            else drop_count<<=24;
+            _enc->rc.log_drop_scale=oc_blog64(oc_iir_filter_update(
+             &_enc->rc.vfrfilter,drop_count))-OC_Q57(24);
+            /*Initialize the drop count for this frame to the user-requested dup
+               count.
+              It will be increased if we drop more frames.*/
+            _enc->rc.prev_drop_count=_enc->dup_count;
+          }
+        }
+      }
+      /*Increase the drop count.*/
+      else _enc->rc.prev_drop_count+=1+_enc->dup_count;
+    }break;
+    case 1:{
+      /*Pass-1 mode: save the metrics for this frame.*/
+      _enc->rc.cur_metrics.scale=oc_bexp_q24(log_scale);
+      _enc->rc.cur_metrics.dup_count=_enc->dup_count;
+      _enc->rc.cur_metrics.frame_type=_enc->state.frame_type;
+      _enc->rc.twopass_buffer_bytes=0;
+    }break;
+    case 2:{
+      ogg_int64_t        x;
+      ogg_int64_t        y;
+      oc_log_linear_fit *fit;
+      /*Pass 2 mode:*/
+      /*If this frame busts our budget, it must be dropped.*/
+      if(_droppable&&_enc->rc.fullness+buf_delta<_bits){
+        _bits=0;
+        dropped=1;
+      }
+      /*Accumulate statistics for estimation bias correction.
+        Everything is done in Q24 format.*/
+      x=oc_blog64(_enc->rc.cur_metrics.scale)-OC_Q57(24)>>33;
+      y=log_scale>>33;
+      fit=_enc->rc.corr+_qti;
+      fit->n++;
+      fit->x+=x;
+      fit->y+=y;
+      fit->x2+=(x+2048>>12)*(x+2048>>12);
+      fit->xy+=(x+2048>>12)*(y+2048>>12);
+      if(!_trial){
+        ogg_int64_t next_frame_num;
+        int         qti;
+        /*Move the current metrics back one frame.*/
+        *&_enc->rc.prev_metrics=*&_enc->rc.cur_metrics;
+        next_frame_num=_enc->state.curframe_num+_enc->dup_count+1;
+        /*Back out the last frame's statistics from the sliding window.*/
+        qti=_enc->rc.prev_metrics.frame_type;
+        _enc->rc.frames_left[qti]--;
+        _enc->rc.frames_left[2]-=_enc->rc.prev_metrics.dup_count;
+        _enc->rc.nframes[qti]--;
+        _enc->rc.nframes[2]-=_enc->rc.prev_metrics.dup_count;
+        _enc->rc.scale_sum[qti]-=_enc->rc.prev_metrics.scale;
+        _enc->rc.scale_window0=(int)next_frame_num;
+        /*Free the corresponding entry in the circular buffer.*/
+        if(_enc->rc.frame_metrics!=NULL){
+          _enc->rc.nframe_metrics--;
+          _enc->rc.frame_metrics_head++;
+          if(_enc->rc.frame_metrics_head>=_enc->rc.cframe_metrics){
+            _enc->rc.frame_metrics_head=0;
+          }
+        }
+        /*Mark us ready for the next 2-pass packet.*/
+        _enc->rc.twopass_buffer_bytes=0;
+        /*Update state, so the user doesn't have to keep calling 2pass_in after
+           they've fed in all the data when we're using a finite buffer.*/
+        _enc->prev_dup_count=_enc->dup_count;
+        oc_enc_rc_2pass_in(_enc,NULL,0);
+      }
+    }break;
+  }
+  if(!_trial){
+    /*And update the buffer fullness level.*/
+    _enc->rc.fullness+=buf_delta-_bits;
+    /*If we're too quick filling the buffer and overflow is capped,
+      that rate is lost forever.*/
+    if(_enc->rc.cap_overflow&&_enc->rc.fullness>_enc->rc.max){
+      _enc->rc.fullness=_enc->rc.max;
+    }
+    /*If we're too quick draining the buffer and underflow is capped,
+      don't try to make up that rate later.*/
+    if(_enc->rc.cap_underflow&&_enc->rc.fullness<0){
+      _enc->rc.fullness=0;
+    }
+  }
+  return dropped;
+}
+
+#define OC_RC_2PASS_HDR_SZ    (38)
+#define OC_RC_2PASS_PACKET_SZ (8)
+
+static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){
+  while(_bytes-->0){
+    _rc->twopass_buffer[_rc->twopass_buffer_bytes++]=(unsigned char)(_val&0xFF);
+    _val>>=8;
+  }
+}
+
+int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf){
+  if(_enc->rc.twopass_buffer_bytes==0){
+    if(_enc->rc.twopass==0){
+      int qi;
+      /*Pick first-pass qi for scale calculations.*/
+      qi=oc_enc_select_qi(_enc,0,0);
+      _enc->state.nqis=1;
+      _enc->state.qis[0]=qi;
+      _enc->rc.twopass=1;
+      _enc->rc.frames_total[0]=_enc->rc.frames_total[1]=
+       _enc->rc.frames_total[2]=0;
+      _enc->rc.scale_sum[0]=_enc->rc.scale_sum[1]=0;
+      /*Fill in dummy summary values.*/
+      oc_rc_buffer_val(&_enc->rc,0x5032544F,4);
+      oc_rc_buffer_val(&_enc->rc,0,4);
+      oc_rc_buffer_val(&_enc->rc,0,OC_RC_2PASS_HDR_SZ-8);
+    }
+    else{
+      int qti;
+      qti=_enc->rc.cur_metrics.frame_type;
+      _enc->rc.scale_sum[qti]+=_enc->rc.cur_metrics.scale;
+      _enc->rc.frames_total[qti]++;
+      _enc->rc.frames_total[2]+=_enc->rc.cur_metrics.dup_count;
+      oc_rc_buffer_val(&_enc->rc,
+       _enc->rc.cur_metrics.dup_count|_enc->rc.cur_metrics.frame_type<<31,4);
+      oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.scale,4);
+    }
+  }
+  else if(_enc->packet_state==OC_PACKET_DONE&&
+   _enc->rc.twopass_buffer_bytes!=OC_RC_2PASS_HDR_SZ){
+    _enc->rc.twopass_buffer_bytes=0;
+    oc_rc_buffer_val(&_enc->rc,0x5032544F,4);
+    oc_rc_buffer_val(&_enc->rc,0,4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.frames_total[0],4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.frames_total[1],4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.frames_total[2],4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.exp[0],1);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.exp[1],1);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.scale_sum[0],8);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.scale_sum[1],8);
+  }
+  else{
+    /*The data for this frame has already been retrieved.*/
+    *_buf=NULL;
+    return 0;
+  }
+  *_buf=_enc->rc.twopass_buffer;
+  return _enc->rc.twopass_buffer_bytes;
+}
+
+static size_t oc_rc_buffer_fill(oc_rc_state *_rc,
+ unsigned char *_buf,size_t _bytes,size_t _consumed,size_t _goal){
+  while(_rc->twopass_buffer_fill<_goal&&_consumed<_bytes){
+    _rc->twopass_buffer[_rc->twopass_buffer_fill++]=_buf[_consumed++];
+  }
+  return _consumed;
+}
+
+static ogg_int64_t oc_rc_unbuffer_val(oc_rc_state *_rc,int _bytes){
+  ogg_int64_t ret;
+  int         shift;
+  ret=0;
+  shift=0;
+  while(_bytes-->0){
+    ret|=((ogg_int64_t)_rc->twopass_buffer[_rc->twopass_buffer_bytes++])<<shift;
+    shift+=8;
+  }
+  return ret;
+}
+
+int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
+  size_t consumed;
+  consumed=0;
+  /*Enable pass 2 mode if this is the first call.*/
+  if(_enc->rc.twopass==0){
+    _enc->rc.twopass=2;
+    _enc->rc.twopass_buffer_fill=0;
+    _enc->rc.frames_total[0]=0;
+    _enc->rc.nframe_metrics=0;
+    _enc->rc.cframe_metrics=0;
+    _enc->rc.frame_metrics_head=0;
+    _enc->rc.scale_window0=0;
+    _enc->rc.scale_window_end=0;
+  }
+  /*If we haven't got a valid summary header yet, try to parse one.*/
+  if(_enc->rc.frames_total[0]==0){
+    if(!_buf){
+      int frames_needed;
+      /*If we're using a whole-file buffer, we just need the first frame.
+        Otherwise, we may need as many as one per buffer slot.*/
+      frames_needed=_enc->rc.frame_metrics==NULL?1:_enc->rc.buf_delay;
+      return OC_RC_2PASS_HDR_SZ+frames_needed*OC_RC_2PASS_PACKET_SZ
+       -_enc->rc.twopass_buffer_fill;
+    }
+    consumed=oc_rc_buffer_fill(&_enc->rc,
+     _buf,_bytes,consumed,OC_RC_2PASS_HDR_SZ);
+    if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_HDR_SZ){
+      ogg_int64_t scale_sum[2];
+      int         exp[2];
+      int         buf_delay;
+      /*Read the summary header data.*/
+      /*Check the magic value and version number.*/
+      if(oc_rc_unbuffer_val(&_enc->rc,4)!=0x5032544F||
+       oc_rc_unbuffer_val(&_enc->rc,4)!=0){
+        _enc->rc.twopass_buffer_bytes=0;
+        return TH_ENOTFORMAT;
+      }
+      _enc->rc.frames_total[0]=(ogg_uint32_t)oc_rc_unbuffer_val(&_enc->rc,4);
+      _enc->rc.frames_total[1]=(ogg_uint32_t)oc_rc_unbuffer_val(&_enc->rc,4);
+      _enc->rc.frames_total[2]=(ogg_uint32_t)oc_rc_unbuffer_val(&_enc->rc,4);
+      exp[0]=(int)oc_rc_unbuffer_val(&_enc->rc,1);
+      exp[1]=(int)oc_rc_unbuffer_val(&_enc->rc,1);
+      scale_sum[0]=oc_rc_unbuffer_val(&_enc->rc,8);
+      scale_sum[1]=oc_rc_unbuffer_val(&_enc->rc,8);
+      /*Make sure the file claims to have at least one frame.
+        Otherwise we probably got the placeholder data from an aborted pass 1.
+        Also make sure the total frame count doesn't overflow an integer.*/
+      buf_delay=_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
+       +_enc->rc.frames_total[2];
+      if(_enc->rc.frames_total[0]==0||buf_delay<0||
+       (ogg_uint32_t)buf_delay<_enc->rc.frames_total[0]||
+       (ogg_uint32_t)buf_delay<_enc->rc.frames_total[1]){
+        _enc->rc.frames_total[0]=0;
+        _enc->rc.twopass_buffer_bytes=0;
+        return TH_EBADHEADER;
+      }
+      /*Got a valid header; set up pass 2.*/
+      _enc->rc.frames_left[0]=_enc->rc.frames_total[0];
+      _enc->rc.frames_left[1]=_enc->rc.frames_total[1];
+      _enc->rc.frames_left[2]=_enc->rc.frames_total[2];
+      /*If the user hasn't specified a buffer size, use the whole file.*/
+      if(_enc->rc.frame_metrics==NULL){
+        _enc->rc.buf_delay=buf_delay;
+        _enc->rc.nframes[0]=_enc->rc.frames_total[0];
+        _enc->rc.nframes[1]=_enc->rc.frames_total[1];
+        _enc->rc.nframes[2]=_enc->rc.frames_total[2];
+        _enc->rc.scale_sum[0]=scale_sum[0];
+        _enc->rc.scale_sum[1]=scale_sum[1];
+        _enc->rc.scale_window_end=buf_delay;
+        oc_enc_rc_reset(_enc);
+      }
+      /*TODO: If exp[] does not match the current values, we should adjust the
+         initial correction fit to compensate.*/
+      memset(_enc->rc.corr,0,sizeof(_enc->rc.corr));
+      /*Clear the header data from the buffer to make room for packet data.*/
+      _enc->rc.twopass_buffer_fill=0;
+      _enc->rc.twopass_buffer_bytes=0;
+    }
+  }
+  if(_enc->rc.frames_total[0]!=0){
+    ogg_int64_t curframe_num;
+    int         nframes_total;
+    curframe_num=_enc->state.curframe_num;
+    if(curframe_num>=0){
+      /*We just encoded a frame; make sure things matched.*/
+      if(_enc->rc.prev_metrics.dup_count!=_enc->prev_dup_count){
+        _enc->rc.twopass_buffer_bytes=0;
+        return TH_EINVAL;
+      }
+    }
+    curframe_num+=_enc->prev_dup_count+1;
+    nframes_total=_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
+     +_enc->rc.frames_total[2];
+    if(curframe_num>=nframes_total){
+      /*We don't want any more data after the last frame, and we don't want to
+         allow any more frames to be encoded.*/
+      _enc->rc.twopass_buffer_bytes=0;
+    }
+    else if(_enc->rc.twopass_buffer_bytes==0){
+      if(_enc->rc.frame_metrics==NULL){
+        /*We're using a whole-file buffer:*/
+        if(!_buf)return OC_RC_2PASS_PACKET_SZ-_enc->rc.twopass_buffer_fill;
+        consumed=oc_rc_buffer_fill(&_enc->rc,
+         _buf,_bytes,consumed,OC_RC_2PASS_PACKET_SZ);
+        if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
+          ogg_uint32_t dup_count;
+          ogg_int32_t  scale;
+          int          qti;
+          int          arg;
+          /*Read the metrics for the next frame.*/
+          dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
+          scale=oc_rc_unbuffer_val(&_enc->rc,4);
+          _enc->rc.cur_metrics.scale=scale;
+          qti=(dup_count&0x80000000)>>31;
+          _enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF;
+          _enc->rc.cur_metrics.frame_type=qti;
+          _enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME;
+          /*"Helpfully" set the dup count back to what it was in pass 1.*/
+          arg=_enc->rc.cur_metrics.dup_count;
+          th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
+          /*Clear the buffer for the next frame.*/
+          _enc->rc.twopass_buffer_fill=0;
+        }
+      }
+      else{
+        int frames_needed;
+        /*We're using a finite buffer:*/
+        frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
+         -(_enc->rc.scale_window_end-_enc->rc.scale_window0),
+         _enc->rc.frames_left[0]+_enc->rc.frames_left[1]
+         -_enc->rc.nframes[0]-_enc->rc.nframes[1]);
+        while(frames_needed>0){
+          if(!_buf){
+            return OC_RC_2PASS_PACKET_SZ*frames_needed
+           -_enc->rc.twopass_buffer_fill;
+          }
+          consumed=oc_rc_buffer_fill(&_enc->rc,
+           _buf,_bytes,consumed,OC_RC_2PASS_PACKET_SZ);
+          if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
+            oc_frame_metrics *m;
+            int               fmi;
+            ogg_uint32_t      dup_count;
+            ogg_int32_t       scale;
+            int               qti;
+            /*Read the metrics for the next frame.*/
+            dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
+            scale=oc_rc_unbuffer_val(&_enc->rc,4);
+            /*Add them to the circular buffer.*/
+            fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++;
+            if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics;
+            m=_enc->rc.frame_metrics+fmi;
+            m->scale=scale;
+            qti=(dup_count&0x80000000)>>31;
+            m->dup_count=dup_count&0x7FFFFFFF;
+            m->frame_type=qti;
+            /*And accumulate the statistics over the window.*/
+            _enc->rc.nframes[qti]++;
+            _enc->rc.nframes[2]+=m->dup_count;
+            _enc->rc.scale_sum[qti]+=m->scale;
+            _enc->rc.scale_window_end+=m->dup_count+1;
+            /*Compute an upper bound on the number of remaining packets needed
+               for the current window.*/
+            frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
+             -(_enc->rc.scale_window_end-_enc->rc.scale_window0),
+             _enc->rc.frames_left[0]+_enc->rc.frames_left[1]
+             -_enc->rc.nframes[0]-_enc->rc.nframes[1]);
+            /*Clear the buffer for the next frame.*/
+            _enc->rc.twopass_buffer_fill=0;
+            _enc->rc.twopass_buffer_bytes=0;
+          }
+          /*Go back for more data.*/
+          else break;
+        }
+        /*If we've got all the frames we need, fill in the current metrics.
+          We're ready to go.*/
+        if(frames_needed<=0){
+          int arg;
+          *&_enc->rc.cur_metrics=
+           *(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head);
+          _enc->rc.twopass_force_kf=
+           _enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME;
+          /*"Helpfully" set the dup count back to what it was in pass 1.*/
+          arg=_enc->rc.cur_metrics.dup_count;
+          th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
+          /*Mark us ready for the next frame.*/
+          _enc->rc.twopass_buffer_bytes=1;
+        }
+      }
+    }
+  }
+  return (int)consumed;
+}

Modified: branches/theora-gumboot/lib/enc/tokenize.c
===================================================================
--- branches/theora-gumboot/lib/enc/tokenize.c	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/enc/tokenize.c	2009-07-29 14:50:10 UTC (rev 16361)
@@ -208,6 +208,9 @@
   int           qc;
 };
 
+/*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
+   dequantizes and de-zig-zags the result.
+  The DC coefficient is not preserved; it should be restored by the caller.*/
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _acmin){
@@ -232,7 +235,6 @@
   int                  qc;
   huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
   eob_run=_enc->eob_run[_pli];
-  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   memset(tokens[0],0,sizeof(tokens[0]));
   best_flags=nzflags=0;
   zflags=1;
@@ -628,8 +630,10 @@
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
-  for(zzi=1;zzi<64;zzi++)
-    _qdct[zzi]=0;
+  /*We blow away the first entry here so that things vectorize better.
+    The DC coefficient is not actually stored in the array yet.*/
+  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
+  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   zzi=1;
   ti=best_flags>>1&1;
   bits=tokens[zzi][ti].bits;
@@ -657,6 +661,8 @@
     next=tokens[zzi][ti].next;
     qc=tokens[zzi][ti].qc;
     zzj=(next>>1)-1&63;
+    /*TODO: It may be worth saving the dequantized coefficient in the trellis
+       above; we had to compute it to measure the error anyway.*/
     _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
     zzi=next>>1;
     ti=next&1;
@@ -683,13 +689,88 @@
   nhfrags=fplane->nhfrags;
   fragi=fplane->froffset+_fragy0*nhfrags;
   for(fragy=_fragy0;fragy<_frag_yend;fragy++){
-    for(fragx=0;fragx<nhfrags;fragx++,fragi++){
-      if(frags[fragi].coded){
-        frag_dc[fragi]=frags[fragi].dc
-         -oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
-        pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc;
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
+          pred_last[ref]=frags[fragi].dc;
+        }
       }
     }
+    else{
+      const oc_fragment *u_frags;
+      int                l_ref;
+      int                ul_ref;
+      int                u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
+        if(frags[fragi].coded){
+          int pred;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0;
+              int p1;
+              int p2;
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
+          pred_last[ref]=frags[fragi].dc;
+          l_ref=ref;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
   }
 }
 

Modified: branches/theora-gumboot/lib/internal.h
===================================================================
--- branches/theora-gumboot/lib/internal.h	2009-07-29 13:44:25 UTC (rev 16360)
+++ branches/theora-gumboot/lib/internal.h	2009-07-29 14:50:10 UTC (rev 16361)
@@ -135,19 +135,11 @@
 /*The number of (coded) modes.*/
 #define OC_NMODES              (8)
 
-/*Macro block is not coded.*/
-#define OC_MODE_NOT_CODED      (8)
+/*Determines the reference frame used for a given MB mode.*/
+#define OC_FRAME_FOR_MODE(_x) \
+ OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
+  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
 
-/*Predictor bit flags.*/
-/*Left.*/
-#define OC_PL  (1)
-/*Upper-left.*/
-#define OC_PUL (2)
-/*Up.*/
-#define OC_PU  (4)
-/*Upper-right.*/
-#define OC_PUR (8)
-
 /*Constants for the packet state machine common between encoder and decoder.*/
 
 /*Next packet to emit/read: Codec info header.*/
@@ -409,8 +401,6 @@
 /*A map from the coefficient number in a block to its index in the zig zag
    scan.*/
 extern const unsigned char OC_IZIG_ZAG[64];
-/*The predictor frame to use for each macro block mode.*/
-extern const unsigned char OC_FRAME_FOR_MODE[OC_NMODES];
 /*A map from physical macro block ordering to bitstream macro block
    ordering within a super block.*/
 extern const unsigned char OC_MB_MAP[2][2];
@@ -437,9 +427,6 @@
 
 ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits);
 
-int oc_frag_pred_dc(const oc_fragment *_frag,
- const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]);
-
 int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
 void oc_state_clear(oc_theora_state *_state);
 void oc_state_vtable_init_c(oc_theora_state *_state);