[xiph-commits] r15046 - in trunk/ffmpeg2theora: . src

j at svn.xiph.org j at svn.xiph.org
Tue Jun 17 04:08:24 PDT 2008


Author: j
Date: 2008-06-17 04:08:24 -0700 (Tue, 17 Jun 2008)
New Revision: 15046

Modified:
   trunk/ffmpeg2theora/
   trunk/ffmpeg2theora/ffmpeg2theora.1
   trunk/ffmpeg2theora/src/ffmpeg2theora.c
   trunk/ffmpeg2theora/src/ffmpeg2theora.h
   trunk/ffmpeg2theora/src/subtitles.c
   trunk/ffmpeg2theora/src/subtitles.h
   trunk/ffmpeg2theora/subtitles.txt
Log:
ignoring non utf8 sequence in what is claimed to be utf8. (thanks ogg.k.ogg.k)


Property changes on: trunk/ffmpeg2theora
___________________________________________________________________
Name: bzr:revision-info
   - timestamp: 2008-06-17 13:03:55.016999960 +0200
committer: j
properties: 
	branch-nick: ffmpeg2theora

   + timestamp: 2008-06-17 13:05:18.197000027 +0200
committer: j
properties: 
	branch-nick: ffmpeg2theora

Name: bzr:revision-id:v3-single1-dHJ1bmsvZmZtcGVnMnRoZW9yYQ..
   - 191 j-20080517230830-he5x8v2m8yrfiw35
192 j-20080518224037-pkmoctzf4qce7tog
193 j-20080518224409-6hbfp3k2ssn6egqa
194 j-20080520111939-dhi52qwbqe7a47cu
195 j-20080523092252-gj9k9db0s67vl7dw
196 j-20080523092420-l0850yrq1qkgz9t0
197 j-20080523093057-l5g0ezzy5geu0pey
198 j-20080523094343-kcno1dm2e1lr38q4
199 j-20080523163006-kjl6ewea5sxawmq2
200 j-20080523165904-l2vm52qae0hlqkhp
201 j-20080523175432-2ed953iktnl8c7cr
202 j-20080525100939-7oja8pk08v9fquiw
203 j-20080526111321-nhzaqh6ivzn0vs7b
204 j-20080527100851-2v5eyxxrq1riqi50
205 j-20080527101341-9ynbgth2b15jw792
206 j-20080527205556-19tffvfrxgt3khld
207 j-20080527205840-zeestdde3v1zks9k
208 j-20080527210129-e73y56uwmzbcid00
209 j-20080527211813-5ll680ed1q4byp16
210 j-20080528102006-aeippim0tn70mz3f
211 j-20080528104907-40kiidjojvta8j61
212 j-20080528111329-vkqbt7xkat2o9h4z
213 j-20080529102940-q9xdwm5v9espzomv
214 j-20080529111405-nmh99aon1kmh22qm
215 j-20080530094948-ncq064s4uggd9z95
216 j-20080530095056-hko2vjfwipikwjyu
217 j-20080530171822-bab8sy8lpotf8081
218 j-20080603170442-v0pxspvfcucvsaex
219 j-20080617110355-xwbeg1xidmv8fubp

   + 191 j-20080517230830-he5x8v2m8yrfiw35
192 j-20080518224037-pkmoctzf4qce7tog
193 j-20080518224409-6hbfp3k2ssn6egqa
194 j-20080520111939-dhi52qwbqe7a47cu
195 j-20080523092252-gj9k9db0s67vl7dw
196 j-20080523092420-l0850yrq1qkgz9t0
197 j-20080523093057-l5g0ezzy5geu0pey
198 j-20080523094343-kcno1dm2e1lr38q4
199 j-20080523163006-kjl6ewea5sxawmq2
200 j-20080523165904-l2vm52qae0hlqkhp
201 j-20080523175432-2ed953iktnl8c7cr
202 j-20080525100939-7oja8pk08v9fquiw
203 j-20080526111321-nhzaqh6ivzn0vs7b
204 j-20080527100851-2v5eyxxrq1riqi50
205 j-20080527101341-9ynbgth2b15jw792
206 j-20080527205556-19tffvfrxgt3khld
207 j-20080527205840-zeestdde3v1zks9k
208 j-20080527210129-e73y56uwmzbcid00
209 j-20080527211813-5ll680ed1q4byp16
210 j-20080528102006-aeippim0tn70mz3f
211 j-20080528104907-40kiidjojvta8j61
212 j-20080528111329-vkqbt7xkat2o9h4z
213 j-20080529102940-q9xdwm5v9espzomv
214 j-20080529111405-nmh99aon1kmh22qm
215 j-20080530094948-ncq064s4uggd9z95
216 j-20080530095056-hko2vjfwipikwjyu
217 j-20080530171822-bab8sy8lpotf8081
218 j-20080603170442-v0pxspvfcucvsaex
219 j-20080617110355-xwbeg1xidmv8fubp
220 j-20080617110518-khqlhaan52kz3lii


Modified: trunk/ffmpeg2theora/ffmpeg2theora.1
===================================================================
--- trunk/ffmpeg2theora/ffmpeg2theora.1	2008-06-17 11:05:54 UTC (rev 15045)
+++ trunk/ffmpeg2theora/ffmpeg2theora.1	2008-06-17 11:08:24 UTC (rev 15046)
@@ -166,6 +166,11 @@
 this available to the user for selection. The default category is
 "subtitles". Suggested other categories may include "transcript",
 "commentary", "lyrics", etc.
+.TP
+.B \-\-subtitles-ignore-non-utf8
+When reading a utf-8 subtitles text file, any invalid utf-8 sequence
+will be ignored. This may be useful if there are stray invalid sequences
+in an otherwise valid utf-8 file.
 .SS Metadata options:
 .TP
 .B \-\-artist

Modified: trunk/ffmpeg2theora/src/ffmpeg2theora.c
===================================================================
--- trunk/ffmpeg2theora/src/ffmpeg2theora.c	2008-06-17 11:05:54 UTC (rev 15045)
+++ trunk/ffmpeg2theora/src/ffmpeg2theora.c	2008-06-17 11:08:24 UTC (rev 15046)
@@ -62,6 +62,7 @@
   SUBTITLES_ENCODING_FLAG,
   SUBTITLES_LANGUAGE_FLAG,
   SUBTITLES_CATEGORY_FLAG,
+  SUBTITLES_IGNORE_NON_UTF8_FLAG,
   VHOOK_FLAG,
   FRONTEND_FLAG,
   SPEEDLEVEL_FLAG,
@@ -174,6 +175,7 @@
 
         this->n_kate_streams=0;
         this->kate_streams=NULL;
+        this->ignore_non_utf8 = 0;
 
         this->pix_fmt = PIX_FMT_YUV420P;
 
@@ -1182,6 +1184,7 @@
         "             supported are " SUPPORTED_ENCODINGS "\n"
         "      --subtitles-language language    set subtitles language (de, en_GB, etc)\n"
         "      --subtitles-category category    set subtitles category (default \"subtitles\")\n"
+        "      --subtitles-ignore-non-utf8      ignores any non utf-8 sequence in utf-8 text\n"
         "\n"
 #endif
         "Metadata options:\n"
@@ -1278,6 +1281,7 @@
       {"audiostream",required_argument,&flag,AUDIOSTREAM_FLAG},
       {"subtitles",required_argument,&flag,SUBTITLES_FLAG},
       {"subtitles-encoding",required_argument,&flag,SUBTITLES_ENCODING_FLAG},
+      {"subtitles-ignore-non-utf8",0,&flag,SUBTITLES_IGNORE_NON_UTF8_FLAG},
       {"subtitles-language",required_argument,&flag,SUBTITLES_LANGUAGE_FLAG},
       {"subtitles-category",required_argument,&flag,SUBTITLES_CATEGORY_FLAG},
       {"starttime",required_argument,NULL,'s'},
@@ -1407,6 +1411,10 @@
                             else report_unknown_subtitle_encoding(optarg);
                             flag = -1;
                             break;
+                        case SUBTITLES_IGNORE_NON_UTF8_FLAG:
+                            convert->ignore_non_utf8 = 1;
+                            flag = -1;
+                            break;
                         case SUBTITLES_LANGUAGE_FLAG:
                             if (strlen(optarg)>15) {
                               fprintf(stderr, "WARNING - language is limited to 15 characters, and will be truncated\n");
@@ -1424,6 +1432,7 @@
 #else
                         case SUBTITLES_FLAG:
                         case SUBTITLES_ENCODING_FLAG:
+                        case SUBTITLES_IGNORE_NON_UTF8_FLAG:
                         case SUBTITLES_LANGUAGE_FLAG:
                         case SUBTITLES_CATEGORY_FLAG:
                             fprintf(stderr, "WARNING - Kate support not compiled in, subtitles will not be output\n"
@@ -1683,7 +1692,7 @@
 
     for (n=0; n<convert->n_kate_streams; ++n) {
         ff2theora_kate_stream *ks=convert->kate_streams+n;
-        if (load_subtitles(ks)>=0) {
+        if (load_subtitles(ks,convert->ignore_non_utf8)>=0) {
           printf("Muxing Kate stream %d from %s as %s %s\n",
               n,ks->filename,
               ks->subtitles_language[0]?ks->subtitles_language:"<unknown language>",

Modified: trunk/ffmpeg2theora/src/ffmpeg2theora.h
===================================================================
--- trunk/ffmpeg2theora/src/ffmpeg2theora.h	2008-06-17 11:05:54 UTC (rev 15045)
+++ trunk/ffmpeg2theora/src/ffmpeg2theora.h	2008-06-17 11:08:24 UTC (rev 15046)
@@ -85,7 +85,8 @@
 
     size_t n_kate_streams;
     ff2theora_kate_stream *kate_streams;
-    
+
+    int ignore_non_utf8;
     // ffmpeg2theora --nosound -f dv -H 32000 -S 0 -v 8 -x 384 -y 288 -G 1.5 input.dv
     double video_gamma;
     double video_bright;

Modified: trunk/ffmpeg2theora/src/subtitles.c
===================================================================
--- trunk/ffmpeg2theora/src/subtitles.c	2008-06-17 11:05:54 UTC (rev 15045)
+++ trunk/ffmpeg2theora/src/subtitles.c	2008-06-17 11:08:24 UTC (rev 15046)
@@ -111,34 +111,81 @@
   fprintf(stderr, "  " SUPPORTED_ENCODINGS "\n");
 }
 
-char *fgets2(char *s,size_t sz,FILE *f)
+#ifdef HAVE_KATE
+
+static char *fgets2(char *s,size_t sz,FILE *f)
 {
     char *ret = fgets(s, sz, f);
     /* fixup DOS newline character */
     char *ptr=strchr(s, '\r');
-    if (ptr) *ptr='\n';
+    if (ptr) {
+      *ptr='\n';
+      *(ptr+1)=0;
+    }
     return ret;
 }
 
-double hmsms2s(int h,int m,int s,int ms)
+static double hmsms2s(int h,int m,int s,int ms)
 {
     return h*3600+m*60+s+ms/1000.0;
 }
 
 /* very simple implementation when no iconv */
-void convert_subtitle_to_utf8(F2T_ENCODING encoding,unsigned char *text)
+static void convert_subtitle_to_utf8(F2T_ENCODING encoding,unsigned char *text,int ignore_non_utf8)
 {
   size_t nbytes;
-  unsigned char *ptr,*newtext;
+  char *ptr,*newtext;
+  int errors=0;
 
   if (!text || !*text) return;
 
   switch (encoding) {
     case ENC_UNSET:
       /* we don't know what encoding this is, assume utf-8 and we'll yell if it ain't */
-      break;
+      /* fall through */
     case ENC_UTF8:
       /* nothing to do, already in utf-8 */
+      if (ignore_non_utf8) {
+        /* actually, give the user the option of just ignoring non UTF8 characters */
+        char *wptr;
+        size_t wlen0;
+
+        nbytes = strlen(text)+1;
+        newtext=(unsigned char*)malloc(nbytes);
+        if (!newtext) {
+          fprintf(stderr, "WARNING - Memory allocation failed - cannot convert text\n");
+          return;
+        }
+        ptr = text;
+        wptr = newtext;
+        wlen0 = nbytes;
+        while (nbytes>0) {
+          int ret=kate_text_get_character(kate_utf8, (const char ** const)&ptr, &nbytes);
+          if (ret>=0) {
+            /* valid character */
+            ret=kate_text_set_character(kate_utf8, ret, &wptr, &wlen0);
+            if (ret<0) {
+              fprintf(stderr, "WARNING - failed to filter utf8 text: %s\n", text);
+              free(newtext);
+              return;
+            }
+            if (ret==0) break;
+          }
+          else {
+            /* skip offending byte - we can't skip the terminating zero as we do byte by byte */
+            ++errors;
+            ++ptr;
+            --nbytes;
+          }
+        }
+
+        if (errors) {
+          fprintf(stderr, "WARNING - Found non utf8 character(s) in string %s, scrubbed out\n", text);
+        }
+
+        strcpy(text,newtext);
+        free(newtext);
+      }
       break;
     case ENC_ISO_8859_1:
       /* simple, characters above 0x7f are broken in two,
@@ -150,7 +197,7 @@
       }
       newtext=(unsigned char*)malloc(1+nbytes);
       if (!newtext) {
-        fprintf(stderr, "Memory allocation failed - cannot convert text\n");
+        fprintf(stderr, "WARNING - Memory allocation failed - cannot convert text\n");
         return;
       }
       nbytes=0;
@@ -173,8 +220,16 @@
   }
 }
 
-int load_subtitles(ff2theora_kate_stream *this)
+static void remove_last_newline(char *text)
 {
+  char *ptr = text+strlen(text)-1;
+  if (*ptr=='\n') *ptr=0;
+}
+
+#endif
+
+int load_subtitles(ff2theora_kate_stream *this, int ignore_non_utf8)
+{
 #ifdef HAVE_KATE
     enum { need_id, need_timing, need_text };
     int need = need_id;
@@ -236,7 +291,11 @@
           break;
         case need_text:
           if (*str=='\n') {
-            convert_subtitle_to_utf8(this->subtitles_encoding,(unsigned char*)text);
+            /* we have all the lines for that subtitle, remove the last \n */
+            remove_last_newline(text);
+
+            /* we want all text to be UTF8 */
+            convert_subtitle_to_utf8(this->subtitles_encoding,(unsigned char*)text,ignore_non_utf8);
             size_t len = strlen(text);
             this->subtitles = (ff2theora_subtitle*)realloc(this->subtitles, (this->num_subtitles+1)*sizeof(ff2theora_subtitle));
             if (!this->subtitles) {

Modified: trunk/ffmpeg2theora/src/subtitles.h
===================================================================
--- trunk/ffmpeg2theora/src/subtitles.h	2008-06-17 11:05:54 UTC (rev 15045)
+++ trunk/ffmpeg2theora/src/subtitles.h	2008-06-17 11:08:24 UTC (rev 15046)
@@ -16,7 +16,7 @@
 #define SUPPORTED_ENCODINGS "utf-8, utf8, iso-8859-1, latin1"
 
 extern void add_kate_stream(ff2theora this);
-extern int load_subtitles(ff2theora_kate_stream *this);
+extern int load_subtitles(ff2theora_kate_stream *this, int ignore_non_utf8);
 extern void free_subtitles(ff2theora this);
 
 extern void set_subtitles_file(ff2theora this,const char *filename);
@@ -25,8 +25,5 @@
 extern void set_subtitles_encoding(ff2theora this,F2T_ENCODING encoding);
 extern void report_unknown_subtitle_encoding(const char *name);
 
-extern char *fgets2(char *s,size_t sz,FILE *f);
-extern double hmsms2s(int h,int m,int s,int ms);
-extern void convert_subtitle_to_utf8(F2T_ENCODING encoding,unsigned char *text);
 #endif
 

Modified: trunk/ffmpeg2theora/subtitles.txt
===================================================================
--- trunk/ffmpeg2theora/subtitles.txt	2008-06-17 11:05:54 UTC (rev 15045)
+++ trunk/ffmpeg2theora/subtitles.txt	2008-06-17 11:08:24 UTC (rev 15046)
@@ -1,9 +1,10 @@
-Subtitles can be embedded in an Ogg stream alongside a Theora video.
+Text subtitles can be embedded in an Ogg stream alongside a Theora video.
 
  * Overview
  * Subtitles related options
  * Converting non-utf-8 files to utf-8
  * Examples
+ * Playing subtitles
 
 
 
@@ -60,8 +61,14 @@
   converting other encoding to utf-8.
   If unspecified, the default is utf-8.
 
+--subtitles-ignore-non-utf8
+  Any invalid sequence in utf-8 text will be ignored. This may be useful
+  when using a utf-8 file with stray non utf-8 characters. This is not
+  a substitute for converting a non utf-8 file to utf-8, however, as the
+  non utf-8 sequences will be missing from the output stream.
 
 
+
  * Converting non-utf-8 files to utf-8
 
 If you have SubRip files in another format than utf-8, you can use the
@@ -129,3 +136,9 @@
                       input.avi
 
 
+ * Playing subtitles
+
+At the moment, only VLC has playback support for Kate streams. However, the
+libkate distribution includes patches for other players and media frameworks
+(MPlayer, GStreamer).
+



More information about the commits mailing list