[xiph-commits] r13126 - trunk/theora-tools/theoracomment

Sun Jun 10 17:44:18 PDT 2007

Author: tterribe
Date: 2007-06-10 17:44:18 -0700 (Sun, 10 Jun 2007)
New Revision: 13126

Added:
   trunk/theora-tools/theoracomment/charmaps.h
   trunk/theora-tools/theoracomment/charset.c
   trunk/theora-tools/theoracomment/charset.h
   trunk/theora-tools/theoracomment/theoracomment.1
   trunk/theora-tools/theoracomment/utf8.c
   trunk/theora-tools/theoracomment/utf8.h
Log:
Fix kfish's application of dkraft's UTF-8 support patch to include all the
 necessary files from vorbis-tools (and the new manpage) so that it actually
 builds.


Copied: trunk/theora-tools/theoracomment/charmaps.h (from rev 12515, trunk/vorbis-tools/share/charmaps.h)
===================================================================

--- trunk/theora-tools/theoracomment/charmaps.h	                        (rev 0)
+++ trunk/theora-tools/theoracomment/charmaps.h	2007-06-11 00:44:18 UTC (rev 13126)
@@ -0,0 +1,57 @@
+
+/*
+ * If you need to generate more maps, use makemap.c on a system
+ * with a decent iconv.
+ */
+
+/*
+ * Byte-to-code-point table for ISO-8859-2: entry N holds the 16-bit
+ * code point assigned to byte value N.  The 8-bit decoder in charset.c
+ * treats an entry of 0xffff as "no mapping"; this table has none.
+ */
+static const unsigned short mapping_iso_8859_2[256] = {
+  0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+  0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
+  0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+  0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
+  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+  0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
+  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+  0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
+  0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+  0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
+  0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+  0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
+  0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+  0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
+  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
+  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
+  0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
+  0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
+  0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
+  0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
+  0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
+  0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
+  0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
+  0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
+  0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
+  0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
+  0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
+  0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
+};
+
+/*
+ * Registry of table-driven 8-bit charsets, terminated by an entry whose
+ * name is null.  The array is not const because charset_find() stores
+ * the charset object it allocates for an entry in that entry's
+ * `charset` field the first time the name is requested.
+ */
+static struct {
+  const char *name;             /* charset name matched case-insensitively */
+  const unsigned short *map;    /* 256-entry byte -> code point table */
+  struct charset *charset;      /* 0 until created by charset_find() */
+} maps[] = {
+  { "ISO-8859-2", mapping_iso_8859_2, 0 },
+  { 0, 0, 0 }
+};
+
+/*
+ * Alias table consulted by charset_find() before any other lookup:
+ * a name equal (ignoring ASCII case) to `bad` is replaced by `good`.
+ * The list ends at the entry whose `bad` field is null.
+ */
+static const struct {
+  const char *bad;   /* name as it may be supplied by the caller */
+  const char *good;  /* name to look up instead */
+} names[] = {
+  { "ANSI_X3.4-1968", "us-ascii" },
+  { 0, 0 }
+};

Copied: trunk/theora-tools/theoracomment/charset.c (from rev 12515, trunk/vorbis-tools/share/charset.c)
===================================================================
--- trunk/theora-tools/theoracomment/charset.c	                        (rev 0)
+++ trunk/theora-tools/theoracomment/charset.c	2007-06-11 00:44:18 UTC (rev 13126)
@@ -0,0 +1,525 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * See the corresponding header file for a description of the functions
+ * that this file provides.
+ *
+ * This was first written for Ogg Vorbis but could be of general use.
+ *
+ * The only deliberate assumption about data sizes is that a short has
+ * at least 16 bits, but this code has only been tested on systems with
+ * 8-bit char, 16-bit short and 32-bit int.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
+
+#include <stdlib.h>
+
+#include "charset.h"
+
+#include "charmaps.h"
+
+/*
+ * This is like the standard strcasecmp, but it does not depend
+ * on the locale. Locale-dependent functions can be dangerous:
+ * we once had a bug involving strcasecmp("iso", "ISO") in a
+ * Turkish locale!
+ *
+ * (I'm not really sure what the official standard says
+ * about the sign of strcasecmp("Z", "["), but usually
+ * we're only interested in whether it's zero.)
+ */
+
+/*
+ * Compare S1 and S2, treating the ASCII letters 'a'-'z' as equal to
+ * their upper-case forms; no other bytes are folded and the locale is
+ * never consulted.  Returns 0 when the strings match, otherwise the
+ * difference of the first pair of differing bytes (as unsigned char).
+ */
+static int ascii_strcasecmp(const char *s1, const char *s2)
+{
+  char c1, c2;
+
+  for (;; s1++, s2++) {
+    /* Stop as soon as either string ends. */
+    if (!*s1 || !*s2)
+      break;
+    if (*s1 == *s2)
+      continue;
+    /* Fold both bytes to upper case before comparing them. */
+    c1 = *s1;
+    if ('a' <= c1 && c1 <= 'z')
+      c1 += 'A' - 'a';
+    c2 = *s2;
+    if ('a' <= c2 && c2 <= 'z')
+      c2 += 'A' - 'a';
+    if (c1 != c2)
+      break;
+  }
+  return (unsigned char)*s1 - (unsigned char)*s2;
+}
+
+/*
+ * UTF-8 equivalents of the C library's wctomb() and mbtowc().
+ */
+
+/*
+ * Decode one UTF-8 sequence from at most N bytes starting at S.
+ *
+ * Returns the number of bytes consumed (1 to 6); 0 when S is null, N is
+ * zero or the first byte is NUL; -1 when the sequence is malformed,
+ * truncated or not in shortest form.  If PWC is non-null it receives
+ * the decoded value (0 for a NUL byte).
+ */
+int utf8_mbtowc(int *pwc, const char *s, size_t n)
+{
+  const unsigned char *bytes = (const unsigned char *)s;
+  unsigned char lead;
+  int len, value, pos;
+
+  if (!s || !n)
+    return 0;
+
+  lead = bytes[0];
+
+  /* Single byte (ASCII). */
+  if (lead < 0x80) {
+    if (pwc)
+      *pwc = lead;
+    return lead ? 1 : 0;
+  }
+
+  /* Stray continuation byte, or an overlong two-byte lead (0xc0/0xc1). */
+  if (lead < 0xc2)
+    return -1;
+
+  /* Two-byte sequence: cannot be overlong once the lead is >= 0xc2. */
+  if (lead < 0xe0) {
+    if (n < 2 || (bytes[1] & 0xc0) != 0x80)
+      return -1;
+    if (pwc)
+      *pwc = ((lead & 0x1f) << 6) | (bytes[1] & 0x3f);
+    return 2;
+  }
+
+  /* Longer sequences: the lead byte fixes the total length. */
+  if (lead < 0xf0)
+    len = 3;
+  else if (lead < 0xf8)
+    len = 4;
+  else if (lead < 0xfc)
+    len = 5;
+  else if (lead < 0xfe)
+    len = 6;
+  else
+    return -1;
+
+  if (n < (size_t)len)
+    return -1;
+
+  /* The lead byte contributes its low (7 - len) bits. */
+  value = lead & ((1 << (7 - len)) - 1);
+  for (pos = 1; pos < len; pos++) {
+    if ((bytes[pos] & 0xc0) != 0x80)
+      return -1;
+    value = (value << 6) | (bytes[pos] & 0x3f);
+  }
+
+  /* Reject values that would have fitted in a shorter sequence. */
+  if (value < (1 << (5 * len - 4)))
+    return -1;
+
+  if (pwc)
+    *pwc = value;
+  return len;
+}
+
+/*
+ * Encode the value WC1 as UTF-8 into S, using the original 1- to 6-byte
+ * scheme (values up to 0x7fffffff).  No terminator is written.
+ *
+ * Returns the number of bytes stored, 0 if S is null, or -1 if WC1 is
+ * negative (i.e. does not fit in 31 bits).
+ *
+ * The range limits are unsigned constants: `1 << 31` on a signed int is
+ * undefined behaviour.
+ */
+int utf8_wctomb(char *s, int wc1)
+{
+  unsigned int wc = wc1;
+
+  if (!s)
+    return 0;
+  if (wc < (1U << 7)) {
+    *s++ = wc;
+    return 1;
+  }
+  else if (wc < (1U << 11)) {
+    *s++ = 0xc0 | (wc >> 6);
+    *s++ = 0x80 | (wc & 0x3f);
+    return 2;
+  }
+  else if (wc < (1U << 16)) {
+    *s++ = 0xe0 | (wc >> 12);
+    *s++ = 0x80 | ((wc >> 6) & 0x3f);
+    *s++ = 0x80 | (wc & 0x3f);
+    return 3;
+  }
+  else if (wc < (1U << 21)) {
+    *s++ = 0xf0 | (wc >> 18);
+    *s++ = 0x80 | ((wc >> 12) & 0x3f);
+    *s++ = 0x80 | ((wc >> 6) & 0x3f);
+    *s++ = 0x80 | (wc & 0x3f);
+    return 4;
+  }
+  else if (wc < (1U << 26)) {
+    *s++ = 0xf8 | (wc >> 24);
+    *s++ = 0x80 | ((wc >> 18) & 0x3f);
+    *s++ = 0x80 | ((wc >> 12) & 0x3f);
+    *s++ = 0x80 | ((wc >> 6) & 0x3f);
+    *s++ = 0x80 | (wc & 0x3f);
+    return 5;
+  }
+  else if (wc < (1U << 31)) {
+    *s++ = 0xfc | (wc >> 30);
+    *s++ = 0x80 | ((wc >> 24) & 0x3f);
+    *s++ = 0x80 | ((wc >> 18) & 0x3f);
+    *s++ = 0x80 | ((wc >> 12) & 0x3f);
+    *s++ = 0x80 | ((wc >> 6) & 0x3f);
+    *s++ = 0x80 | (wc & 0x3f);
+    return 6;
+  }
+  else
+    return -1;
+}
+
+/*
+ * The charset "object" and methods.
+ */
+
+/*
+ * A charset is a pair of conversion callbacks plus the private data
+ * they are handed as their first argument.
+ */
+struct charset {
+  int max;   /* value reported by charset_max(); sizes output buffers */
+  int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
+  int (*wctomb)(void *table, char *s, int wc);
+  void *map; /* passed as `table` to both callbacks; 0 when unused */
+};
+
+/* Decode one character from S using CHARSET's decoder callback. */
+int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
+{
+  void *table = charset->map;
+
+  return charset->mbtowc(table, pwc, s, n);
+}
+
+/* Encode WC into S using CHARSET's encoder callback. */
+int charset_wctomb(struct charset *charset, char *s, int wc)
+{
+  void *table = charset->map;
+
+  return charset->wctomb(table, s, wc);
+}
+
+/* Report the `max` field of CHARSET. */
+int charset_max(struct charset *charset)
+{
+  int longest = charset->max;
+
+  return longest;
+}
+
+/*
+ * Implementation of UTF-8.
+ */
+
+/* Decoder callback for UTF-8; the table argument is not used. */
+static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
+{
+  (void)map;
+  return utf8_mbtowc(pwc, s, n);
+}
+
+/* Encoder callback for UTF-8; the table argument is not used. */
+static int wctomb_utf8(void *map, char *s, int wc)
+{
+  (void)map;
+  return utf8_wctomb(s, wc);
+}
+
+/*
+ * Implementation of US-ASCII.
+ * Probably on most architectures this compiles to less than 256 bytes
+ * of code, so we can save space by not having a table for this one.
+ */
+
+/*
+ * Decoder callback for US-ASCII.  Returns 1 for a byte in 0x01..0x7f,
+ * 0 for a NUL byte, a null S or N == 0, and -1 for a byte above 0x7f.
+ * The byte value is stored through PWC when PWC is non-null.
+ */
+static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
+{
+  unsigned char byte;
+
+  (void)map;
+  if (!s || !n)
+    return 0;
+
+  byte = (unsigned char)*s;
+  if (byte > 0x7f)
+    return -1;
+
+  if (pwc)
+    *pwc = byte;
+  return byte != 0;
+}
+
+/*
+ * Encoder callback for US-ASCII.  Stores WC in *S and returns 1 when WC
+ * is in 0..0x7f, returns -1 for any other value and 0 when S is null.
+ */
+static int wctomb_ascii(void *map, char *s, int wc)
+{
+  (void)map;
+  if (!s)
+    return 0;
+  if (wc < 0 || wc > 0x7f)
+    return -1;
+
+  s[0] = wc;
+  return 1;
+}
+
+/*
+ * Implementation of ISO-8859-1.
+ * Probably on most architectures this compiles to less than 256 bytes
+ * of code, so we can save space by not having a table for this one.
+ */
+
+/*
+ * Decoder callback for ISO-8859-1: every byte is its own code point.
+ * Returns 1 for a non-NUL byte and 0 for a NUL byte, a null S or
+ * N == 0.  The byte value is stored through PWC when PWC is non-null.
+ *
+ * There is no failure return: the value comes from an unsigned char, so
+ * an out-of-range test (wc & ~0xff) could never be true and has been
+ * dropped.
+ */
+static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
+{
+  int wc;
+
+  if (!n || !s)
+    return 0;
+  wc = (unsigned char)*s;
+  if (pwc)
+    *pwc = wc;
+  return wc ? 1 : 0;
+}
+
+/*
+ * Encoder callback for ISO-8859-1.  Stores WC in *S and returns 1 when
+ * WC is in 0..0xff, returns -1 for any other value and 0 when S is null.
+ */
+static int wctomb_iso1(void *map, char *s, int wc)
+{
+  (void)map;
+  if (!s)
+    return 0;
+  if (wc < 0 || wc > 0xff)
+    return -1;
+
+  s[0] = wc;
+  return 1;
+}
+
+/*
+ * Implementation of any 8-bit charset.
+ */
+
+/* Private data of a table-driven 8-bit charset. */
+struct map {
+  const unsigned short *from;  /* byte -> code point; 0xffff = unmapped */
+  struct inverse_map *to;      /* code point -> byte; 0 until built */
+};
+
+/*
+ * Decoder callback for table-driven 8-bit charsets.  MAP1 is a
+ * struct map; the byte at S is looked up in its `from` table.
+ * Returns 1 for a mapped non-zero value, 0 for a value of zero, a null
+ * S or N == 0, and -1 when the table entry is 0xffff.
+ */
+static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
+{
+  const struct map *table = map1;
+  unsigned short code;
+
+  if (!s || !n)
+    return 0;
+
+  code = table->from[(unsigned char)*s];
+  if (code == 0xffff)
+    return -1;
+
+  if (pwc)
+    *pwc = (int)code;
+  return code != 0;
+}
+
+/*
+ * For the inverse map we use a hash table, which has the advantages
+ * of small constant memory requirement and simple memory allocation,
+ * but the disadvantage of slow conversion in the worst case.
+ * If you need real-time performance while letting a potentially
+ * malicious user define their own map, then the method used in
+ * linux/drivers/char/consolemap.c would be more appropriate.
+ */
+
+/*
+ * Chained hash table from code point to byte value, stored as two
+ * byte-indexed arrays.  A `next` value of 0 ends a chain.
+ */
+struct inverse_map {
+  unsigned char first[256];  /* first[h]: first byte in the chain for hash h */
+  unsigned char next[256];   /* next[b]: byte following b in its chain */
+};
+
+/*
+ * The simple hash is good enough for this application.
+ * Use the alternative trivial hashes for testing.
+ */
+#define HASH(i) ((i) & 0xff)
+/* #define HASH(i) 0 */
+/* #define HASH(i) 99 */
+
+/*
+ * Build the code point -> byte hash table for the forward table FROM
+ * (256 entries, 0xffff meaning "unmapped").
+ *
+ * Returns a malloc()ed table, or 0 if the allocation fails.
+ */
+static struct inverse_map *make_inverse_map(const unsigned short *from)
+{
+  struct inverse_map *to;
+  char used[256];
+  int i, j, k;
+
+  to = (struct inverse_map *)malloc(sizeof(struct inverse_map));
+  if (!to)
+    return 0;
+  for (i = 0; i < 256; i++)
+    to->first[i] = to->next[i] = used[i] = 0;
+  /*
+   * Insert every mapped byte at the head of the chain for its hash.
+   * Walking from 255 down to 0 means byte 0 is inserted last, so it can
+   * only ever be the head of a chain; that is what lets a `next` value
+   * of 0 serve as the end-of-chain marker.
+   */
+  for (i = 255; i >= 0; i--)
+    if (from[i] != 0xffff) {
+      k = HASH(from[i]);
+      to->next[i] = to->first[k];
+      to->first[k] = i;
+      used[k] = 1;
+    }
+
+  /* Point the empty buckets at an empty list. */
+  /*
+   * "Empty list" here is any byte i whose next[i] is 0: a lookup that
+   * starts there compares one entry and stops.  That entry hashes to a
+   * used bucket, so it cannot match a code point that hashes to an
+   * unused one.
+   */
+  for (i = 0; i < 256; i++)
+    if (!to->next[i])
+      break;
+  if (i < 256)
+    for (j = 0; j < 256; j++)
+      if (!used[j])
+	to->first[j] = i;
+
+  return to;
+}
+
+/*
+ * Encoder callback for table-driven 8-bit charsets.  MAP1 is a
+ * struct map.  Stores in *S the byte whose table entry equals WC1 and
+ * returns 1; returns -1 when WC1 does not fit in 16 bits or no byte
+ * maps to it, and 0 when S is null.
+ *
+ * The inverse hash table is created on first use.  If it cannot be
+ * allocated the forward table is searched linearly instead.
+ */
+int wctomb_8bit(void *map1, char *s, int wc1)
+{
+  struct map *table = map1;
+  unsigned short target = wc1;
+  int idx;
+
+  if (!s)
+    return 0;
+
+  if (wc1 & ~0xffff)
+    return -1;
+
+  if (1) /* Change 1 to 0 to test the case where malloc fails. */
+    if (!table->to)
+      table->to = make_inverse_map(table->from);
+
+  if (!table->to) {
+    /* We don't have an inverse map, so do a linear search. */
+    for (idx = 0; idx < 256; idx++) {
+      if (table->from[idx] == target) {
+	*s = idx;
+	return 1;
+      }
+    }
+    return -1;
+  }
+
+  /* Walk the hash chain; a `next` value of 0 ends it. */
+  idx = table->to->first[HASH(target)];
+  do {
+    if (table->from[idx] == target) {
+      *s = idx;
+      return 1;
+    }
+    idx = table->to->next[idx];
+  } while (idx);
+
+  return -1;
+}
+
+/*
+ * The "constructor" charset_find().
+ */
+
+/*
+ * Built-in charsets that need no lookup table.  Fields, in order:
+ * max, mbtowc callback, wctomb callback, map (0: unused).
+ */
+
+/* UTF-8: max is 6, matching the longest sequence utf8_wctomb() writes. */
+struct charset charset_utf8 = {
+  6,
+  &mbtowc_utf8,
+  &wctomb_utf8,
+  0
+};
+
+/* ISO-8859-1: one byte per character. */
+struct charset charset_iso1 = {
+  1,
+  &mbtowc_iso1,
+  &wctomb_iso1,
+  0
+};
+
+/* US-ASCII: one byte per character. */
+struct charset charset_ascii = {
+  1,
+  &mbtowc_ascii,
+  &wctomb_ascii,
+  0
+};
+
+/*
+ * Look up the charset named CODE (ASCII case is ignored).
+ *
+ * Names listed in names[] are first replaced by their preferred
+ * spelling.  "UTF-8", "US-ASCII" and "ISO-8859-1" return the built-in
+ * objects; any name in maps[] returns a table-driven charset that is
+ * allocated on the first request and stored in maps[] for later calls.
+ *
+ * Returns 0 when the name is unknown or when allocation fails.
+ */
+struct charset *charset_find(const char *code)
+{
+  struct charset *cs;
+  struct map *table;
+  int idx;
+
+  /* Find good (MIME) name. */
+  for (idx = 0; names[idx].bad; idx++) {
+    if (ascii_strcasecmp(code, names[idx].bad) == 0) {
+      code = names[idx].good;
+      break;
+    }
+  }
+
+  /* Recognise some charsets for which we avoid using a table. */
+  if (ascii_strcasecmp(code, "UTF-8") == 0)
+    return &charset_utf8;
+  if (ascii_strcasecmp(code, "US-ASCII") == 0)
+    return &charset_ascii;
+  if (ascii_strcasecmp(code, "ISO-8859-1") == 0)
+    return &charset_iso1;
+
+  /* Look for a mapping for a simple 8-bit encoding. */
+  for (idx = 0; maps[idx].name; idx++) {
+    if (ascii_strcasecmp(code, maps[idx].name) != 0)
+      continue;
+
+    /* Already built by an earlier call. */
+    if (maps[idx].charset)
+      return maps[idx].charset;
+
+    cs = (struct charset *)malloc(sizeof(struct charset));
+    if (!cs)
+      return 0;
+    table = (struct map *)malloc(sizeof(struct map));
+    if (!table) {
+      free(cs);
+      return 0;
+    }
+
+    table->from = maps[idx].map;
+    table->to = 0; /* inverse mapping is created when required */
+
+    cs->max = 1;
+    cs->mbtowc = &mbtowc_8bit;
+    cs->wctomb = &wctomb_8bit;
+    cs->map = table;
+
+    maps[idx].charset = cs;
+    return cs;
+  }
+
+  return 0;
+}
+
+/*
+ * Function to convert a buffer from one encoding to another.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ * Each of TO and TOLEN may be zero, if the result is not needed.
+ * The output buffer is null-terminated, so it is all right to
+ * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
+ */
+
+/*
+ * Convert FROMLEN bytes at FROM from charset FROMCODE to charset TOCODE.
+ *
+ * On success *TO (if TO is non-null) receives a malloc()ed,
+ * null-terminated buffer that the caller must free, and *TOLEN (if
+ * TOLEN is non-null) its length without the terminator.
+ *
+ * Returns:
+ *   -2  memory allocation failed, or the output size would overflow
+ *   -1  FROMCODE or TOCODE is not a known charset
+ *    0  converted exactly
+ *    1  some characters were replaced by '?'
+ *    2  some invalid input bytes were replaced by '#'
+ */
+int charset_convert(const char *fromcode, const char *tocode,
+		    const char *from, size_t fromlen,
+		    char **to, size_t *tolen)
+{
+  int ret = 0;
+  struct charset *charset1, *charset2;
+  char *tobuf, *p, *newbuf;
+  size_t width;
+  int i, j, wc;
+
+  charset1 = charset_find(fromcode);
+  charset2 = charset_find(tocode);
+  if (!charset1 || !charset2 )
+    return -1;
+
+  /*
+   * Each input byte yields at most one character, i.e. at most `max`
+   * output bytes.  Refuse inputs for which fromlen * max + 1 would wrap
+   * around size_t and under-allocate the buffer.
+   */
+  width = (size_t)charset2->max;
+  if (width && fromlen > ((size_t)-1 - 1) / width)
+    return -2;
+
+  tobuf = (char *)malloc(fromlen * width + 1);
+  if (!tobuf)
+    return -2;
+
+  for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
+    i = charset_mbtowc(charset1, &wc, from, fromlen);
+    if (!i)
+      i = 1; /* a NUL byte: wc is 0, consume it and copy it through */
+    else if (i == -1) {
+      /* Invalid input: skip one byte and emit '#'. */
+      i  = 1;
+      wc = '#';
+      ret = 2;
+    }
+    j = charset_wctomb(charset2, p, wc);
+    if (j == -1) {
+      /* Not representable in the target charset: emit '?'. */
+      if (!ret)
+	ret = 1;
+      j = charset_wctomb(charset2, p, '?');
+      if (j == -1)
+	j = 0;
+    }
+  }
+
+  if (tolen)
+    *tolen = p - tobuf;
+  *p++ = '\0';
+  if (to) {
+    /* Shrink to fit; keep the original buffer if realloc fails. */
+    newbuf = realloc(tobuf, p - tobuf);
+    *to = newbuf ? newbuf : tobuf;
+  }
+  else
+    free(tobuf);
+
+  return ret;
+}
+
+#endif /* !HAVE_ICONV (should be USE_CHARSET_CONVERT) */

Copied: trunk/theora-tools/theoracomment/charset.h (from rev 12515, trunk/vorbis-tools/share/charset.h)
===================================================================
--- trunk/theora-tools/theoracomment/charset.h	                        (rev 0)
+++ trunk/theora-tools/theoracomment/charset.h	2007-06-11 00:44:18 UTC (rev 13126)
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <stdlib.h>
+
+/*
+ * These functions are like the C library's mbtowc() and wctomb(),
+ * but instead of depending on the locale they always work in UTF-8,
+ * and they use int instead of wchar_t.
+ */
+
+int utf8_mbtowc(int *pwc, const char *s, size_t n);
+int utf8_wctomb(char *s, int wc);
+
+/*
+ * This is an object-oriented version of mbtowc() and wctomb().
+ * The caller first uses charset_find() to get a pointer to struct
+ * charset, then uses the mbtowc() and wctomb() methods on it.
+ * The function charset_max() gives the maximum length of a
+ * multibyte character in that encoding.
+ * This API is only appropriate for stateless encodings like UTF-8
+ * or ISO-8859-3, but I have no intention of implementing anything
+ * other than UTF-8 and 8-bit encodings.
+ *
+ * MINOR BUG: If there is no memory charset_find() may return 0 and
+ * there is no way to distinguish this case from an unknown encoding.
+ */
+
+struct charset;
+
+struct charset *charset_find(const char *code);
+
+int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
+int charset_wctomb(struct charset *charset, char *s, int wc);
+int charset_max(struct charset *charset);
+
+/*
+ * Function to convert a buffer from one encoding to another.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ * Each of TO and TOLEN may be zero if the result is not wanted.
+ * The input or output may contain null bytes, but the output
+ * buffer is also null-terminated, so it is all right to
+ * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
+ *
+ * Return value:
+ *
+ *  -2 : memory allocation failed
+ *  -1 : unknown encoding
+ *   0 : data was converted exactly
+ *   1 : valid data was converted approximately (using '?')
+ *   2 : input was invalid (but still converted, using '#')
+ */
+
+int charset_convert(const char *fromcode, const char *tocode,
+		    const char *from, size_t fromlen,
+		    char **to, size_t *tolen);

Added: trunk/theora-tools/theoracomment/theoracomment.1
===================================================================
--- trunk/theora-tools/theoracomment/theoracomment.1	                        (rev 0)
+++ trunk/theora-tools/theoracomment/theoracomment.1	2007-06-11 00:44:18 UTC (rev 13126)
@@ -0,0 +1,89 @@
+.\" Process this file with
+.\" groff -man -Tascii theoracomment.1
+.\"
+.TH THEORACOMMENT 1 "June 8, 2007" "Xiph.org Foundation" "Theora Tools"
+
+.SH NAME
+theoracomment \- List or edit comments in Ogg Theora files
+
+.SH SYNOPSIS
+.B theoracomment
+.RB [ -l ]
+.I file.ogg
+.br
+.B theoracomment
+.B -a
+.B [ -c commentfile | -t \*(lqname=value\*(rq ]
+.RB [ -q ]
+.I in.ogg
+.I [out.ogg]
+.br
+.B theoracomment
+.B -w
+.B [ -c commentfile | -t \*(lqname=value\*(rq ]
+.RB [ -q ]
+.I in.ogg
+.I [out.ogg]
+
+.SH DESCRIPTION
+.B theoracomment
+reads, modifies, and appends metadata tags (comments) in Ogg Theora video files.
+
+.SH OPTIONS
+.IP "-a, --append"
+Append comments.
+.IP "-c file, --commentfile file"
+Take comments from a file. The file uses the same format as the output of the -l option: one element per line in 'name=value' format.
+.IP "-h, --help"
+Show command help.
+.IP "-l, --list"
+List the comments in the Ogg Theora file.
+.IP "-q, --quiet"
+Quiet mode.  No messages are displayed.
+.IP "-t 'name=value', --tag 'name=value'"
+Specify a new tag on the command line. Each tag is given as a single string. The part before the '=' is treated as the tag name and the part after as the value.
+.IP "-w, --write"
+Replace comments with the new set given either on the command line with -t or from a file with -c.
+.IP "-R, --raw"
+Read and write comments in UTF-8, rather than converting to the user's character set.
+.IP "-V, --version"
+Display the version of theoracomment.
+
+.\" Examples go here
+.SH EXAMPLES
+
+To just see what comment tags are in a file:
+
+    theoracomment -l file.ogg
+
+To edit those comments:
+
+    theoracomment -l file.ogg > file.txt
+    [edit the comments in file.txt to your satisfaction]
+    theoracomment -w -c file.txt file.ogg newfile.ogg
+
+To simply add a comment:
+
+    theoracomment -a -t 'ARTIST=No One You Know' file.ogg newfile.ogg
+
+.SH TAG FORMAT
+
+See http://xiph.org/ogg/theora/doc/v-comment.html for documentation on the Ogg Theora tag format, including a suggested list of canonical tag names.
+
+.SH AUTHORS
+
+.TP
+Program Authors:
+.br
+Daniel Kraft <d at domob.eu>
+.br
+
+.TP
+Manpage Author:
+.br
+Daniel Kraft <d at domob.eu>
+
+.SH "SEE ALSO"
+
+.PP
+\fBtheoraenc\fR(1)

Copied: trunk/theora-tools/theoracomment/utf8.c (from rev 12515, trunk/vorbis-tools/share/utf8.c)
===================================================================
--- trunk/theora-tools/theoracomment/utf8.c	                        (rev 0)
+++ trunk/theora-tools/theoracomment/utf8.c	2007-06-11 00:44:18 UTC (rev 13126)
@@ -0,0 +1,324 @@
+/*
+ * Copyright (C) 2001 Peter Harris <peter.harris at hummingbird.com>
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "utf8.h"
+
+
+#ifdef _WIN32
+
+	/* Thanks to Peter Harris <peter.harris at hummingbird.com> for this win32
+	 * code.
+	 */
+
+#include <stdio.h>
+#include <windows.h>
+
+/*
+ * Convert the zero-terminated wide string UNICODE to a newly allocated,
+ * null-terminated UTF-8 string.  Each element is narrowed to an
+ * unsigned short first, so the output uses at most three bytes per
+ * element.  Returns NULL if the allocation fails; the caller frees the
+ * result.
+ */
+static unsigned char *make_utf8_string(const wchar_t *unicode)
+{
+    const wchar_t *src;
+    unsigned char *out, *dst;
+    unsigned short c;
+    int size = 0;
+
+    /* first calculate the size of the target string */
+    for(src = unicode; (c = *src) != 0; src++) {
+        if(c < 0x0080)
+            size += 1;
+        else if(c < 0x0800)
+            size += 2;
+        else
+            size += 3;
+    }
+
+    out = malloc(size + 1);
+    if (out == NULL)
+        return NULL;
+
+    /* then encode, one element at a time */
+    dst = out;
+    for(src = unicode; (c = *src) != 0; src++) {
+        if(c < 0x080) {
+            *dst++ = (unsigned char)c;
+        } else if(c < 0x800) {
+            *dst++ = 0xc0 | (c >> 6);
+            *dst++ = 0x80 | (c & 0x3f);
+        } else {
+            *dst++ = 0xe0 | (c >> 12);
+            *dst++ = 0x80 | ((c >> 6) & 0x3f);
+            *dst++ = 0x80 | (c & 0x3f);
+        }
+    }
+    *dst = 0x00;
+
+    return out;
+}
+
+/*
+ * Convert the null-terminated UTF-8 string UTF8 to a newly allocated,
+ * zero-terminated wide string.  Only 1-, 2- and 3-byte sequences are
+ * understood: a lead byte with its top three bits set is treated as
+ * starting a 3-byte sequence, any other byte >= 0x80 as starting a
+ * 2-byte sequence.  Continuation bytes are not validated beyond being
+ * non-NUL.
+ *
+ * A sequence cut short by the end of the string ends the conversion at
+ * that point.  The first pass checks for this, so neither pass can step
+ * over the terminating NUL and read past the end of the input.
+ *
+ * Returns NULL if the allocation fails; the caller frees the result.
+ */
+static wchar_t *make_unicode_string(const unsigned char *utf8)
+{
+    size_t size = 0, index = 0, out_index;
+    wchar_t *out;
+    unsigned char c;
+
+    /* first calculate the size of the target string */
+    while((c = utf8[index]) != 0) {
+        size_t extra, k;
+
+        if((c & 0x80) == 0) {
+            extra = 0;
+        } else if((c & 0xe0) == 0xe0) {
+            extra = 2;
+        } else {
+            extra = 1;
+        }
+
+        /* make sure every continuation byte exists before skipping it */
+        for(k = 1; k <= extra && utf8[index + k] != 0; k++)
+            ;
+        if(k <= extra)
+            break;
+
+        index += extra + 1;
+        size += 1;
+    }
+
+    out = malloc((size + 1) * sizeof(wchar_t));
+    if (out == NULL)
+        return NULL;
+
+    /* decode exactly the characters counted above */
+    index = 0;
+    for(out_index = 0; out_index < size; out_index++) {
+        c = utf8[index++];
+        if((c & 0x80) == 0) {
+            out[out_index] = c;
+        } else if((c & 0xe0) == 0xe0) {
+            out[out_index] = (c & 0x1F) << 12;
+            c = utf8[index++];
+            out[out_index] |= (c & 0x3F) << 6;
+            c = utf8[index++];
+            out[out_index] |= (c & 0x3F);
+        } else {
+            out[out_index] = (c & 0x3F) << 6;
+            c = utf8[index++];
+            out[out_index] |= (c & 0x3F);
+        }
+    }
+    out[size] = 0;
+
+    return out;
+}
+
+/*
+ * Win32: convert FROM from the ANSI code page (CP_ACP) to UTF-8.
+ *
+ * On success stores a malloc()ed, null-terminated string in *TO (the
+ * caller frees it) and returns 0.  Returns -1 after printing a message
+ * to stderr if the conversion or an allocation fails.
+ */
+int utf8_encode(const char *from, char **to)
+{
+	wchar_t *unicode;
+	int wchars, err;
+	size_t fromlen = strlen(from);
+
+	/* MultiByteToWideChar() fails when asked to convert zero bytes,
+	 * so an empty string is handled here.
+	 */
+	if(fromlen == 0)
+	{
+		*to = calloc(1, sizeof(char));
+		if(*to == NULL)
+		{
+			fprintf(stderr, "Out of memory processing string to UTF8\n");
+			return -1;
+		}
+		return 0;
+	}
+
+	wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
+			(int)fromlen, NULL, 0);
+
+	if(wchars == 0)
+	{
+		fprintf(stderr, "Unicode translation error %lu\n",
+				(unsigned long)GetLastError());
+		return -1;
+	}
+
+	/* One extra zeroed element terminates the wide string. */
+	unicode = calloc(wchars + 1, sizeof *unicode);
+	if(unicode == NULL) 
+	{
+		fprintf(stderr, "Out of memory processing string to UTF8\n");
+		return -1;
+	}
+
+	err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, 
+			(int)fromlen, unicode, wchars);
+	if(err != wchars)
+	{
+		/* Read the error code before free() can disturb it. */
+		unsigned long code = (unsigned long)GetLastError();
+
+		free(unicode);
+		fprintf(stderr, "Unicode translation error %lu\n", code);
+		return -1;
+	}
+
+	/* On NT-based windows systems, we could use WideCharToMultiByte(), but 
+	 * MS doesn't actually have a consistent API across win32.
+	 */
+	*to = (char *)make_utf8_string(unicode);
+
+	free(unicode);
+	if(*to == NULL)
+	{
+		fprintf(stderr, "Out of memory processing string to UTF8\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Win32: convert the UTF-8 string FROM to the console code page.
+ *
+ * On success stores a malloc()ed, null-terminated string in *TO (the
+ * caller frees it) and returns 0.  Returns -1 after printing a message
+ * to stderr if the conversion or an allocation fails.
+ */
+int utf8_decode(const char *from, char **to)
+{
+    wchar_t *unicode;
+    int chars, err;
+
+    /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but 
+     * MS doesn't actually have a consistent API across win32.
+     */
+    unicode = make_unicode_string((const unsigned char *)from);
+    if(unicode == NULL) 
+    {
+        fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
+        return -1;
+    }
+
+    /* A length of -1 makes the call count the terminator as well. */
+    chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
+            -1, NULL, 0, NULL, NULL);
+
+    if(chars == 0)
+    {
+        fprintf(stderr, "Unicode translation error %lu\n",
+                (unsigned long)GetLastError());
+        free(unicode);
+        return -1;
+    }
+
+    *to = calloc(chars + 1, sizeof(unsigned char));
+    if(*to == NULL) 
+    {
+        fprintf(stderr, "Out of memory processing string to local charset\n");
+        free(unicode);
+        return -1;
+    }
+
+    err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode, 
+            -1, *to, chars, NULL, NULL);
+    if(err != chars)
+    {
+        fprintf(stderr, "Unicode translation error %lu\n",
+                (unsigned long)GetLastError());
+        free(unicode);
+        free(*to);
+        *to = NULL;
+        return -1;
+    }
+
+    free(unicode);
+    return 0;
+}
+
+#else /* End win32. Rest is for real operating systems */
+
+
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
+
+int iconvert(const char *fromcode, const char *tocode,
+	     const char *from, size_t fromlen,
+	     char **to, size_t *tolen);
+
+static char *current_charset = 0; /* means "US-ASCII" */
+
+/*
+ * Select the charset used by utf8_encode() and utf8_decode().
+ *
+ * When CHARSET is null the name is taken from the environment variable
+ * CHARSET and then, if available, from nl_langinfo(CODESET).  The
+ * previously stored name is freed; a missing or empty name leaves
+ * current_charset null.
+ */
+void convert_set_charset(const char *charset)
+{
+  const char *name = charset;
+
+  if (!name)
+    name = getenv("CHARSET");
+
+#ifdef HAVE_LANGINFO_CODESET
+  if (!name)
+    name = nl_langinfo(CODESET);
+#endif
+
+  free(current_charset);
+  current_charset = (name && *name) ? strdup(name) : 0;
+}
+
+#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
+/*
+ * Implemented in charset.c.  Declared here because this file does not
+ * include charset.h; without a prototype the call below would be an
+ * implicit declaration, which C99 and later reject.
+ */
+int charset_convert(const char *fromcode, const char *tocode,
+		    const char *from, size_t fromlen,
+		    char **to, size_t *tolen);
+#endif
+
+/*
+ * Convert FROMLEN bytes at FROM from FROMCODE to TOCODE using iconvert()
+ * when HAVE_ICONV is defined and charset_convert() otherwise.  The
+ * converter's return value is passed through unchanged.
+ */
+static int convert_buffer(const char *fromcode, const char *tocode,
+			  const char *from, size_t fromlen,
+			  char **to, size_t *tolen)
+{
+  int ret = -1;
+
+#ifdef HAVE_ICONV
+  ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
+#endif
+
+#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
+  ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
+#endif
+
+  return ret;
+}
+
+/*
+ * Convert the null-terminated string FROM from FROMCODE to TOCODE and
+ * store the malloc()ed result in *TO.
+ *
+ * A result of -2 from convert_buffer() is returned as -1.  A result of
+ * -1 triggers a fallback: FROM is copied with every byte outside
+ * 0x00..0x7f replaced by REPLACE, and 3 is returned (-1 if that copy
+ * cannot be allocated).  Any other result is returned unchanged.
+ */
+static int convert_string(const char *fromcode, const char *tocode,
+			  const char *from, char **to, char replace)
+{
+  size_t len = strlen(from);
+  int status = convert_buffer(fromcode, tocode, from, len, to, 0);
+  char *copy, *p;
+
+  if (status == -2)
+    return -1;
+  if (status != -1)
+    return status;
+
+  copy = malloc(len + 1);
+  if (!copy)
+    return -1;
+  memcpy(copy, from, len + 1);
+
+  for (p = copy; *p; p++) {
+    if (*p & ~0x7f)
+      *p = replace;
+  }
+
+  *to = copy;
+  return 3;
+}
+
+/*
+ * Convert FROM from the current charset to UTF-8, storing the
+ * malloc()ed result in *TO.  The charset is determined on first use
+ * and defaults to US-ASCII; the fallback replacement character is '#'.
+ */
+int utf8_encode(const char *from, char **to)
+{
+  const char *source;
+
+  if (!current_charset)
+    convert_set_charset(0);
+
+  source = current_charset;
+  if (!source)
+    source = "US-ASCII";
+
+  return convert_string(source, "UTF-8", from, to, '#');
+}
+
+/*
+ * Convert FROM from UTF-8 to the current charset, storing the
+ * malloc()ed result in *TO.  The charset is determined on first use
+ * and defaults to US-ASCII; the fallback replacement character is '?'.
+ *
+ * An empty input bypasses the converter: *TO receives an empty string
+ * and 1 is returned.  Returns -1 if that string cannot be allocated.
+ */
+int utf8_decode(const char *from, char **to)
+{
+  char *charset;
+
+  if(*from == 0) {
+      *to = malloc(1);
+      if (!*to)
+          return -1;
+      **to = 0;
+      return 1;
+  }
+
+  if (!current_charset)
+    convert_set_charset(0);
+  charset = current_charset ? current_charset : "US-ASCII";
+  return convert_string("UTF-8", charset, from, to, '?');
+}
+
+#endif

Copied: trunk/theora-tools/theoracomment/utf8.h (from rev 12515, trunk/vorbis-tools/include/utf8.h)
===================================================================
--- trunk/theora-tools/theoracomment/utf8.h	                        (rev 0)
+++ trunk/theora-tools/theoracomment/utf8.h	2007-06-11 00:44:18 UTC (rev 13126)
@@ -0,0 +1,36 @@
+
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ *
+ * If the locale's charset is not set explicitly then it is taken
+ * from the environment variable CHARSET or, failing that, from
+ * nl_langinfo(CODESET) where available; otherwise US-ASCII is assumed.
+ *
+ * Return value of conversion functions:
+ *
+ *  -1 : memory allocation failed
+ *   0 : data was converted exactly
+ *   1 : valid data was converted approximately (using '?')
+ *   2 : input was invalid (but still converted, using '#')
+ *   3 : unknown encoding (but still converted, using '?')
+ */
+
+#ifndef __UTF8_H
+#define __UTF8_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+void convert_set_charset(const char *charset);
+
+int utf8_encode(const char *from, char **to);
+int utf8_decode(const char *from, char **to);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* __UTF8_H */