[xiph-cvs] cvs commit: vorbis-tools/vorbiscomment vcomment.c

Mon Oct 1 20:03:45 PDT 2001

msmith      01/10/01 20:03:44

  Modified:    .        acinclude.m4 configure.in
               include  utf8.h
               oggenc   oggenc.c
               share    Makefile.am utf8.c
               vorbiscomment vcomment.c
  Added:       share    charmaps.h charset.c charset_test.c iconvert.c
                        makemap.c
  Removed:     share    8859-1.map 8859-2.map make_code_map.pl
  Log:
  BIG patch for sane and complete UTF conversion code (except on win32, where
  it's not yet complete, and probably doesn't compile any more), from
  Edmund Grimley Evans <edmundo at rano.org>
  
  PLEASE test this thoroughly, everyone.

Revision  Changes    Path
1.12      +16 -0     vorbis-tools/acinclude.m4

Index: acinclude.m4
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/acinclude.m4,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12

--- acinclude.m4	2001/08/21 14:05:09	1.11
+++ acinclude.m4	2001/10/02 03:03:41	1.12
@@ -430,3 +430,19 @@
   fi
   AC_SUBST(LIBICONV)
 ])
+
+dnl From Bruno Haible.
+dnl
+AC_DEFUN([AM_LANGINFO_CODESET],
+[
+  AC_CACHE_CHECK([for nl_langinfo and CODESET], am_cv_langinfo_codeset,
+    [AC_TRY_LINK([#include <langinfo.h>],
+      [char* cs = nl_langinfo(CODESET);],
+      am_cv_langinfo_codeset=yes,
+      am_cv_langinfo_codeset=no)
+    ])
+  if test $am_cv_langinfo_codeset = yes; then
+    AC_DEFINE(HAVE_LANGINFO_CODESET, 1,
+      [Define if you have <langinfo.h> and nl_langinfo(CODESET).])
+  fi
+])

1.31      +1 -0      vorbis-tools/configure.in

Index: configure.in
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/configure.in,v
retrieving revision 1.30
retrieving revision 1.31
diff -u -r1.30 -r1.31
--- configure.in	2001/09/23 01:59:41	1.30
+++ configure.in	2001/10/02 03:03:41	1.31
@@ -111,6 +111,7 @@
 
 AM_ICONV
 AC_FUNC_SMMAP
+AM_LANGINFO_CODESET
 
 dnl --------------------------------------------------
 dnl Work around FHS stupidity

1.2       +18 -13    vorbis-tools/include/utf8.h

Index: utf8.h
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/include/utf8.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- utf8.h	2001/09/22 22:49:49	1.1
+++ utf8.h	2001/10/02 03:03:41	1.2
@@ -1,18 +1,23 @@
-/* OggEnc
+
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ *
+ * If the locale's charset is not set explicitly then it is
+ * obtained using nl_langinfo(CODESET), where available, the
+ * environment variable CHARSET, or assumed to be US-ASCII.
  *
- * This program is distributed under the GNU General Public License, version 2.
- * A copy of this license is included with this source.
+ * Return value of conversion functions:
  *
- * Copyright © 2001, Daniel Resare <noa at metamatrix.se>
+ *  -1 : memory allocation failed
+ *   0 : data was converted exactly
+ *   1 : valid data was converted approximately (using '?')
+ *   2 : input was invalid (but still converted, using '#')
+ *   3 : unknown encoding (but still converted, using '?')
  */
 
-typedef struct
-{
-	char* name;
-	int mapping[256];
-} charset_map;
+void convert_set_charset(const char *charset);
 
-charset_map *get_map(const char *encoding);
-char *make_utf8_string(const unsigned short *unicode);
-int simple_utf8_encode(const char *from, char **to, const char *encoding);
-int utf8_encode(char *from, char **to, const char *encoding);
+int utf8_encode(const char *from, char **to);
+int utf8_decode(const char *from, char **to);

1.32      +6 -9      vorbis-tools/oggenc/oggenc.c

Index: oggenc.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/oggenc.c,v
retrieving revision 1.31
retrieving revision 1.32
diff -u -r1.31 -r1.32
--- oggenc.c	2001/09/30 00:01:53	1.31
+++ oggenc.c	2001/10/02 03:03:42	1.32
@@ -15,6 +15,7 @@
 #include <getopt.h>
 #include <string.h>
 #include <time.h>
+#include <locale.h>
 
 #include "platform.h"
 #include "encode.h"
@@ -50,7 +51,6 @@
         {"date",1,0,'d'},
         {"tracknum",1,0,'N'},
         {"serial",1,0,'s'},
-	{"encoding",1,0,'e'},
         {NULL,0,0,0}
 };
         
@@ -75,6 +75,8 @@
         int numfiles;
         int errors=0;
 
+	setlocale(LC_ALL, "");
+
         parse_options(argc, argv, &opt);
 
         if(optind >= argc)
@@ -320,8 +322,6 @@
                 " -s, --serial         Specify a serial number for the stream. If encoding\n"
                 "                      multiple files, this will be incremented for each\n"
                 "                      stream after the first.\n"
-		" -e, --encoding       Specify an encoding for the comments given (not\n"
-		"                      supported on windows)\n"
                 "\n"
                 " Naming:\n"
                 " -o, --output=fn      Write file to fn (only valid in single-file mode)\n"
@@ -477,7 +477,7 @@
         int ret;
         int option_index = 1;
 
-	while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:e:G:hl:m:M:n:N:o:P:q:QrR:s:t:vX:", 
+	while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:G:hl:m:M:n:N:o:P:q:QrR:s:t:vX:", 
                                         long_options, &option_index)) != -1)
         {
                 switch(ret)
@@ -498,9 +498,6 @@
                                 opt->dates = realloc(opt->dates, (++opt->date_count)*sizeof(char *));
                                 opt->dates[opt->date_count - 1] = strdup(optarg);
                                 break;
-			case 'e':
-				opt->encoding = strdup(optarg);
-				break;
             case 'G':
                 opt->genre = realloc(opt->genre, (++opt->genre_count)*sizeof(char *));
                 opt->genre[opt->genre_count - 1] = strdup(optarg);
@@ -646,7 +643,7 @@
 static void add_tag(vorbis_comment *vc, oe_options *opt,char *name, char *value)
 {
         char *utf8;
-	if(utf8_encode(value, &utf8, opt->encoding) == 0)
+	if(utf8_encode(value, &utf8) >= 0)
         {
                 if(name == NULL)
                         vorbis_comment_add(vc, utf8);
@@ -655,7 +652,7 @@
                 free(utf8);
         }
         else
-		fprintf(stderr, "Couldn't convert comment to UTF8, cannot add\n");
+		fprintf(stderr, "Couldn't convert comment to UTF-8, cannot add\n");
 }
 
 static void build_comments(vorbis_comment *vc, oe_options *opt, int filenum, 

1.3       +2 -3      vorbis-tools/share/Makefile.am

Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/Makefile.am,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Makefile.am	2001/09/22 23:13:50	1.2
+++ Makefile.am	2001/10/02 03:03:42	1.3
@@ -6,12 +6,11 @@
 
 noinst_LIBRARIES = libutf8.a libgetopt.a
 
-libutf8_a_SOURCES = utf8.c
-MAP_FILES = 8859-1.map 8859-2.map
+libutf8_a_SOURCES = charset.c iconvert.c utf8.c
 
 libgetopt_a_SOURCES = getopt.c getopt1.c
 
-EXTRA_DIST = $(MAP_FILES) charsetmap.h make_code_map.pl
+EXTRA_DIST = charmaps.h makemap.c charset_test.c
 
 debug:
         $(MAKE) all CFLAGS="@DEBUG@"

1.3       +98 -217   vorbis-tools/share/utf8.c

Index: utf8.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/utf8.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- utf8.c	2001/09/25 08:59:54	1.2
+++ utf8.c	2001/10/02 03:03:42	1.3
@@ -1,30 +1,40 @@
-/* OggEnc
- *
- * This program is distributed under the GNU General Public License, version 2.
- * A copy of this license is included with this source.
- *
- * (C) 2001 Michael Smith <msmith at labyrinth.net.au>
+/*
+ * Copyright (C) 2001 Peter Harris <peter.harris at hummingbird.com>
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
  *
- * UTF-8 Conversion routines
- *   Copyright (C) 2001, Daniel Resare <noa at metamatrix.se>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <stdio.h>
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ */
+
 #include <stdlib.h>
 #include <string.h>
+
 #include "utf8.h"
 
 
 #ifdef _WIN32
+#include <stdio.h>
 #include <windows.h>
 
-int utf8_encode(char *from, char **to, const char *encoding)
+int utf8_encode(const char *from, char **to)
 {
         /* Thanks to Peter Harris <peter.harris at hummingbird.com> for this win32
          * code.
-	 *
-	 * We ignore 'encoding' and assume that the input is in the 'code page'
-	 * of the console. Reasonable, since oggenc is a console app.
          */
 
         unsigned short *unicode;
@@ -36,14 +46,14 @@
         if(wchars == 0)
         {
                 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
-		return 1;
+		return -1;
         }
 
         unicode = calloc(wchars + 1, sizeof(unsigned short));
         if(unicode == NULL) 
         {
                 fprintf(stderr, "Out of memory processing string to UTF8\n");
-		return 1;
+		return -1;
         }
 
         err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, 
@@ -52,7 +62,7 @@
         {
                 free(unicode);
                 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
-		return 1;
+		return -1;
         }
 
         /* On NT-based windows systems, we could use WideCharToMultiByte(), but 
@@ -64,234 +74,105 @@
         return 0;
 }
 
-int utf8_decode(char *from, char **to, const char *encoding)
+int utf8_decode(const char *from, char **to)
 {
-	return 1;  /* Dummy stub */
+	return -1;  /* Dummy stub */
 }
 
 #else /* End win32. Rest is for real operating systems */
 
-#ifdef HAVE_ICONV
-#include <iconv.h>
-#include <errno.h>
-#endif
 
-#include "charsetmap.h"
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
 
-#define BUFSIZE 256
+int iconvert(const char *fromcode, const char *tocode,
+	     const char *from, size_t fromlen,
+	     char **to, size_t *tolen);
 
-/*
- Converts the string FROM from the encoding specified in ENCODING
- to UTF-8. The resulting string i pointed to by *TO.
+static char *current_charset = 0; /* means "US-ASCII" */
 
- Return values:
- 0 indicates a successfully converted string.
- 1 indicates that the given encoding is not available.
- 2 indicates that the given string is bigger than BUFSIZE and can therefore
-   not be encoded.
- 3 indicates that given string could not be parsed.
-*/
-int utf8_encode(char *from, char **to, const char *encoding)
+void convert_set_charset(const char *charset)
 {
-#ifdef HAVE_ICONV
-	static unsigned char buffer[BUFSIZE];
-    char *from_p, *to_p;
-	size_t from_left, to_left;
-	iconv_t cd;
+
+#ifdef HAVE_LANGINFO_CODESET
+  if (!charset)
+    charset = nl_langinfo(CODESET);
 #endif
 
-	if (!strcasecmp(encoding, "UTF-8")) {
-	    /* ideally some checking of the given string should be done */
-		*to = malloc(strlen(from) + 1);
-		strcpy(*to, from);
-		return 0;
-	}
+  if (!charset)
+    charset = getenv("CHARSET");
 
-#ifdef HAVE_ICONV
-	cd = iconv_open("UTF-8", encoding);
-	if(cd == (iconv_t)(-1))
-	{
-		if(errno == EINVAL) {
-			/* if iconv can't encode from this encoding, try
-			 * simple_utf8_encode()
-			 */
-			return simple_utf8_encode(from, to, encoding);
-		} else {
-			perror("iconv_open");
-		}
-	}
-	
-	from_left = strlen(from);
-	to_left = BUFSIZE;
-	from_p = from;
-	to_p = buffer;
-	
-	if(iconv(cd, (ICONV_CONST char **)(&from_p), &from_left, &to_p, 
-				&to_left) == (size_t)-1)
-	{
-		iconv_close(cd);
-		switch(errno)
-		{
-		case E2BIG:
-			/* if the buffer is too small, try simple_utf8_encode()
-			 */
-			return simple_utf8_encode(from, to, encoding);
-		case EILSEQ:
-		case EINVAL:
-			return 3;
-		default:
-			perror("iconv");
-		}
-	}
-	else
-	{
-		iconv_close(cd);
-	}
-	*to = malloc(BUFSIZE - to_left + 1);
-	buffer[BUFSIZE - to_left] = 0;
-	strcpy(*to, buffer);
-	return 0;
-#else
-	return simple_utf8_encode(from, to, encoding);
-#endif
+  free(current_charset);
+  current_charset = 0;
+  if (charset && *charset)
+    current_charset = strdup(charset);
 }
 
-/*
- This implementation has the following limitations: The given charset must
- represent each glyph with exactly one (1) byte. No multi byte or variable
- width charsets are allowed. (An exception to this i UTF-8 that is passed
- right through.) The glyhps in the charsets must have a unicode value equal
- to or less than 0xFFFF (this inclues pretty much everything). For a complete,
- free conversion implementation please have a look at libiconv.
-*/
-int simple_utf8_encode(const char *from, char **to, const char *encoding)
+static int convert_buffer(const char *fromcode, const char *tocode,
+			  const char *from, size_t fromlen,
+			  char **to, size_t *tolen)
 {
-	/* can you always know this will be 16 bit? */
-	unsigned short *unicode;
-	charset_map *map;
-	int index = 0;
-	unsigned char c;
-	
-	unicode = calloc((strlen(from) + 1), sizeof(short));
-
-	map = get_map(encoding);
-	
-	if (map == NULL) 
-		return 1;
+  int ret = -1;
 
-	c = from[index];
-	while(c)
-	{
-		unicode[index] = map->mapping[c];
-		index++;
-		c = from[index];
-	}
+#ifdef HAVE_ICONV
+  ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
+  if (ret != -1)
+    return ret;
+#endif
 
-	*to =  make_utf8_string(unicode);
-	free(unicode);
-	return 0;
-}
+#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
+  ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
+  if (ret != -1)
+    return ret;
+#endif
 
-int utf8_decode(char *from, char **to, const char *encoding)
-{
-#ifdef HAVE_ICONV
-	static unsigned char buffer[BUFSIZE];
-    char *from_p, *to_p;
-	size_t from_left, to_left;
-	iconv_t cd;
-	cd = iconv_open(encoding, "UTF-8");
-	if(cd == (iconv_t)(-1))
-	{
-		perror("iconv_open");
-	}
-	
-	from_left = strlen(from);
-	to_left = BUFSIZE;
-	from_p = from;
-	to_p = buffer;
-	
-	if(iconv(cd, (ICONV_CONST char **)(&from_p), &from_left, &to_p, 
-				&to_left) == (size_t)-1)
-	{
-		iconv_close(cd);
-		switch(errno)
-		{
-		case E2BIG:
-		case EILSEQ:
-		case EINVAL:
-			return 3;
-		default:
-			perror("iconv");
-		}
-	}
-	else
-	{
-		iconv_close(cd);
-	}
-	*to = malloc(BUFSIZE - to_left + 1);
-	buffer[BUFSIZE - to_left] = 0;
-	strcpy(*to, buffer);
-	return 0;
-#else
-	return 1;  /* Dummy stub */
-#endif /* HAVE_ICONV */
+  return ret;
 }
 
-charset_map *get_map(const char *encoding)
+static int convert_string(const char *fromcode, const char *tocode,
+			  const char *from, char **to, char replace)
 {
-	charset_map *map_p = maps;
-	while(map_p->name != NULL)
-	{
-		if(!strcasecmp(map_p->name, encoding))
-		{
-			return map_p;
-		}
-		map_p++;
-	}
-	return NULL;
-}
+  int ret;
+  size_t fromlen;
+  char *s;
 
-#endif /* The rest is used by everthing */
+  fromlen = strlen(from);
+  ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
+  if (ret == -2)
+    return -1;
+  if (ret != -1)
+    return ret;
 
-char *make_utf8_string(const unsigned short *unicode)
+  s = malloc(fromlen + 1);
+  if (!s)
+    return -1;
+  strcpy(s, from);
+  *to = s;
+  for (; *s; s++)
+    if (*s & ~0x7f)
+      *s = replace;
+  return 3;
+}
+
+int utf8_encode(const char *from, char **to)
 {
-	int size = 0, index = 0, out_index = 0;
-	unsigned char *out;
-	unsigned short c;
-
-    /* first calculate the size of the target string */
-	c = unicode[index++];
-	while(c) {
-		if(c < 0x0080) {
-			size += 1;
-		} else if(c < 0x0800) {
-			size += 2;
-		} else {
-			size += 3;
-		}
-		c = unicode[index++];
-	}	
+  char *charset;
 
-	out = malloc(size + 1);
-	index = 0;
+  if (!current_charset)
+    convert_set_charset(0);
+  charset = current_charset ? current_charset : "US-ASCII";
+  return convert_string(charset, "UTF-8", from, to, '#');
+}
 
-	c = unicode[index++];
-	while(c)
-	{
-		if(c < 0x080) {
-			out[out_index++] = c;
-		} else if(c < 0x800) {
-			out[out_index++] = 0xc0 | (c >> 6);
-			out[out_index++] = 0x80 | (c & 0x3f);
-		} else {
-			out[out_index++] = 0xe0 | (c >> 12);
-			out[out_index++] = 0x80 | ((c >> 6) & 0x3f);
-			out[out_index++] = 0x80 | (c & 0x3f);
-		}
-		c = unicode[index++];
-	}
-	out[out_index] = 0x00;
+int utf8_decode(const char *from, char **to)
+{
+  char *charset;
 
-	return out;
+  if (!current_charset)
+    convert_set_charset(0);
+  charset = current_charset ? current_charset : "US-ASCII";
+  return convert_string("UTF-8", charset, from, to, '?');
 }
 
+#endif

1.1                  vorbis-tools/share/charmaps.h

Index: charmaps.h
===================================================================

/*
 * If you need to generate more maps, use makemap.c on a system
 * with a decent iconv.
 */

/*
 * Byte -> Unicode code point table for ISO-8859-2, indexed by the
 * byte value (0..255).
 */
static const unsigned short mapping_iso_8859_2[256] = {
  0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
  0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
  0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
  0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
  0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
  0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
  0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
  0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
  0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
  0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
  0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
  0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
  0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
  0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
  0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
  0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
  0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
  0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
  0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
  0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
  0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
  0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
  0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
  0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
};

/*
 * Table-driven 8-bit charsets known to charset_find(), terminated by
 * an all-zero entry.
 *   name    - charset name matched (case-insensitively) by charset_find()
 *   map     - 256-entry byte -> Unicode table
 *   charset - starts out 0; charset_find() allocates and caches the
 *             charset object here on first lookup
 */
static struct {
  const char *name;
  const unsigned short *map;
  struct charset *charset;
} maps[] = {
  { "ISO-8859-2", mapping_iso_8859_2, 0 },
  { 0, 0, 0 }
};

/*
 * Charset name aliases, terminated by an all-zero entry.
 * charset_find() replaces a name matching "bad" with the
 * corresponding "good" name before looking the charset up.
 */
static const struct {
  const char *bad;
  const char *good;
} names[] = {
  { "ANSI_X3.4-1968", "us-ascii" },
  { 0, 0 }
};

1.1                  vorbis-tools/share/charset.c

Index: charset.c
===================================================================
/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * See the corresponding header file for a description of the functions
 * that this file provides.
 *
 * This was first written for Ogg Vorbis but could be of general use.
 *
 * The only deliberate assumption about data sizes is that a short has
 * at least 16 bits, but this code has only been tested on systems with
 * 8-bit char, 16-bit short and 32-bit int.
 */

#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */

#include <stdlib.h>

#include "charset.h"

#include "charmaps.h"

/*
 * This is like the standard strcasecmp, but it does not depend
 * on the locale. Locale-dependent functions can be dangerous:
 * we once had a bug involving strcasecmp("iso", "ISO") in a
 * Turkish locale!
 *
 * (I'm not really sure what the official standard says
 * about the sign of strcasecmp("Z", "["), but usually
 * we're only interested in whether it's zero.)
 */

/*
 * Compare S1 and S2, treating ASCII letters a-z and A-Z as equal.
 * Returns 0 when the strings match; otherwise the difference of the
 * two (unfolded) bytes at the first mismatch, so only the zero /
 * non-zero distinction should be relied upon.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
  char c1, c2;

  for (;; s1++, s2++) {
    /* Stop as soon as either string ends. */
    if (!*s1 || !*s2)
      break;
    if (*s1 == *s2)
      continue;
    /* Fold both bytes to upper case by hand (locale-independent). */
    c1 = *s1;
    if ('a' <= c1 && c1 <= 'z')
      c1 += 'A' - 'a';
    c2 = *s2;
    if ('a' <= c2 && c2 <= 'z')
      c2 += 'A' - 'a';
    if (c1 != c2)
      break;
  }
  return (unsigned char)*s1 - (unsigned char)*s2;
}

/*
 * UTF-8 equivalents of the C library's wctomb() and mbtowc().
 */

/*
 * Decode one UTF-8 sequence from S, looking at no more than N bytes.
 *
 *   pwc - if non-null, receives the decoded code point
 *   s   - input bytes (may be null)
 *   n   - number of bytes available at S
 *
 * Returns the number of bytes consumed (1..6), 0 if S is null, N is 0
 * or the byte at S is NUL, and -1 if the sequence is invalid,
 * truncated, or overlong.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
  unsigned char c;
  int wc, i, k;

  if (!n || !s)
    return 0;

  c = *s;
  if (c < 0x80) {
    if (pwc)
      *pwc = c;
    return c ? 1 : 0;
  }
  else if (c < 0xc2)
    /* Stray continuation byte, or overlong 2-byte lead (0xc0, 0xc1). */
    return -1;
  else if (c < 0xe0) {
    /* The second byte must be a continuation byte (10xxxxxx). */
    if (n >= 2 && (s[1] & 0xc0) == 0x80) {
      if (pwc)
        *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
      return 2;
    }
    else
      return -1;
  }
  else if (c < 0xf0)
    k = 3;
  else if (c < 0xf8)
    k = 4;
  else if (c < 0xfc)
    k = 5;
  else if (c < 0xfe)
    k = 6;
  else
    return -1;

  if (n < (size_t)k)
    return -1;
  /* The lead byte of a k-byte sequence carries 7 - k payload bits. */
  wc = *s++ & ((1 << (7 - k)) - 1);
  for (i = 1; i < k; i++) {
    if ((*s & 0xc0) != 0x80)
      return -1;
    wc = (wc << 6) | (*s++ & 0x3f);
  }
  /* Reject overlong forms: values that fit in a (k-1)-byte sequence. */
  if (wc < (1 << (5 * k - 4)))
    return -1;
  if (pwc)
    *pwc = wc;
  return k;
}

/*
 * Encode the code point WC1 as UTF-8 into S.
 *
 * Returns the number of bytes written (1..6), 0 if S is null, and -1
 * if WC1 cannot be encoded (negative, i.e. not below 2^31 once
 * converted to unsigned).  S must have room for up to 6 bytes; no
 * terminating NUL is written.
 */
int utf8_wctomb(char *s, int wc1)
{
  unsigned int wc = wc1;

  if (!s)
    return 0;
  /* Unsigned shifts throughout: 1 << 31 on a 32-bit int is undefined. */
  if (wc < (1u << 7)) {
    *s++ = wc;
    return 1;
  }
  else if (wc < (1u << 11)) {
    *s++ = 0xc0 | (wc >> 6);
    *s++ = 0x80 | (wc & 0x3f);
    return 2;
  }
  else if (wc < (1u << 16)) {
    *s++ = 0xe0 | (wc >> 12);
    *s++ = 0x80 | ((wc >> 6) & 0x3f);
    *s++ = 0x80 | (wc & 0x3f);
    return 3;
  }
  else if (wc < (1u << 21)) {
    *s++ = 0xf0 | (wc >> 18);
    *s++ = 0x80 | ((wc >> 12) & 0x3f);
    *s++ = 0x80 | ((wc >> 6) & 0x3f);
    *s++ = 0x80 | (wc & 0x3f);
    return 4;
  }
  else if (wc < (1u << 26)) {
    *s++ = 0xf8 | (wc >> 24);
    *s++ = 0x80 | ((wc >> 18) & 0x3f);
    *s++ = 0x80 | ((wc >> 12) & 0x3f);
    *s++ = 0x80 | ((wc >> 6) & 0x3f);
    *s++ = 0x80 | (wc & 0x3f);
    return 5;
  }
  else if (wc < (1u << 31)) {
    *s++ = 0xfc | (wc >> 30);
    *s++ = 0x80 | ((wc >> 24) & 0x3f);
    *s++ = 0x80 | ((wc >> 18) & 0x3f);
    *s++ = 0x80 | ((wc >> 12) & 0x3f);
    *s++ = 0x80 | ((wc >> 6) & 0x3f);
    *s++ = 0x80 | (wc & 0x3f);
    return 6;
  }
  else
    return -1;
}

/*
 * The charset "object" and methods.
 */

/*
 * A charset "object".
 *   min, max - fewest / most bytes a single character occupies
 *              (charset_convert() uses them to size its output buffer)
 *   mbtowc   - decoder: bytes -> one code point
 *   wctomb   - encoder: one code point -> bytes
 *   map      - opaque table handed to mbtowc/wctomb as their first
 *              argument (0 for the table-less charsets)
 */
struct charset {
  int min, max;
  int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
  int (*wctomb)(void *table, char *s, int wc);
  void *map;
};

/*
 * Decode one character from S (at most N bytes) with CHARSET's
 * decoder; the result is whatever that decoder returns.
 */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
  void *table = charset->map;

  return charset->mbtowc(table, pwc, s, n);
}

/*
 * Encode the code point WC into S with CHARSET's encoder; the result
 * is whatever that encoder returns.
 */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
  void *table = charset->map;

  return charset->wctomb(table, s, wc);
}

/* Accessor for CHARSET's "min" field. */
int charset_min(struct charset *charset)
{
  int least = charset->min;

  return least;
}

/* Accessor for CHARSET's "max" field. */
int charset_max(struct charset *charset)
{
  int most = charset->max;

  return most;
}

/*
 * Implementation of UTF-8.
 */

/* struct charset decoder for UTF-8: forwards to utf8_mbtowc(); MAP is unused. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
  (void)map;
  return utf8_mbtowc(pwc, s, n);
}

/* struct charset encoder for UTF-8: forwards to utf8_wctomb(); MAP is unused. */
static int wctomb_utf8(void *map, char *s, int wc)
{
  (void)map;
  return utf8_wctomb(s, wc);
}

/*
 * Implementation of US-ASCII.
 * Probably on most architectures this compiles to less than 256 bytes
 * of code, so we can save space by not having a table for this one.
 */

/*
 * US-ASCII decoder.  Returns 1 for a valid non-NUL byte, 0 if S is
 * null, N is 0 or the byte is NUL, and -1 for a byte outside 0..0x7f.
 * MAP is unused.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
  int wc;

  (void)map;
  if (!n || !s)
    return 0;
  wc = (unsigned char)*s;
  if (wc & ~0x7f)
    return -1;
  if (pwc)
    *pwc = wc;
  return wc ? 1 : 0;
}

/*
 * US-ASCII encoder.  Writes WC to *S and returns 1; returns 0 if S is
 * null and -1 if WC is outside 0..0x7f.  MAP is unused.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
  (void)map;
  if (!s)
    return 0;
  if (wc & ~0x7f)
    return -1;
  *s = wc;
  return 1;
}

/*
 * Implementation of ISO-8859-1.
 * Probably on most architectures this compiles to less than 256 bytes
 * of code, so we can save space by not having a table for this one.
 */

/*
 * ISO-8859-1 decoder: the code point equals the byte value.  Returns 1
 * for a non-NUL byte, 0 if S is null, N is 0 or the byte is NUL, and
 * -1 if the byte value does not fit in 8 bits (only possible where
 * char is wider than 8 bits).  MAP is unused.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
  int wc;

  (void)map;
  if (!n || !s)
    return 0;
  wc = (unsigned char)*s;
  if (wc & ~0xff)
    return -1;
  if (pwc)
    *pwc = wc;
  return wc ? 1 : 0;
}

/*
 * ISO-8859-1 encoder.  Writes WC to *S and returns 1; returns 0 if S
 * is null and -1 if WC is outside 0..0xff.  MAP is unused.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
  (void)map;
  if (!s)
    return 0;
  if (wc & ~0xff)
    return -1;
  *s = wc;
  return 1;
}

/*
 * Implementation of any 8-bit charset.
 */

/*
 * Per-charset state for a table-driven 8-bit charset.
 *   from - 256-entry byte -> Unicode table (0xffff marks an unmapped byte)
 *   to   - inverse (Unicode -> byte) hash table; 0 until wctomb_8bit()
 *          builds it on first use
 */
struct map {
  const unsigned short *from;
  struct inverse_map *to;
};

/*
 * Decoder for a table-driven 8-bit charset; MAP1 is a struct map *.
 * Returns 1 for a byte that maps to a non-zero code point, 0 if S is
 * null, N is 0 or the byte maps to code point 0, and -1 if the byte is
 * unmapped (table entry 0xffff).
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
  struct map *map = map1;
  unsigned short wc;

  if (!n || !s)
    return 0;
  wc = map->from[(unsigned char)*s];
  if (wc == 0xffff)
    return -1;
  if (pwc)
    *pwc = (int)wc;
  return wc ? 1 : 0;
}

/*
 * For the inverse map we use a hash table, which has the advantages
 * of small constant memory requirement and simple memory allocation,
 * but the disadvantage of slow conversion in the worst case.
 * If you need real-time performance while letting a potentially
 * malicious user define their own map, then the method used in
 * linux/drivers/char/consolemap.c would be more appropriate.
 */

/*
 * Inverse (Unicode -> byte) lookup as a chained hash table.
 *   first[h] - byte value at the head of the chain for hash bucket h
 *   next[b]  - byte value following b in its chain; 0 ends the chain
 */
struct inverse_map {
  unsigned char first[256];
  unsigned char next[256];
};

/*
 * The simple hash is good enough for this application.
 * Use the alternative trivial hashes for testing.
 */
#define HASH(i) ((i) & 0xff)
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */

/*
 * Build the inverse hash table for the byte -> Unicode table FROM
 * (256 entries, 0xffff meaning "unmapped").
 *
 * Returns a malloc'd struct inverse_map, or 0 if allocation fails.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
{
  struct inverse_map *to;
  char used[256];
  int i, j, k;

  to = (struct inverse_map *)malloc(sizeof(struct inverse_map));
  if (!to)
    return 0;
  for (i = 0; i < 256; i++)
    to->first[i] = to->next[i] = used[i] = 0;
  /*
   * Insert from the top down so that each chain ends up in ascending
   * byte order and byte 0, inserted last, can only ever be a chain
   * head: that is what lets next[] == 0 serve as the end marker.
   */
  for (i = 255; i >= 0; i--)
    if (from[i] != 0xffff) {
      k = HASH(from[i]);
      to->next[i] = to->first[k];
      to->first[k] = i;
      used[k] = 1;
    }

  /* Point the empty buckets at an empty list. */
  for (i = 0; i < 256; i++)
    if (!to->next[i])
      break;
  if (i < 256)
    for (j = 0; j < 256; j++)
      if (!used[j])
        to->first[j] = i;

  return to;
}

/*
 * Encoder for a table-driven 8-bit charset; MAP1 is a struct map *.
 * Writes the byte for code point WC1 to *S and returns 1; returns 0 if
 * S is null and -1 if WC1 has no byte in this charset.
 *
 * The inverse hash table is built on the first call; if that
 * allocation fails, a linear search of the forward table is used.
 */
int wctomb_8bit(void *map1, char *s, int wc1)
{
  struct map *map = map1;
  unsigned short wc = wc1;
  int i;

  if (!s)
    return 0;

  if (wc1 & ~0xffff)
    return -1;

  /*
   * 0xffff is the "unmapped" marker in the forward table, so it must
   * never be looked up: it would compare equal to an unmapped slot and
   * be "encoded" as that byte.
   */
  if (wc == 0xffff)
    return -1;

  if (1) /* Change 1 to 0 to test the case where malloc fails. */
    if (!map->to)
      map->to = make_inverse_map(map->from);

  if (map->to) {
    /* Use the inverse map. */
    i = map->to->first[HASH(wc)];
    for (;;) {
      if (map->from[i] == wc) {
        *s = i;
        return 1;
      }
      if (!(i = map->to->next[i]))
        break;
    }
  }
  else {
    /* We don't have an inverse map, so do a linear search. */
    for (i = 0; i < 256; i++)
      if (map->from[i] == wc) {
        *s = i;
        return 1;
      }
  }

  return -1;
}

/*
 * The "constructor" charset_find().
 */

truct charset charset_utf8 = {
  1, 6,
  &mbtowc_utf8,
  &wctomb_utf8,
  0
};

truct charset charset_iso1 = {
  1, 1,
  &mbtowc_iso1,
  &wctomb_iso1,
  0
};

truct charset charset_ascii = {
  1, 1,
  &mbtowc_ascii,
  &wctomb_ascii,
  0
};

truct charset *charset_find(const char *code)
{
  int i;

  /* Find good (MIME) name. */
  for (i = 0; names[i].bad; i++)
    if (!ascii_strcasecmp(code, names[i].bad)) {
      code = names[i].good;
      break;
    }

  /* Recognise some charsets for which we avoid using a table. */
  if (!ascii_strcasecmp(code, "UTF-8"))
    return &charset_utf8;
  if (!ascii_strcasecmp(code, "US-ASCII"))
    return &charset_ascii;
  if (!ascii_strcasecmp(code, "ISO-8859-1"))
    return &charset_iso1;

  /* Look for a mapping for a simple 8-bit encoding. */
  for (i = 0; maps[i].name; i++)
    if (!ascii_strcasecmp(code, maps[i].name)) {
      if (!maps[i].charset) {
        maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
        if (maps[i].charset) {
          struct map *map = (struct map *)malloc(sizeof(struct map));
          if (!map) {
            free(maps[i].charset);
            maps[i].charset = 0;
          }
          else {
            maps[i].charset->min = 1;
            maps[i].charset->max = 1;
            maps[i].charset->mbtowc = &mbtowc_8bit;
            maps[i].charset->wctomb = &wctomb_8bit;
            maps[i].charset->map = map;
            map->from = maps[i].map;
            map->to = 0; /* inverse mapping is created when required */
          }
        }
      }
      return maps[i].charset;
    }

  return 0;
}

/*
 * Function to convert a buffer from one encoding to another.
 * Invalid bytes are replaced by '#', and characters that are
 * not available in the target encoding are replaced by '?'.
 * Each of TO and TOLEN may be zero, if the result is not needed.
 * The output buffer is null-terminated, so it is all right to
 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 */

/*
 * Convert FROMLEN bytes at FROM from charset FROMCODE to charset TOCODE.
 *
 *   to    - if non-null, receives a malloc'd, NUL-terminated result
 *           that the caller must free
 *   tolen - if non-null, receives the result length, excluding the NUL
 *
 * Returns:
 *   -2  the output buffer could not be allocated
 *   -1  one of the charsets is unknown
 *    0  converted exactly
 *    1  some characters were not available in TOCODE (written as '?')
 *    2  the input contained invalid bytes (converted as '#')
 */
int charset_convert(const char *fromcode, const char *tocode,
                    const char *from, size_t fromlen,
                    char **to, size_t *tolen)
{
  int ret = 0;
  struct charset *charset1, *charset2;
  char *tobuf, *p, *newbuf;
  size_t maxchars, maxbytes;
  int i, j, wc = 0;

  charset1 = charset_find(fromcode);
  charset2 = charset_find(tocode);
  if (!charset1 || !charset2 )
    return -1;

  /*
   * Worst case: every input character takes the fewest bytes and every
   * output character the most.  Refuse sizes that would wrap size_t
   * rather than allocate a buffer that is too small.
   */
  maxchars = fromlen / charset1->min;
  maxbytes = charset2->max;
  if (maxchars > ((size_t)-1 - 1) / maxbytes)
    return -2;

  tobuf = (char *)malloc(maxchars * maxbytes + 1);
  if (!tobuf)
    return -2;

  for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
    i = charset_mbtowc(charset1, &wc, from, fromlen);
    if (!i)
      i = 1; /* a NUL byte: the decoder stored wc = 0; step over it */
    else if (i == -1) {
      /* Invalid input: consume one byte and substitute '#'. */
      i  = 1;
      wc = '#';
      ret = 2;
    }
    j = charset_wctomb(charset2, p, wc);
    if (j == -1) {
      /* Not representable in the target: substitute '?'. */
      if (!ret)
        ret = 1;
      j = charset_wctomb(charset2, p, '?');
      if (j == -1)
        j = 0;
    }
  }

  if (tolen)
    *tolen = p - tobuf;
  *p++ = '\0';
  if (to) {
    /* Shrink to fit; if that fails the original buffer is still valid. */
    newbuf = realloc(tobuf, p - tobuf);
    *to = newbuf ? newbuf : tobuf;
  }
  else
    free(tobuf);

  return ret;
}

#endif /* USE_CHARSET_ICONV */

1.1                  vorbis-tools/share/charset_test.c

Index: charset_test.c
===================================================================
/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <assert.h>
#include <string.h>

#include "charset.h"

/*
 * Checks that are applied to every charset under test: handling of
 * null/empty arguments and conversion of the characters NUL and 'x'
 * in both directions.
 */
void test_any(struct charset *charset)
{
  int wc;
  char s[2];

  assert(charset);

  /* Decoder */

  /* A zero length or a null input pointer must give 0, even with a
     null result pointer and a bogus input address. */
  assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
  assert(charset_mbtowc(charset, 0, 0, 1) == 0);

  /* 'x' decodes to itself and consumes one byte, whatever extra
     length is offered; an empty string decodes to NUL with return 0. */
  assert(charset_mbtowc(charset, &wc, "x", 0) == 0);
  assert(charset_mbtowc(charset, &wc, "x", 1) == 1 && wc == 'x');
  assert(charset_mbtowc(charset, &wc, "x", 2) == 1 && wc == 'x');
  assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0);

  /* Encoder */

  /* A null output buffer must give 0. */
  assert(charset_wctomb(charset, 0, 0) == 0);

  /* NUL and 'x' each encode to exactly one byte; s[1] is a sentinel
     that must stay untouched. */
  s[0] = s[1] = '.';
  assert(charset_wctomb(charset, s, 0) == 1 &&
         s[0] == '\0' && s[1] == '.');
  assert(charset_wctomb(charset, s, 'x') == 1 &&
         s[0] == 'x' && s[1] == '.');
}

/*
 * Tests for the "UTF-8" charset: the generic checks of test_any(),
 * then decoding and encoding at the boundary of every sequence
 * length from 1 to 6 bytes, including rejection of overlong forms.
 */
void test_utf8(void)
{
  struct charset *charset;
  int wc;
  char s[8];

  charset = charset_find("UTF-8");
  test_any(charset);

  /* Decoder */
  wc = 0;
  assert(charset_mbtowc(charset, &wc, "\177", 1) == 1 && wc == 127);
  /* A lone continuation byte is invalid */
  assert(charset_mbtowc(charset, &wc, "\200", 2) == -1);
  /* For each length: the overlong form just below the boundary is
     rejected, the smallest proper value is accepted. */
  assert(charset_mbtowc(charset, &wc, "\301\277", 9) == -1);
  /* Truncated sequence: only 1 of the 2 bytes is available */
  assert(charset_mbtowc(charset, &wc, "\302\200", 1) == -1);
  assert(charset_mbtowc(charset, &wc, "\302\200", 2) == 2 && wc == 128);
  assert(charset_mbtowc(charset, &wc, "\302\200", 3) == 2 && wc == 128);
  assert(charset_mbtowc(charset, &wc, "\340\237\200", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\340\240\200", 9) == 3 &&
         wc == 1 << 11);
  assert(charset_mbtowc(charset, &wc, "\360\217\277\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\360\220\200\200", 9) == 4 &&
         wc == 1 << 16);
  assert(charset_mbtowc(charset, &wc, "\370\207\277\277\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\370\210\200\200\200", 9) == 5 &&
         wc == 1 << 21);
  assert(charset_mbtowc(charset, &wc, "\374\203\277\277\277\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\374\204\200\200\200\200", 9) == 6 &&
         wc == 1 << 26);
  assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 &&
         wc == 0x7fffffff);

  /* Encoder */
  strcpy(s, ".......");
  /*
   * A value with bit 31 set is outside the 31-bit range and must be
   * rejected without touching the buffer. It is spelled this way
   * because "1 << 31" overflows a 32-bit int, which is undefined.
   */
  assert(charset_wctomb(charset, s, -0x7fffffff - 1) == -1 &&
         !strcmp(s, "......."));
  /* Smallest and largest value for each encoded length; the trailing
     dots show that no extra bytes were written. */
  assert(charset_wctomb(charset, s, 127) == 1 &&
         !strcmp(s, "\177......"));
  assert(charset_wctomb(charset, s, 128) == 2 &&
         !strcmp(s, "\302\200....."));
  assert(charset_wctomb(charset, s, 0x7ff) == 2 &&
         !strcmp(s, "\337\277....."));
  assert(charset_wctomb(charset, s, 0x800) == 3 &&
         !strcmp(s, "\340\240\200...."));
  assert(charset_wctomb(charset, s, 0xffff) == 3 &&
         !strcmp(s, "\357\277\277...."));
  assert(charset_wctomb(charset, s, 0x10000) == 4 &&
         !strcmp(s, "\360\220\200\200..."));
  assert(charset_wctomb(charset, s, 0x1fffff) == 4 &&
         !strcmp(s, "\367\277\277\277..."));
  assert(charset_wctomb(charset, s, 0x200000) == 5 &&
         !strcmp(s, "\370\210\200\200\200.."));
  assert(charset_wctomb(charset, s, 0x3ffffff) == 5 &&
         !strcmp(s, "\373\277\277\277\277.."));
  assert(charset_wctomb(charset, s, 0x4000000) == 6 &&
         !strcmp(s, "\374\204\200\200\200\200."));
  assert(charset_wctomb(charset, s, 0x7fffffff) == 6 &&
         !strcmp(s, "\375\277\277\277\277\277."));
}

/*
 * Tests for the "us-ascii" charset: the generic checks of test_any(),
 * then the boundary between 127 (valid) and 128 (invalid) in both
 * directions.
 */
void test_ascii()
{
  struct charset *charset;
  int wc;
  char s[3];

  charset = charset_find("us-ascii");
  test_any(charset);

  /* Decoder */
  wc = 0;
  assert(charset_mbtowc(charset, &wc, "\177", 2) == 1 && wc == 127);
  /* Bytes with the high bit set are not ASCII */
  assert(charset_mbtowc(charset, &wc, "\200", 2) == -1);

  /* Encoder */
  strcpy(s, "..");
  /* Anything above 127 is rejected; the first check also verifies
     that the buffer is left untouched on failure. */
  assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, ".."));
  assert(charset_wctomb(charset, s, 255) == -1);
  assert(charset_wctomb(charset, s, 128) == -1);
  assert(charset_wctomb(charset, s, 127) == 1 && !strcmp(s, "\177."));
}

/*
 * Tests for the "iso-8859-1" charset: the generic checks of
 * test_any(), then byte-for-byte decoding and the encoder limit
 * at 255.
 */
void test_iso1()
{
  struct charset *charset;
  int wc;
  char s[3];

  charset = charset_find("iso-8859-1");
  test_any(charset);

  /* Decoder */
  wc = 0;
  /* Only the first byte is consumed and it maps to its own value */
  assert(charset_mbtowc(charset, &wc, "\302\200", 9) == 1 && wc == 0xc2);

  /* Encoder */
  strcpy(s, "..");
  /* 256 is out of range and must leave the buffer untouched;
     255 and 128 encode as the single byte of the same value. */
  assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, ".."));
  assert(charset_wctomb(charset, s, 255) == 1 && !strcmp(s, "\377."));
  assert(charset_wctomb(charset, s, 128) == 1 && !strcmp(s, "\200."));
}

/*
 * Tests for the "iso-8859-2" charset: the generic checks of
 * test_any(), then a few table lookups in both directions.
 */
void test_iso2()
{
  struct charset *charset;
  int wc;
  char s[3];

  charset = charset_find("iso-8859-2");
  test_any(charset);

  /* Decoder */
  wc = 0;
  /* One byte is consumed per character; byte 0xff maps to 0x2d9 */
  assert(charset_mbtowc(charset, &wc, "\302\200", 9) == 1 && wc == 0xc2);
  assert(charset_mbtowc(charset, &wc, "\377", 2) == 1 && wc == 0x2d9);

  /* Encoder */
  strcpy(s, "..");
  /* 256 and 255 have no encoding here and must leave the buffer
     untouched; 258 encodes as byte 0303 and 128 as byte 0200. */
  assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, ".."));
  assert(charset_wctomb(charset, s, 255) == -1 && !strcmp(s, ".."));
  assert(charset_wctomb(charset, s, 258) == 1 && !strcmp(s, "\303."));
  assert(charset_wctomb(charset, s, 128) == 1 && !strcmp(s, "\200."));
}

/*
 * Tests for charset_convert(): exact conversion, replacement of
 * invalid input by '#', replacement of unavailable characters by
 * '?', optional TO/TOLEN arguments, and a full 256-byte round trip
 * through iso-8859-2 and UTF-8.
 *
 * The converted buffers are not freed here.
 */
void test_convert(void)
{
  const char *p;
  char *q, *r;
  char s[256];
  size_t n, n2;
  int i;

  /*
   * The data starts with a null byte, so it has to be compared with
   * memcmp(); strcmp() would stop at once and compare nothing. The
   * byte after the data must be the terminator that is promised by
   * charset_convert().
   */
  p = "\000x\302\200\375\277\277\277\277\277";
  assert(charset_convert("UTF-8", "UTF-8", p, 10, &q, &n) == 0 &&
         n == 10 && !memcmp(p, q, 10) && q[10] == '\0');
  /* Each byte of an invalid sequence becomes one '#' */
  assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, &n) == 2 &&
         n == 4 && !strcmp(q, "x##y"));
  /* TO and TOLEN are each optional */
  assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, 0, &n) == 2 &&
         n == 4);
  assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, 0) == 2 &&
         !strcmp(q, "x##y"));
  /* A character missing from the target becomes '?' */
  assert(charset_convert("UTF-8", "iso-8859-1",
                         "\302\200\304\200x", 5, &q, &n) == 1 &&
         n == 3 && !strcmp(q, "\200?x"));
  assert(charset_convert("iso-8859-1", "UTF-8",
                         "\000\200\377", 3, &q, &n) == 0 &&
         n == 5 && !memcmp(q, "\000\302\200\303\277", 5));
  assert(charset_convert("iso-8859-1", "iso-8859-1",
                         "\000\200\377", 3, &q, &n) == 0 &&
         n == 3 && !memcmp(q, "\000\200\377", 3));

  assert(charset_convert("iso-8859-2", "utf-8", "\300", 1, &q, &n) == 0 &&
         n == 2 && !strcmp(q, "\305\224"));
  assert(charset_convert("utf-8", "iso-8859-2", "\305\224", 2, &q, &n) == 0 &&
         n == 1 && !strcmp(q, "\300"));

  /* Every byte value must survive iso-8859-2 -> UTF-8 -> iso-8859-2 */
  for (i = 0; i < 256; i++)
    s[i] = i;

  assert(charset_convert("iso-8859-2", "utf-8", s, 256, &q, &n) == 0);
  assert(charset_convert("utf-8", "iso-8859-2", q, n, &r, &n2) == 0);
  assert(n2 == 256 && !memcmp(r, s, n2));
}

int main()
{
  test_utf8();
  test_ascii();
  test_iso1();
  test_iso2();

  test_convert();

  return 0;
}

1.1                  vorbis-tools/share/iconvert.c

Index: iconvert.c
===================================================================
/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifdef HAVE_ICONV

#include <assert.h>
#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
#include <string.h>

/*
 * Return non-zero if CODE is "UTF-8" in any mix of upper and lower
 * case. strcasecmp() is avoided because it is locale-dependent.
 * The comparison stops at the first mismatch, so it never reads
 * beyond the terminator of a shorter name.
 */
static int iconvert_is_utf8(const char *code)
{
  return (code[0] == 'U' || code[0] == 'u') &&
         (code[1] == 'T' || code[1] == 't') &&
         (code[2] == 'F' || code[2] == 'f') &&
         code[3] == '-' &&
         code[4] == '8' &&
         code[5] == '\0';
}

/*
 * Convert data from one encoding to another. Return:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 *
 * Invalid input bytes are replaced by '#', and characters that
 * cannot be converted to the target encoding are replaced by '?'.
 * Each of TO and TOLEN may be zero if the result is not needed.
 * On success with TO non-zero, *TO receives a malloc'ed,
 * null-terminated buffer that the caller must free; *TOLEN does
 * not count the terminator.
 */

int iconvert(const char *fromcode, const char *tocode,
             const char *from, size_t fromlen,
             char **to, size_t *tolen)
{
  int ret = 0;
  iconv_t cd1, cd2;
  char *ib;
  char *ob;
  /* utfbuf must be null from the start: "fail" frees it and can be
     reached before the buffer has been allocated. */
  char *utfbuf = 0, *outbuf, *newbuf;
  size_t utflen, outlen, ibl, obl, k;
  char tbuf[2048];

  cd1 = iconv_open("UTF-8", fromcode);
  if (cd1 == (iconv_t)(-1))
    return -1;

  /* cd2 stays at -1 when the target is UTF-8: no second step needed */
  cd2 = (iconv_t)(-1);
  if (!iconvert_is_utf8(tocode)) {
    char *tocode1;

    /*
     * Try using this non-standard feature of glibc and libiconv.
     * This is deliberately not a config option as people often
     * change their iconv library without rebuilding applications.
     */
    tocode1 = (char *)malloc(strlen(tocode) + 11);
    if (!tocode1)
      goto fail;

    strcpy(tocode1, tocode);
    strcat(tocode1, "//TRANSLIT");
    cd2 = iconv_open(tocode1, "UTF-8");
    free(tocode1);

    /* Fall back to a plain converter. Its source must be UTF-8 too,
       because it is fed the intermediate UTF-8 buffer, not FROM. */
    if (cd2 == (iconv_t)(-1))
      cd2 = iconv_open(tocode, "UTF-8");

    if (cd2 == (iconv_t)(-1)) {
      iconv_close(cd1);
      return -1;
    }
  }

  utflen = 1; /*fromlen * 2 + 1; XXX */
  utfbuf = (char *)malloc(utflen);
  if (!utfbuf)
    goto fail;

  /* Convert to UTF-8 */
  ib = (char *)from; /* iconv() takes char **, but does not modify the input */
  ibl = fromlen;
  ob = utfbuf;
  obl = utflen;
  for (;;) {
    k = iconv(cd1, &ib, &ibl, &ob, &obl);
    assert((!k && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
           (k == (size_t)(-1) &&
            (errno == EILSEQ || errno == EINVAL) && ibl));
    if (!ibl)
      break;
    if (obl < 6) {
      /* Enlarge the buffer */
      utflen *= 2;
      newbuf = (char *)realloc(utfbuf, utflen);
      if (!newbuf)
        goto fail;
      ob = (ob - utfbuf) + newbuf;
      obl = utflen - (ob - utfbuf);
      utfbuf = newbuf;
    }
    else {
      /* Invalid input */
      ib++, ibl--;
      *ob++ = '#', obl--;
      ret = 2;
      iconv(cd1, 0, 0, 0, 0);
    }
  }

  if (cd2 == (iconv_t)(-1)) {
    /* The target encoding was UTF-8 */
    if (tolen)
      *tolen = ob - utfbuf;
    if (!to) {
      free(utfbuf);
      iconv_close(cd1);
      return ret;
    }
    newbuf = (char *)realloc(utfbuf, (ob - utfbuf) + 1);
    if (!newbuf)
      goto fail;
    ob = (ob - utfbuf) + newbuf;
    *ob = '\0';
    *to = newbuf;
    iconv_close(cd1);
    return ret;
  }

  /*
   * Truncate the buffer to be tidy. This is skipped for empty data,
   * as realloc() with a size of zero may free the buffer and return
   * null. A failure to shrink is harmless: the old buffer is kept.
   */
  utflen = ob - utfbuf;
  if (utflen) {
    newbuf = (char *)realloc(utfbuf, utflen);
    if (newbuf)
      utfbuf = newbuf;
  }

  /* Convert from UTF-8 to discover how long the output is */
  outlen = 0;
  ib = utfbuf;
  ibl = utflen;
  while (ibl) {
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      outlen += ob - tbuf;
      ob = tbuf;
      obl = sizeof(tbuf);
      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
             (k == (size_t)(-1) && errno == EILSEQ && tbl));
      /* Skip the lead byte, then only its continuation bytes
         (10xxxxxx), so that the next character is not swallowed. */
      for (++ib, --ibl; ibl && (*ib & 0xc0) == 0x80; ib++, ibl--)
        ;
    }
    outlen += ob - tbuf;
  }
  /* Flush, to count any final shift sequence */
  ob = tbuf;
  obl = sizeof(tbuf);
  k = iconv(cd2, 0, 0, &ob, &obl);
  assert(!k);
  outlen += ob - tbuf;

  /* Convert from UTF-8 for real */
  outbuf = (char *)malloc(outlen + 1);
  if (!outbuf)
    goto fail;
  ib = utfbuf;
  ibl = utflen;
  ob = outbuf;
  obl = outlen;
  while (ibl) {
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    /* A non-zero result means a failure or irreversible conversions */
    if (k && !ret)
      ret = 1;
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
             (k == (size_t)(-1) && errno == EILSEQ && tbl));
      /* This must skip exactly what the measuring pass skipped */
      for (++ib, --ibl; ibl && (*ib & 0xc0) == 0x80; ib++, ibl--)
        ;
    }
  }
  k = iconv(cd2, 0, 0, &ob, &obl);
  assert(!k);
  assert(!obl); /* both passes must agree on the output length */
  *ob = '\0';

  free(utfbuf);
  iconv_close(cd1);
  iconv_close(cd2);
  if (tolen)
    *tolen = outlen;
  if (!to) {
    free(outbuf);
    return ret;
  }
  *to = outbuf;
  return ret;

 fail:
  free(utfbuf);
  iconv_close(cd1);
  if (cd2 != (iconv_t)(-1))
    iconv_close(cd2);
  return -2;
}

#endif /* HAVE_ICONV */

1.1                  vorbis-tools/share/makemap.c

Index: makemap.c
===================================================================
/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <errno.h>
#include <iconv.h>
#include <stdio.h>

/*
 * Print a charset map for the 8-bit encoding named by argv[1]:
 * 256 comma-separated hexadecimal values, eight per line, giving
 * the UCS-4 value of each byte as reported by iconv. Bytes that
 * iconv rejects with EILSEQ are printed as 0xffff.
 *
 * Returns 0 on success and 1 on a usage error, an unknown encoding,
 * a value of 0xffff or more, or unexpected iconv behaviour.
 */
int main(int argc, char *argv[])
{
  iconv_t cd;
  char *ib, *ob;
  size_t ibl, obl, k;
  unsigned char c, buf[4];
  unsigned long ucs;
  int i, wc;

  if (argc != 2) {
    printf("Usage: %s ENCODING\n", argv[0]);
    printf("Output a charset map for the 8-bit ENCODING.\n");
    return 1;
  }

  cd = iconv_open("UCS-4", argv[1]);
  if (cd == (iconv_t)(-1)) {
    perror("iconv_open");
    return 1;
  }

  for (i = 0; i < 256; i++) {
    /* The bytes are kept unsigned; iconv() wants plain char pointers */
    c = i;
    ib = (char *)&c;
    ibl = 1;
    ob = (char *)buf;
    obl = 4;
    k = iconv(cd, &ib, &ibl, &ob, &obl);
    if (!k && !ibl && !obl) {
      /* Read the four output bytes as a big-endian value. Unsigned
         arithmetic keeps a high first byte from overflowing an int. */
      ucs = ((unsigned long)buf[0] << 24) | ((unsigned long)buf[1] << 16) |
            ((unsigned long)buf[2] << 8) | (unsigned long)buf[3];
      /* 0xffff is reserved below as the marker for an unmapped byte */
      if (ucs >= 0xffff) {
        printf("Dodgy value.\n");
        iconv_close(cd);
        return 1;
      }
      wc = (int)ucs;
    }
    else if (k == (size_t)(-1) && errno == EILSEQ)
      wc = 0xffff;
    else {
      printf("Non-standard iconv.\n");
      iconv_close(cd);
      return 1;
    }

    if (i % 8 == 0)
      printf("  ");
    printf("0x%04x", wc);
    if (i == 255)
      printf("\n");
    else if (i % 8 == 7)
      printf(",\n");
    else
      printf(", ");
  }

  iconv_close(cd);
  return 0;
}

1.14      +14 -19    vorbis-tools/vorbiscomment/vcomment.c

Index: vcomment.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/vorbiscomment/vcomment.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- vcomment.c	2001/09/25 08:59:55	1.13
+++ vcomment.c	2001/10/02 03:03:44	1.14
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <locale.h>
 #include "getopt.h"
 #include "utf8.h"
 
@@ -24,7 +25,6 @@
         {"help",0,0,'h'},
         {"quiet",0,0,'q'},
         {"commentfile",1,0,'c'},
-    {"encoding", 1,0,'e'},
         {NULL,0,0,0}
 };
 
@@ -37,7 +37,6 @@
         int commentcount;
         char **comments;
         int tempoutfile;
-	char *encoding;
 } param_t;
 
 #define MODE_NONE  0
@@ -47,8 +46,8 @@
 
 /* prototypes */
 void usage(void);
-void print_comments(FILE *out, vorbis_comment *vc, char *encoding);
-int  add_comment(char *line, vorbis_comment *vc, char *encoding);
+void print_comments(FILE *out, vorbis_comment *vc);
+int  add_comment(char *line, vorbis_comment *vc);
 
 param_t	*new_param(void);
 void parse_options(int argc, char *argv[], param_t *param);
@@ -98,7 +97,7 @@
 
                 /* extract and display the comments */
                 vc = vcedit_comments(state);
-		print_comments(param->com, vc, param->encoding);
+		print_comments(param->com, vc);
 
                 /* done */
                 vcedit_clear(state);
@@ -128,7 +127,7 @@
 
                 for(i=0; i < param->commentcount; i++)
                 {
-			if(add_comment(param->comments[i], vc, param->encoding) < 0)
+			if(add_comment(param->comments[i], vc) < 0)
                                 fprintf(stderr, "Bad comment: \"%s\"\n", param->comments[i]);
                 }
 
@@ -139,7 +138,7 @@
                         char *buf = (char *)malloc(sizeof(char)*1024);
 
                         while (fgets(buf, 1024, param->com))
-				if (add_comment(buf, vc, param->encoding) < 0) {
+				if (add_comment(buf, vc) < 0) {
                                         fprintf(stderr,
                                                 "bad comment: \"%s\"\n",
                                                 buf);
@@ -177,14 +176,14 @@
 
 ***********/
 
-void print_comments(FILE *out, vorbis_comment *vc, char *encoding)
+void print_comments(FILE *out, vorbis_comment *vc)
 {
         int i;
     char *decoded_value;
 
         for (i = 0; i < vc->comments; i++)
     {
-	    if (utf8_decode(vc->user_comments[i], &decoded_value, encoding) == 0)
+	    if (utf8_decode(vc->user_comments[i], &decoded_value) >= 0)
         {
                     fprintf(out, "%s\n", decoded_value);
             free(decoded_value);
@@ -197,7 +196,7 @@
 /**********
 
    Take a line of the form "TAG=value string", parse it, convert the
-   value to UTF-8 from the specified encoding, and add it to the
+   value to UTF-8, and add it to the
    vorbis_comment structure. Error checking is performed.
 
    Note that this assumes a null-terminated string, which may cause
@@ -205,7 +204,7 @@
 
 ***********/
 
-int  add_comment(char *line, vorbis_comment *vc, char *encoding)
+int  add_comment(char *line, vorbis_comment *vc)
 {
         char	*mark, *value, *utf8_value;
 
@@ -234,7 +233,7 @@
         value++;
 
         /* convert the value from the native charset to UTF-8 */
-	if (utf8_encode(value, &utf8_value, encoding) == 0) {
+	if (utf8_encode(value, &utf8_value) >= 0) {
                 
                 /* append the comment and return */
                 vorbis_comment_add_tag(vc, line, utf8_value);
@@ -307,9 +306,6 @@
         param->comments=NULL;
         param->tempoutfile=0;
 
-	/* character encoding */
-	param->encoding = "ISO-8859-1";
-
         return param;
 }
 
@@ -326,8 +322,10 @@
 {
         int ret;
         int option_index = 1;
+
+	setlocale(LC_ALL, "");
 
-	while ((ret = getopt_long(argc, argv, "ae:lwhqc:t:",
+	while ((ret = getopt_long(argc, argv, "alwhqc:t:",
                         long_options, &option_index)) != -1) {
                 switch (ret) {
                         case 0:
@@ -342,9 +340,6 @@
                                 break;
                         case 'a':
                                 param->mode = MODE_APPEND;
-				break;
-			case 'e':
-				param->encoding = strdup(optarg);
                                 break;
                         case 'h':
                                 usage();

--- >8 ----
List archives:  http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body.  No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.