[xiph-cvs] cvs commit: vorbis-tools/vorbiscomment vcomment.c
Michael Smith
msmith at xiph.org
Mon Oct 1 20:03:45 PDT 2001
msmith 01/10/01 20:03:44
Modified: . acinclude.m4 configure.in
include utf8.h
oggenc oggenc.c
share Makefile.am utf8.c
vorbiscomment vcomment.c
Added: share charmaps.h charset.c charset_test.c iconvert.c
makemap.c
Removed: share 8859-1.map 8859-2.map make_code_map.pl
Log:
BIG patch for sane and complete UTF conversion code (except on win32, where
it's not yet complete, and probably doesn't compile any more), from
Edmund Grimley Evans <edmundo at rano.org>
PLEASE test this thoroughly, everyone.
Revision Changes Path
1.12 +16 -0 vorbis-tools/acinclude.m4
Index: acinclude.m4
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/acinclude.m4,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- acinclude.m4 2001/08/21 14:05:09 1.11
+++ acinclude.m4 2001/10/02 03:03:41 1.12
@@ -430,3 +430,19 @@
fi
AC_SUBST(LIBICONV)
])
+
+dnl From Bruno Haible.
+dnl
+AC_DEFUN([AM_LANGINFO_CODESET],
+[
+ AC_CACHE_CHECK([for nl_langinfo and CODESET], am_cv_langinfo_codeset,
+ [AC_TRY_LINK([#include <langinfo.h>],
+ [char* cs = nl_langinfo(CODESET);],
+ am_cv_langinfo_codeset=yes,
+ am_cv_langinfo_codeset=no)
+ ])
+ if test $am_cv_langinfo_codeset = yes; then
+ AC_DEFINE(HAVE_LANGINFO_CODESET, 1,
+ [Define if you have <langinfo.h> and nl_langinfo(CODESET).])
+ fi
+])
1.31 +1 -0 vorbis-tools/configure.in
Index: configure.in
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/configure.in,v
retrieving revision 1.30
retrieving revision 1.31
diff -u -r1.30 -r1.31
--- configure.in 2001/09/23 01:59:41 1.30
+++ configure.in 2001/10/02 03:03:41 1.31
@@ -111,6 +111,7 @@
AM_ICONV
AC_FUNC_SMMAP
+AM_LANGINFO_CODESET
dnl --------------------------------------------------
dnl Work around FHS stupidity
1.2 +18 -13 vorbis-tools/include/utf8.h
Index: utf8.h
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/include/utf8.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- utf8.h 2001/09/22 22:49:49 1.1
+++ utf8.h 2001/10/02 03:03:41 1.2
@@ -1,18 +1,23 @@
-/* OggEnc
+
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ *
+ * If the locale's charset is not set explicitly then it is
+ * obtained using nl_langinfo(CODESET), where available, the
+ * environment variable CHARSET, or assumed to be US-ASCII.
*
- * This program is distributed under the GNU General Public License, version 2.
- * A copy of this license is included with this source.
+ * Return value of conversion functions:
*
- * Copyright © 2001, Daniel Resare <noa at metamatrix.se>
+ * -1 : memory allocation failed
+ * 0 : data was converted exactly
+ * 1 : valid data was converted approximately (using '?')
+ * 2 : input was invalid (but still converted, using '#')
+ * 3 : unknown encoding (but still converted, using '?')
*/
-typedef struct
-{
- char* name;
- int mapping[256];
-} charset_map;
+void convert_set_charset(const char *charset);
-charset_map *get_map(const char *encoding);
-char *make_utf8_string(const unsigned short *unicode);
-int simple_utf8_encode(const char *from, char **to, const char *encoding);
-int utf8_encode(char *from, char **to, const char *encoding);
+int utf8_encode(const char *from, char **to);
+int utf8_decode(const char *from, char **to);
1.32 +6 -9 vorbis-tools/oggenc/oggenc.c
Index: oggenc.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/oggenc.c,v
retrieving revision 1.31
retrieving revision 1.32
diff -u -r1.31 -r1.32
--- oggenc.c 2001/09/30 00:01:53 1.31
+++ oggenc.c 2001/10/02 03:03:42 1.32
@@ -15,6 +15,7 @@
#include <getopt.h>
#include <string.h>
#include <time.h>
+#include <locale.h>
#include "platform.h"
#include "encode.h"
@@ -50,7 +51,6 @@
{"date",1,0,'d'},
{"tracknum",1,0,'N'},
{"serial",1,0,'s'},
- {"encoding",1,0,'e'},
{NULL,0,0,0}
};
@@ -75,6 +75,8 @@
int numfiles;
int errors=0;
+ setlocale(LC_ALL, "");
+
parse_options(argc, argv, &opt);
if(optind >= argc)
@@ -320,8 +322,6 @@
" -s, --serial Specify a serial number for the stream. If encoding\n"
" multiple files, this will be incremented for each\n"
" stream after the first.\n"
- " -e, --encoding Specify an encoding for the comments given (not\n"
- " supported on windows)\n"
"\n"
" Naming:\n"
" -o, --output=fn Write file to fn (only valid in single-file mode)\n"
@@ -477,7 +477,7 @@
int ret;
int option_index = 1;
- while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:e:G:hl:m:M:n:N:o:P:q:QrR:s:t:vX:",
+ while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:G:hl:m:M:n:N:o:P:q:QrR:s:t:vX:",
long_options, &option_index)) != -1)
{
switch(ret)
@@ -498,9 +498,6 @@
opt->dates = realloc(opt->dates, (++opt->date_count)*sizeof(char *));
opt->dates[opt->date_count - 1] = strdup(optarg);
break;
- case 'e':
- opt->encoding = strdup(optarg);
- break;
case 'G':
opt->genre = realloc(opt->genre, (++opt->genre_count)*sizeof(char *));
opt->genre[opt->genre_count - 1] = strdup(optarg);
@@ -646,7 +643,7 @@
static void add_tag(vorbis_comment *vc, oe_options *opt,char *name, char *value)
{
char *utf8;
- if(utf8_encode(value, &utf8, opt->encoding) == 0)
+ if(utf8_encode(value, &utf8) >= 0)
{
if(name == NULL)
vorbis_comment_add(vc, utf8);
@@ -655,7 +652,7 @@
free(utf8);
}
else
- fprintf(stderr, "Couldn't convert comment to UTF8, cannot add\n");
+ fprintf(stderr, "Couldn't convert comment to UTF-8, cannot add\n");
}
static void build_comments(vorbis_comment *vc, oe_options *opt, int filenum,
1.3 +2 -3 vorbis-tools/share/Makefile.am
Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/Makefile.am,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Makefile.am 2001/09/22 23:13:50 1.2
+++ Makefile.am 2001/10/02 03:03:42 1.3
@@ -6,12 +6,11 @@
noinst_LIBRARIES = libutf8.a libgetopt.a
-libutf8_a_SOURCES = utf8.c
-MAP_FILES = 8859-1.map 8859-2.map
+libutf8_a_SOURCES = charset.c iconvert.c utf8.c
libgetopt_a_SOURCES = getopt.c getopt1.c
-EXTRA_DIST = $(MAP_FILES) charsetmap.h make_code_map.pl
+EXTRA_DIST = charmaps.h makemap.c charset_test.c
debug:
$(MAKE) all CFLAGS="@DEBUG@"
1.3 +98 -217 vorbis-tools/share/utf8.c
Index: utf8.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/utf8.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- utf8.c 2001/09/25 08:59:54 1.2
+++ utf8.c 2001/10/02 03:03:42 1.3
@@ -1,30 +1,40 @@
-/* OggEnc
- *
- * This program is distributed under the GNU General Public License, version 2.
- * A copy of this license is included with this source.
- *
- * (C) 2001 Michael Smith <msmith at labyrinth.net.au>
+/*
+ * Copyright (C) 2001 Peter Harris <peter.harris at hummingbird.com>
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
*
- * UTF-8 Conversion routines
- * Copyright (C) 2001, Daniel Resare <noa at metamatrix.se>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <stdio.h>
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ */
+
#include <stdlib.h>
#include <string.h>
+
#include "utf8.h"
#ifdef _WIN32
+#include <stdio.h>
#include <windows.h>
-int utf8_encode(char *from, char **to, const char *encoding)
+int utf8_encode(const char *from, char **to)
{
/* Thanks to Peter Harris <peter.harris at hummingbird.com> for this win32
* code.
- *
- * We ignore 'encoding' and assume that the input is in the 'code page'
- * of the console. Reasonable, since oggenc is a console app.
*/
unsigned short *unicode;
@@ -36,14 +46,14 @@
if(wchars == 0)
{
fprintf(stderr, "Unicode translation error %d\n", GetLastError());
- return 1;
+ return -1;
}
unicode = calloc(wchars + 1, sizeof(unsigned short));
if(unicode == NULL)
{
fprintf(stderr, "Out of memory processing string to UTF8\n");
- return 1;
+ return -1;
}
err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
@@ -52,7 +62,7 @@
{
free(unicode);
fprintf(stderr, "Unicode translation error %d\n", GetLastError());
- return 1;
+ return -1;
}
/* On NT-based windows systems, we could use WideCharToMultiByte(), but
@@ -64,234 +74,105 @@
return 0;
}
-int utf8_decode(char *from, char **to, const char *encoding)
+int utf8_decode(const char *from, char **to)
{
- return 1; /* Dummy stub */
+ return -1; /* Dummy stub */
}
#else /* End win32. Rest is for real operating systems */
-#ifdef HAVE_ICONV
-#include <iconv.h>
-#include <errno.h>
-#endif
-#include "charsetmap.h"
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
-#define BUFSIZE 256
+int iconvert(const char *fromcode, const char *tocode,
+ const char *from, size_t fromlen,
+ char **to, size_t *tolen);
-/*
- Converts the string FROM from the encoding specified in ENCODING
- to UTF-8. The resulting string i pointed to by *TO.
+static char *current_charset = 0; /* means "US-ASCII" */
- Return values:
- 0 indicates a successfully converted string.
- 1 indicates that the given encoding is not available.
- 2 indicates that the given string is bigger than BUFSIZE and can therefore
- not be encoded.
- 3 indicates that given string could not be parsed.
-*/
-int utf8_encode(char *from, char **to, const char *encoding)
+void convert_set_charset(const char *charset)
{
-#ifdef HAVE_ICONV
- static unsigned char buffer[BUFSIZE];
- char *from_p, *to_p;
- size_t from_left, to_left;
- iconv_t cd;
+
+#ifdef HAVE_LANGINFO_CODESET
+ if (!charset)
+ charset = nl_langinfo(CODESET);
#endif
- if (!strcasecmp(encoding, "UTF-8")) {
- /* ideally some checking of the given string should be done */
- *to = malloc(strlen(from) + 1);
- strcpy(*to, from);
- return 0;
- }
+ if (!charset)
+ charset = getenv("CHARSET");
-#ifdef HAVE_ICONV
- cd = iconv_open("UTF-8", encoding);
- if(cd == (iconv_t)(-1))
- {
- if(errno == EINVAL) {
- /* if iconv can't encode from this encoding, try
- * simple_utf8_encode()
- */
- return simple_utf8_encode(from, to, encoding);
- } else {
- perror("iconv_open");
- }
- }
-
- from_left = strlen(from);
- to_left = BUFSIZE;
- from_p = from;
- to_p = buffer;
-
- if(iconv(cd, (ICONV_CONST char **)(&from_p), &from_left, &to_p,
- &to_left) == (size_t)-1)
- {
- iconv_close(cd);
- switch(errno)
- {
- case E2BIG:
- /* if the buffer is too small, try simple_utf8_encode()
- */
- return simple_utf8_encode(from, to, encoding);
- case EILSEQ:
- case EINVAL:
- return 3;
- default:
- perror("iconv");
- }
- }
- else
- {
- iconv_close(cd);
- }
- *to = malloc(BUFSIZE - to_left + 1);
- buffer[BUFSIZE - to_left] = 0;
- strcpy(*to, buffer);
- return 0;
-#else
- return simple_utf8_encode(from, to, encoding);
-#endif
+ free(current_charset);
+ current_charset = 0;
+ if (charset && *charset)
+ current_charset = strdup(charset);
}
-/*
- This implementation has the following limitations: The given charset must
- represent each glyph with exactly one (1) byte. No multi byte or variable
- width charsets are allowed. (An exception to this i UTF-8 that is passed
- right through.) The glyhps in the charsets must have a unicode value equal
- to or less than 0xFFFF (this inclues pretty much everything). For a complete,
- free conversion implementation please have a look at libiconv.
-*/
-int simple_utf8_encode(const char *from, char **to, const char *encoding)
+static int convert_buffer(const char *fromcode, const char *tocode,
+ const char *from, size_t fromlen,
+ char **to, size_t *tolen)
{
- /* can you always know this will be 16 bit? */
- unsigned short *unicode;
- charset_map *map;
- int index = 0;
- unsigned char c;
-
- unicode = calloc((strlen(from) + 1), sizeof(short));
-
- map = get_map(encoding);
-
- if (map == NULL)
- return 1;
+ int ret = -1;
- c = from[index];
- while(c)
- {
- unicode[index] = map->mapping[c];
- index++;
- c = from[index];
- }
+#ifdef HAVE_ICONV
+ ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
+ if (ret != -1)
+ return ret;
+#endif
- *to = make_utf8_string(unicode);
- free(unicode);
- return 0;
-}
+#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
+ ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
+ if (ret != -1)
+ return ret;
+#endif
-int utf8_decode(char *from, char **to, const char *encoding)
-{
-#ifdef HAVE_ICONV
- static unsigned char buffer[BUFSIZE];
- char *from_p, *to_p;
- size_t from_left, to_left;
- iconv_t cd;
- cd = iconv_open(encoding, "UTF-8");
- if(cd == (iconv_t)(-1))
- {
- perror("iconv_open");
- }
-
- from_left = strlen(from);
- to_left = BUFSIZE;
- from_p = from;
- to_p = buffer;
-
- if(iconv(cd, (ICONV_CONST char **)(&from_p), &from_left, &to_p,
- &to_left) == (size_t)-1)
- {
- iconv_close(cd);
- switch(errno)
- {
- case E2BIG:
- case EILSEQ:
- case EINVAL:
- return 3;
- default:
- perror("iconv");
- }
- }
- else
- {
- iconv_close(cd);
- }
- *to = malloc(BUFSIZE - to_left + 1);
- buffer[BUFSIZE - to_left] = 0;
- strcpy(*to, buffer);
- return 0;
-#else
- return 1; /* Dummy stub */
-#endif /* HAVE_ICONV */
+ return ret;
}
-charset_map *get_map(const char *encoding)
+static int convert_string(const char *fromcode, const char *tocode,
+ const char *from, char **to, char replace)
{
- charset_map *map_p = maps;
- while(map_p->name != NULL)
- {
- if(!strcasecmp(map_p->name, encoding))
- {
- return map_p;
- }
- map_p++;
- }
- return NULL;
-}
+ int ret;
+ size_t fromlen;
+ char *s;
-#endif /* The rest is used by everthing */
+ fromlen = strlen(from);
+ ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
+ if (ret == -2)
+ return -1;
+ if (ret != -1)
+ return ret;
-char *make_utf8_string(const unsigned short *unicode)
+ s = malloc(fromlen + 1);
+ if (!s)
+ return -1;
+ strcpy(s, from);
+ *to = s;
+ for (; *s; s++)
+ if (*s & ~0x7f)
+ *s = replace;
+ return 3;
+}
+
+int utf8_encode(const char *from, char **to)
{
- int size = 0, index = 0, out_index = 0;
- unsigned char *out;
- unsigned short c;
-
- /* first calculate the size of the target string */
- c = unicode[index++];
- while(c) {
- if(c < 0x0080) {
- size += 1;
- } else if(c < 0x0800) {
- size += 2;
- } else {
- size += 3;
- }
- c = unicode[index++];
- }
+ char *charset;
- out = malloc(size + 1);
- index = 0;
+ if (!current_charset)
+ convert_set_charset(0);
+ charset = current_charset ? current_charset : "US-ASCII";
+ return convert_string(charset, "UTF-8", from, to, '#');
+}
- c = unicode[index++];
- while(c)
- {
- if(c < 0x080) {
- out[out_index++] = c;
- } else if(c < 0x800) {
- out[out_index++] = 0xc0 | (c >> 6);
- out[out_index++] = 0x80 | (c & 0x3f);
- } else {
- out[out_index++] = 0xe0 | (c >> 12);
- out[out_index++] = 0x80 | ((c >> 6) & 0x3f);
- out[out_index++] = 0x80 | (c & 0x3f);
- }
- c = unicode[index++];
- }
- out[out_index] = 0x00;
+int utf8_decode(const char *from, char **to)
+{
+ char *charset;
- return out;
+ if (!current_charset)
+ convert_set_charset(0);
+ charset = current_charset ? current_charset : "US-ASCII";
+ return convert_string("UTF-8", charset, from, to, '?');
}
+#endif
1.1 vorbis-tools/share/charmaps.h
Index: charmaps.h
===================================================================
/*
* If you need to generate more maps, use makemap.c on a system
* with a decent iconv.
*/
/*
 * Forward table for ISO-8859-2: entry [b] is the Unicode code point
 * that byte value b stands for.
 */
static const unsigned short mapping_iso_8859_2[256] = {
  0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
  0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
  0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
  0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
  0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
  0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
  0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
  0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
  0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
  0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
  0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
  0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
  0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
  0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
  0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
  0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
  0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
  0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
  0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
  0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
  0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
  0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
  0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
  0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
};
/*
 * Registry of table-driven 8-bit charsets, terminated by an all-zero
 * entry.  The charset pointer starts out as 0 and is filled in the
 * first time the charset is looked up by name.
 */
static struct {
  const char *name;           /* canonical charset name */
  const unsigned short *map;  /* 256-entry byte -> code point table */
  struct charset *charset;    /* lazily created charset object, or 0 */
} maps[] = {
  { "ISO-8859-2", mapping_iso_8859_2, 0 },
  { 0, 0, 0 }
};
/*
 * Alias table translating charset names that are not the preferred
 * (MIME) spelling into the preferred one.  Terminated by { 0, 0 }.
 */
static const struct {
  const char *bad;   /* name as it may be reported by the system */
  const char *good;  /* preferred name to use instead */
} names[] = {
  { "ANSI_X3.4-1968", "us-ascii" },
  { 0, 0 }
};
1.1 vorbis-tools/share/charset.c
Index: charset.c
===================================================================
/*
* Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* See the corresponding header file for a description of the functions
* that this file provides.
*
* This was first written for Ogg Vorbis but could be of general use.
*
* The only deliberate assumption about data sizes is that a short has
* at least 16 bits, but this code has only been tested on systems with
* 8-bit char, 16-bit short and 32-bit int.
*/
#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
#include <stdlib.h>
#include "charset.h"
#include "charmaps.h"
/*
* This is like the standard strcasecmp, but it does not depend
* on the locale. Locale-dependent functions can be dangerous:
* we once had a bug involving strcasecmp("iso", "ISO") in a
* Turkish locale!
*
* (I'm not really sure what the official standard says
* about the sign of strcasecmp("Z", "["), but usually
* we're only interested in whether it's zero.)
*/
/*
 * Compare S1 and S2, treating only the ASCII letters 'a'..'z' and
 * 'A'..'Z' as case-equivalent.
 *
 * Returns 0 when the strings are equal under that folding, non-zero
 * otherwise.  The non-zero value is the difference of the raw
 * (unfolded) bytes at the first mismatch, so only its zero-ness
 * should be relied upon.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
  char c1, c2;

  for (;; s1++, s2++) {
    /* Stop as soon as either string ends. */
    if (!*s1 || !*s2)
      break;
    if (*s1 == *s2)
      continue;
    c1 = *s1;
    if ('a' <= c1 && c1 <= 'z')
      c1 += 'A' - 'a';
    c2 = *s2;
    if ('a' <= c2 && c2 <= 'z')
      c2 += 'A' - 'a';
    if (c1 != c2)
      break;
  }
  return (unsigned char)*s1 - (unsigned char)*s2;
}
/*
* UTF-8 equivalents of the C library's wctomb() and mbtowc().
*/
/*
 * Decode one UTF-8 sequence (1 to 6 bytes) starting at S, looking at
 * no more than N bytes.
 *
 * pwc  where to store the decoded code point; may be null.
 * s    input bytes; may be null.
 * n    number of bytes available at S.
 *
 * Returns the number of bytes consumed, 0 if N is zero, S is null or
 * the decoded character is NUL, and -1 if the sequence is invalid,
 * truncated, or an overlong encoding.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
  unsigned char c;
  int wc, i, k;

  if (!n || !s)
    return 0;

  c = *s;
  if (c < 0x80) {
    if (pwc)
      *pwc = c;
    return c ? 1 : 0;
  }
  else if (c < 0xc2)
    /* Stray continuation byte, or overlong 2-byte lead (0xc0, 0xc1). */
    return -1;
  else if (c < 0xe0) {
    /* The second byte must be a continuation byte (10xxxxxx). */
    if (n >= 2 && (s[1] & 0xc0) == 0x80) {
      if (pwc)
        *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
      return 2;
    }
    else
      return -1;
  }
  else if (c < 0xf0)
    k = 3;
  else if (c < 0xf8)
    k = 4;
  else if (c < 0xfc)
    k = 5;
  else if (c < 0xfe)
    k = 6;
  else
    return -1;

  if (n < (size_t)k)
    return -1;

  /* Payload bits of the lead byte, then 6 bits per continuation byte. */
  wc = *s++ & ((1 << (7 - k)) - 1);
  for (i = 1; i < k; i++) {
    if ((*s & 0xc0) != 0x80)
      return -1;
    wc = (wc << 6) | (*s++ & 0x3f);
  }

  /* Reject values that would have fitted in a shorter sequence. */
  if (wc < (1 << (5 * k - 4)))
    return -1;
  if (pwc)
    *pwc = wc;
  return k;
}
/*
 * Encode the code point WC1 as UTF-8 into S, using the 1- to 6-byte
 * forms (values up to 0x7fffffff).
 *
 * s    output buffer with room for up to 6 bytes; may be null.
 * wc1  code point to encode.
 *
 * Returns the number of bytes written, 0 if S is null, and -1 if WC1
 * is outside the encodable range (negative, once viewed as unsigned).
 * No terminating NUL is written.
 */
int utf8_wctomb(char *s, int wc1)
{
  unsigned int wc = wc1;

  if (!s)
    return 0;

  /* Unsigned constants throughout: 1 << 31 would overflow a 32-bit int. */
  if (wc < (1u << 7)) {
    *s++ = (char)wc;
    return 1;
  }
  else if (wc < (1u << 11)) {
    *s++ = (char)(0xc0 | (wc >> 6));
    *s++ = (char)(0x80 | (wc & 0x3f));
    return 2;
  }
  else if (wc < (1u << 16)) {
    *s++ = (char)(0xe0 | (wc >> 12));
    *s++ = (char)(0x80 | ((wc >> 6) & 0x3f));
    *s++ = (char)(0x80 | (wc & 0x3f));
    return 3;
  }
  else if (wc < (1u << 21)) {
    *s++ = (char)(0xf0 | (wc >> 18));
    *s++ = (char)(0x80 | ((wc >> 12) & 0x3f));
    *s++ = (char)(0x80 | ((wc >> 6) & 0x3f));
    *s++ = (char)(0x80 | (wc & 0x3f));
    return 4;
  }
  else if (wc < (1u << 26)) {
    *s++ = (char)(0xf8 | (wc >> 24));
    *s++ = (char)(0x80 | ((wc >> 18) & 0x3f));
    *s++ = (char)(0x80 | ((wc >> 12) & 0x3f));
    *s++ = (char)(0x80 | ((wc >> 6) & 0x3f));
    *s++ = (char)(0x80 | (wc & 0x3f));
    return 5;
  }
  else if (wc < (1u << 31)) {
    *s++ = (char)(0xfc | (wc >> 30));
    *s++ = (char)(0x80 | ((wc >> 24) & 0x3f));
    *s++ = (char)(0x80 | ((wc >> 18) & 0x3f));
    *s++ = (char)(0x80 | ((wc >> 12) & 0x3f));
    *s++ = (char)(0x80 | ((wc >> 6) & 0x3f));
    *s++ = (char)(0x80 | (wc & 0x3f));
    return 6;
  }
  else
    return -1;
}
/*
* The charset "object" and methods.
*/
/*
 * A charset is a pair of conversion callbacks plus the private table
 * they operate on.
 */
struct charset {
  int min, max;  /* fewest and most bytes one character may occupy */
  /* Decode one character from S (at most N bytes) into *PWC. */
  int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
  /* Encode WC into S. */
  int (*wctomb)(void *table, char *s, int wc);
  void *map;     /* passed to both callbacks as TABLE; may be null */
};

/* Decode one character with CHARSET's decoder; returns its result. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
  return (*charset->mbtowc)(charset->map, pwc, s, n);
}

/* Encode WC with CHARSET's encoder; returns its result. */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
  return (*charset->wctomb)(charset->map, s, wc);
}

/* Smallest number of bytes a character occupies in CHARSET. */
int charset_min(struct charset *charset)
{
  return charset->min;
}

/* Largest number of bytes a character occupies in CHARSET. */
int charset_max(struct charset *charset)
{
  return charset->max;
}
/*
* Implementation of UTF-8.
*/
/* Charset decoder callback for UTF-8; MAP is unused. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
  (void)map;
  return utf8_mbtowc(pwc, s, n);
}
/* Charset encoder callback for UTF-8; MAP is unused. */
static int wctomb_utf8(void *map, char *s, int wc)
{
  (void)map;
  return utf8_wctomb(s, wc);
}
/*
* Implementation of US-ASCII.
* Probably on most architectures this compiles to less than 256 bytes
* of code, so we can save space by not having a table for this one.
*/
/*
 * Charset decoder callback for US-ASCII; MAP is unused.
 *
 * Returns 1 for a valid non-NUL character, 0 if N is zero, S is null
 * or the character is NUL, and -1 if the byte has its high bit set.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
  int wc;

  (void)map;
  if (!n || !s)
    return 0;
  wc = (unsigned char)*s;
  if (wc & ~0x7f)
    return -1;
  if (pwc)
    *pwc = wc;
  return wc ? 1 : 0;
}
/*
 * Charset encoder callback for US-ASCII; MAP is unused.
 *
 * Returns 1 after storing WC in *S, 0 if S is null, and -1 (leaving
 * *S untouched) if WC is outside 0..0x7f.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
  (void)map;
  if (!s)
    return 0;
  if (wc & ~0x7f)
    return -1;
  *s = wc;
  return 1;
}
/*
* Implementation of ISO-8859-1.
* Probably on most architectures this compiles to less than 256 bytes
* of code, so we can save space by not having a table for this one.
*/
/*
 * Charset decoder callback for ISO-8859-1; MAP is unused.
 * Each byte value is its own code point.
 *
 * Returns 1 for a non-NUL character, 0 if N is zero, S is null or the
 * character is NUL, and -1 if the byte value does not fit in 8 bits.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
  int wc;

  (void)map;
  if (!n || !s)
    return 0;
  wc = (unsigned char)*s;
  if (wc & ~0xff)
    return -1;
  if (pwc)
    *pwc = wc;
  return wc ? 1 : 0;
}
/*
 * Charset encoder callback for ISO-8859-1; MAP is unused.
 *
 * Returns 1 after storing WC in *S, 0 if S is null, and -1 (leaving
 * *S untouched) if WC is outside 0..0xff.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
  (void)map;
  if (!s)
    return 0;
  if (wc & ~0xff)
    return -1;
  *s = wc;
  return 1;
}
/*
* Implementation of any 8-bit charset.
*/
/* Private table of a table-driven 8-bit charset. */
struct map {
  const unsigned short *from;  /* byte -> code point; 0xffff = unmapped */
  struct inverse_map *to;      /* code point -> byte hash, built on demand */
};
/*
 * Charset decoder callback for a table-driven 8-bit charset.
 * MAP1 points to the charset's struct map.
 *
 * Returns 1 for a non-NUL character, 0 if N is zero, S is null or the
 * byte maps to NUL, and -1 if the byte is marked unmapped (0xffff).
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
  struct map *map = map1;
  unsigned short wc;

  if (!n || !s)
    return 0;
  wc = map->from[(unsigned char)*s];
  if (wc == 0xffff)
    return -1;
  if (pwc)
    *pwc = (int)wc;
  return wc ? 1 : 0;
}
/*
* For the inverse map we use a hash table, which has the advantages
* of small constant memory requirement and simple memory allocation,
* but the disadvantage of slow conversion in the worst case.
* If you need real-time performance while letting a potentially
* malicious user define their own map, then the method used in
* linux/drivers/char/consolemap.c would be more appropriate.
*/
/*
 * Hash table from code point to byte value, chained through byte
 * indices rather than pointers.
 */
struct inverse_map {
  unsigned char first[256];  /* per hash bucket: first byte index in chain */
  unsigned char next[256];   /* per byte index: next in chain, 0 ends it */
};
/*
* The simple hash is good enough for this application.
* Use the alternative trivial hashes for testing.
*/
#define HASH(i) ((i) & 0xff)
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */
/*
 * Build the code point -> byte hash table for the forward table FROM
 * (256 entries, 0xffff meaning "unmapped").
 *
 * Returns a newly malloc'ed table, or 0 if allocation fails.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
{
  struct inverse_map *to;
  char used[256];
  int i, j, k;

  to = malloc(sizeof *to);
  if (!to)
    return 0;

  for (i = 0; i < 256; i++)
    to->first[i] = to->next[i] = used[i] = 0;

  /*
   * Insert from the highest byte downwards, each at the head of its
   * bucket, so that byte 0 can only ever be the head of a chain and a
   * next[] value of 0 is free to mean "end of chain".
   */
  for (i = 255; i >= 0; i--) {
    if (from[i] != 0xffff) {
      k = HASH(from[i]);
      to->next[i] = to->first[k];
      to->first[k] = i;
      used[k] = 1;
    }
  }

  /* Point the empty buckets at an empty list. */
  for (i = 0; i < 256; i++) {
    if (!to->next[i])
      break;
  }
  if (i < 256) {
    for (j = 0; j < 256; j++) {
      if (!used[j])
        to->first[j] = i;
    }
  }

  return to;
}
/*
 * Charset encoder callback for a table-driven 8-bit charset.
 * MAP1 points to the charset's struct map; its inverse table is
 * created on first use, and a linear search of the forward table is
 * used when that table cannot be allocated.
 *
 * Returns 1 after storing the byte for WC1 in *S, 0 if S is null, and
 * -1 if WC1 has no byte in this charset.
 */
int wctomb_8bit(void *map1, char *s, int wc1)
{
  struct map *map = map1;
  unsigned short wc = wc1;
  int i;

  if (!s)
    return 0;

  /*
   * 0xffff is the "unmapped" marker in the forward table, so it must
   * never be looked up: it would compare equal to an unmapped slot.
   */
  if ((wc1 & ~0xffff) || wc == 0xffff)
    return -1;

  if (1) { /* Change 1 to 0 to test the case where malloc fails. */
    if (!map->to)
      map->to = make_inverse_map(map->from);
  }

  if (map->to) {
    /* Use the inverse map. */
    i = map->to->first[HASH(wc)];
    for (;;) {
      if (map->from[i] == wc) {
        *s = i;
        return 1;
      }
      if (!(i = map->to->next[i]))
        break;
    }
  }
  else {
    /* We don't have an inverse map, so do a linear search. */
    for (i = 0; i < 256; i++) {
      if (map->from[i] == wc) {
        *s = i;
        return 1;
      }
    }
  }

  return -1;
}
/*
* The "constructor" charset_find().
*/
truct charset charset_utf8 = {
1, 6,
&mbtowc_utf8,
&wctomb_utf8,
0
};
truct charset charset_iso1 = {
1, 1,
&mbtowc_iso1,
&wctomb_iso1,
0
};
truct charset charset_ascii = {
1, 1,
&mbtowc_ascii,
&wctomb_ascii,
0
};
truct charset *charset_find(const char *code)
{
int i;
/* Find good (MIME) name. */
for (i = 0; names[i].bad; i++)
if (!ascii_strcasecmp(code, names[i].bad)) {
code = names[i].good;
break;
}
/* Recognise some charsets for which we avoid using a table. */
if (!ascii_strcasecmp(code, "UTF-8"))
return &charset_utf8;
if (!ascii_strcasecmp(code, "US-ASCII"))
return &charset_ascii;
if (!ascii_strcasecmp(code, "ISO-8859-1"))
return &charset_iso1;
/* Look for a mapping for a simple 8-bit encoding. */
for (i = 0; maps[i].name; i++)
if (!ascii_strcasecmp(code, maps[i].name)) {
if (!maps[i].charset) {
maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
if (maps[i].charset) {
struct map *map = (struct map *)malloc(sizeof(struct map));
if (!map) {
free(maps[i].charset);
maps[i].charset = 0;
}
else {
maps[i].charset->min = 1;
maps[i].charset->max = 1;
maps[i].charset->mbtowc = &mbtowc_8bit;
maps[i].charset->wctomb = &wctomb_8bit;
maps[i].charset->map = map;
map->from = maps[i].map;
map->to = 0; /* inverse mapping is created when required */
}
}
}
return maps[i].charset;
}
return 0;
}
/*
* Function to convert a buffer from one encoding to another.
* Invalid bytes are replaced by '#', and characters that are
* not available in the target encoding are replaced by '?'.
* Each of TO and TOLEN may be zero, if the result is not needed.
* The output buffer is null-terminated, so it is all right to
* use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
*/
/*
 * Convert FROMLEN bytes at FROM from charset FROMCODE to charset
 * TOCODE, one character at a time.
 *
 * On success *TO (if TO is non-null) receives a malloc'ed,
 * null-terminated result and *TOLEN (if TOLEN is non-null) its length
 * without the terminator.
 *
 * Returns 0 for an exact conversion, 1 if some character had to be
 * replaced by '?', 2 if some input byte was invalid and replaced by
 * '#', -1 if either charset is unknown, -2 if allocation failed.
 */
int charset_convert(const char *fromcode, const char *tocode,
                    const char *from, size_t fromlen,
                    char **to, size_t *tolen)
{
  struct charset *src, *dst;
  char *outbuf, *out, *shrunk;
  int status = 0;
  int consumed, produced, wc;

  src = charset_find(fromcode);
  dst = charset_find(tocode);
  if (!src || !dst )
    return -1;

  /* Worst case: every input character is as short and every output
     character as long as the charsets allow; plus the terminator. */
  outbuf = (char *)malloc((fromlen / src->min) * dst->max + 1);
  if (!outbuf)
    return -2;

  out = outbuf;
  while (fromlen) {
    consumed = charset_mbtowc(src, &wc, from, fromlen);
    if (consumed == 0) {
      /* A NUL character still occupies one input byte. */
      consumed = 1;
    }
    else if (consumed == -1) {
      /* Invalid input: skip one byte and emit '#' in its place. */
      consumed = 1;
      wc = '#';
      status = 2;
    }

    produced = charset_wctomb(dst, out, wc);
    if (produced == -1) {
      /* Not representable in the target: fall back to '?'. */
      if (status == 0)
        status = 1;
      produced = charset_wctomb(dst, out, '?');
      if (produced == -1)
        produced = 0;
    }

    from += consumed;
    fromlen -= consumed;
    out += produced;
  }

  if (tolen)
    *tolen = out - outbuf;
  *out++ = '\0';

  if (!to) {
    free(outbuf);
    return status;
  }

  /* Trim the buffer; keep the original if shrinking fails. */
  shrunk = realloc(outbuf, out - outbuf);
  *to = shrunk ? shrunk : outbuf;
  return status;
}
#endif /* !HAVE_ICONV (should be USE_CHARSET_CONVERT) */
1.1 vorbis-tools/share/charset_test.c
Index: charset_test.c
===================================================================
/*
* Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <assert.h>
#include <string.h>
#include "charset.h"
/*
 * Checks that every charset is expected to pass: argument edge cases
 * and round-tripping of NUL and the ASCII letter 'x'.
 */
void test_any(struct charset *charset)
{
int wc;
char s[2];
assert(charset);
/* Decoder */
/* A zero length yields 0 even when the pointer is bogus. */
assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
/* A null input pointer yields 0. */
assert(charset_mbtowc(charset, 0, 0, 1) == 0);
assert(charset_mbtowc(charset, &wc, "x", 0) == 0);
assert(charset_mbtowc(charset, &wc, "x", 1) == 1 && wc == 'x');
assert(charset_mbtowc(charset, &wc, "x", 2) == 1 && wc == 'x');
/* NUL is stored in wc but reported as length 0. */
assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0);
/* Encoder */
/* A null output pointer yields 0. */
assert(charset_wctomb(charset, 0, 0) == 0);
/* s[1] is a sentinel: exactly one byte may be written. */
s[0] = s[1] = '.';
assert(charset_wctomb(charset, s, 0) == 1 &&
s[0] == '\0' && s[1] == '.');
assert(charset_wctomb(charset, s, 'x') == 1 &&
s[0] == 'x' && s[1] == '.');
}
/*
 * UTF-8 specific checks: boundaries between sequence lengths,
 * rejection of overlong and truncated sequences, and the encoder's
 * output at each length boundary.
 *
 * The encoder assertions share the buffer s and must run in this
 * order: each expected string includes the bytes left behind by the
 * previous, longer-or-equal write.
 */
void test_utf8()
{
  struct charset *charset;
  int wc;
  char s[8];

  charset = charset_find("UTF-8");
  test_any(charset);

  /* Decoder */
  wc = 0;
  assert(charset_mbtowc(charset, &wc, "\177", 1) == 1 && wc == 127);
  assert(charset_mbtowc(charset, &wc, "\200", 2) == -1);
  assert(charset_mbtowc(charset, &wc, "\301\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\302\200", 1) == -1);
  assert(charset_mbtowc(charset, &wc, "\302\200", 2) == 2 && wc == 128);
  assert(charset_mbtowc(charset, &wc, "\302\200", 3) == 2 && wc == 128);
  assert(charset_mbtowc(charset, &wc, "\340\237\200", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\340\240\200", 9) == 3 &&
         wc == 1 << 11);
  assert(charset_mbtowc(charset, &wc, "\360\217\277\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\360\220\200\200", 9) == 4 &&
         wc == 1 << 16);
  assert(charset_mbtowc(charset, &wc, "\370\207\277\277\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\370\210\200\200\200", 9) == 5 &&
         wc == 1 << 21);
  assert(charset_mbtowc(charset, &wc, "\374\203\277\277\277\277", 9) == -1);
  assert(charset_mbtowc(charset, &wc, "\374\204\200\200\200\200", 9) == 6 &&
         wc == 1 << 26);
  assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 &&
         wc == 0x7fffffff);

  /* Encoder */
  strcpy(s, ".......");
  /* -0x7fffffff - 1 has bit 31 set; written this way because
     1 << 31 overflows a 32-bit int. */
  assert(charset_wctomb(charset, s, -0x7fffffff - 1) == -1 &&
         !strcmp(s, "......."));
  assert(charset_wctomb(charset, s, 127) == 1 &&
         !strcmp(s, "\177......"));
  assert(charset_wctomb(charset, s, 128) == 2 &&
         !strcmp(s, "\302\200....."));
  assert(charset_wctomb(charset, s, 0x7ff) == 2 &&
         !strcmp(s, "\337\277....."));
  assert(charset_wctomb(charset, s, 0x800) == 3 &&
         !strcmp(s, "\340\240\200...."));
  assert(charset_wctomb(charset, s, 0xffff) == 3 &&
         !strcmp(s, "\357\277\277...."));
  assert(charset_wctomb(charset, s, 0x10000) == 4 &&
         !strcmp(s, "\360\220\200\200..."));
  assert(charset_wctomb(charset, s, 0x1fffff) == 4 &&
         !strcmp(s, "\367\277\277\277..."));
  assert(charset_wctomb(charset, s, 0x200000) == 5 &&
         !strcmp(s, "\370\210\200\200\200.."));
  assert(charset_wctomb(charset, s, 0x3ffffff) == 5 &&
         !strcmp(s, "\373\277\277\277\277.."));
  assert(charset_wctomb(charset, s, 0x4000000) == 6 &&
         !strcmp(s, "\374\204\200\200\200\200."));
  assert(charset_wctomb(charset, s, 0x7fffffff) == 6 &&
         !strcmp(s, "\375\277\277\277\277\277."));
}
/*
 * Checks for the "us-ascii" charset: after the shared test_any() checks,
 * verify that 127 is the highest code point accepted in either direction.
 */
void test_ascii()
{
  struct charset *ascii = charset_find("us-ascii");
  int ch;
  char out[3];

  test_any(ascii);

  /* Decoder: byte 0x7f is valid, byte 0x80 must be rejected. */
  ch = 0;
  assert(charset_mbtowc(ascii, &ch, "\177", 2) == 1);
  assert(ch == 127);
  assert(charset_mbtowc(ascii, &ch, "\200", 2) == -1);

  /* Encoder: a rejected code point must leave the buffer untouched. */
  strcpy(out, "..");
  assert(charset_wctomb(ascii, out, 256) == -1);
  assert(!strcmp(out, ".."));
  assert(charset_wctomb(ascii, out, 255) == -1);
  assert(charset_wctomb(ascii, out, 128) == -1);
  assert(charset_wctomb(ascii, out, 127) == 1);
  assert(!strcmp(out, "\177."));
}
/*
 * Checks for the "iso-8859-1" charset: after the shared test_any() checks,
 * verify that every byte decodes to the code point of the same value and
 * that code points above 255 cannot be encoded.
 */
void test_iso1()
{
  struct charset *latin1 = charset_find("iso-8859-1");
  int ch;
  char out[3];

  test_any(latin1);

  /* Decoder: only the first byte is consumed, whatever follows it. */
  ch = 0;
  assert(charset_mbtowc(latin1, &ch, "\302\200", 9) == 1);
  assert(ch == 0xc2);

  /* Encoder: 256 is out of range; 255 and 128 map to single bytes. */
  strcpy(out, "..");
  assert(charset_wctomb(latin1, out, 256) == -1);
  assert(!strcmp(out, ".."));
  assert(charset_wctomb(latin1, out, 255) == 1);
  assert(!strcmp(out, "\377."));
  assert(charset_wctomb(latin1, out, 128) == 1);
  assert(!strcmp(out, "\200."));
}
/*
 * Checks for the "iso-8859-2" charset: after the shared test_any() checks,
 * verify a few table-driven mappings that differ from ISO-8859-1.
 */
void test_iso2()
{
  struct charset *latin2 = charset_find("iso-8859-2");
  int ch;
  char out[3];

  test_any(latin2);

  /* Decoder: one byte per character; 0xff maps to U+02D9. */
  ch = 0;
  assert(charset_mbtowc(latin2, &ch, "\302\200", 9) == 1);
  assert(ch == 0xc2);
  assert(charset_mbtowc(latin2, &ch, "\377", 2) == 1);
  assert(ch == 0x2d9);

  /* Encoder: 256 and 255 are not representable; 258 and 128 are. */
  strcpy(out, "..");
  assert(charset_wctomb(latin2, out, 256) == -1);
  assert(!strcmp(out, ".."));
  assert(charset_wctomb(latin2, out, 255) == -1);
  assert(!strcmp(out, ".."));
  assert(charset_wctomb(latin2, out, 258) == 1);
  assert(!strcmp(out, "\303."));
  assert(charset_wctomb(latin2, out, 128) == 1);
  assert(!strcmp(out, "\200."));
}
/*
 * Checks for charset_convert(). The assertions below expect a return
 * value of 0 for an exact conversion, 1 when a character had to be
 * replaced with '?', and 2 when invalid input was replaced with '#'.
 *
 * NOTE(review): the buffers returned through q and r are never released;
 * presumably they are heap-allocated and owned by the caller -- confirm
 * against charset_convert() before adding free() calls.
 */
void test_convert()
{
  const char *p;
  char *q, *r;
  char s[256];
  size_t n, n2;
  int i;

  /*
   * Identity conversion of 10 bytes that begin with NUL. The data must
   * be compared with memcmp(): strcmp() would stop at the leading NUL
   * and compare nothing.
   */
  p = "\000x\302\200\375\277\277\277\277\277";
  assert(charset_convert("UTF-8", "UTF-8", p, 10, &q, &n) == 0 &&
         n == 10 && !memcmp(p, q, 10));

  /* Invalid UTF-8 is replaced byte by byte with '#'; either output
   * argument may be null. */
  assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, &n) == 2 &&
         n == 4 && !strcmp(q, "x##y"));
  assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, 0, &n) == 2 &&
         n == 4);
  assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, 0) == 2 &&
         !strcmp(q, "x##y"));

  /* A character missing from the target charset becomes '?'. */
  assert(charset_convert("UTF-8", "iso-8859-1",
                         "\302\200\304\200x", 5, &q, &n) == 1 &&
         n == 3 && !strcmp(q, "\200?x"));

  /* Exact conversions, including data with embedded NUL bytes. */
  assert(charset_convert("iso-8859-1", "UTF-8",
                         "\000\200\377", 3, &q, &n) == 0 &&
         n == 5 && !memcmp(q, "\000\302\200\303\277", 5));
  assert(charset_convert("iso-8859-1", "iso-8859-1",
                         "\000\200\377", 3, &q, &n) == 0 &&
         n == 3 && !memcmp(q, "\000\200\377", 3));
  assert(charset_convert("iso-8859-2", "utf-8", "\300", 1, &q, &n) == 0 &&
         n == 2 && !strcmp(q, "\305\224"));
  assert(charset_convert("utf-8", "iso-8859-2", "\305\224", 2, &q, &n) == 0 &&
         n == 1 && !strcmp(q, "\300"));

  /* Round trip of all 256 byte values through UTF-8 and back. */
  for (i = 0; i < 256; i++)
    s[i] = (char)i;
  assert(charset_convert("iso-8859-2", "utf-8", s, 256, &q, &n) == 0);
  assert(charset_convert("utf-8", "iso-8859-2", q, n, &r, &n2) == 0);
  assert(n2 == 256 && !memcmp(r, s, n2));
}
/*
 * Test driver: run each charset test in turn. The tests report failure
 * through assert(), so reaching the end means every check passed.
 */
int main()
{
test_utf8();
test_ascii();
test_iso1();
test_iso2();
test_convert();
/* All tests returned without tripping an assertion. */
return 0;
}
1.1 vorbis-tools/share/iconvert.c
Index: iconvert.c
===================================================================
/*
* Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_ICONV
#include <assert.h>
#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
#include <string.h>
/*
 * Return nonzero iff `name' is exactly "UTF-8", ignoring ASCII case.
 * strcasecmp() is avoided because it is locale-dependent. The && chain
 * stops at the first mismatch, so a name shorter than "UTF-8" is never
 * read past its terminating NUL.
 */
static int iconvert_is_utf8(const char *name)
{
  return (name[0] == 'U' || name[0] == 'u') &&
         (name[1] == 'T' || name[1] == 't') &&
         (name[2] == 'F' || name[2] == 'f') &&
         name[3] == '-' &&
         name[4] == '8' &&
         name[5] == '\0';
}

/*
 * Step *p past exactly one UTF-8 character: the byte it points at plus
 * any continuation bytes (10xxxxxx) that follow. *left must be nonzero
 * on entry and is decremented in step.
 */
static void iconvert_skip_utf8_char(char **p, size_t *left)
{
  ++*p, --*left;
  while (*left && ((unsigned char)**p & 0xc0) == 0x80)
    ++*p, --*left;
}

/*
 * Feed a single '?' through `cd' into the output buffer, standing in for
 * a character the target encoding cannot represent.
 */
static void iconvert_put_replacement(iconv_t cd, char **ob, size_t *obl)
{
  char qmark[] = "?";
  char *tb = qmark;
  size_t tbl = 1;
  size_t k;

  k = iconv(cd, &tb, &tbl, ob, obl);
  assert((!k && !tbl) ||
         (k == (size_t)(-1) && errno == EILSEQ && tbl));
  (void)k;
}

/*
 * Convert data from one encoding to another. Return:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * fromcode, tocode : iconv names of the source and target encodings
 * from, fromlen    : input bytes and their count (need not be terminated)
 * to               : if non-null, receives a malloc()ed, NUL-terminated
 *                    result which the caller must free(); untouched when
 *                    the return value is negative
 * tolen            : if non-null, receives the result length in bytes,
 *                    not counting the terminating NUL
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 */
int iconvert(const char *fromcode, const char *tocode,
             const char *from, size_t fromlen,
             char **to, size_t *tolen)
{
  int ret = 0;
  iconv_t cd1, cd2;
  char *ib;
  char *ob;
  /* utfbuf starts out NULL so that every "goto fail" can free() it. */
  char *utfbuf = NULL, *outbuf, *newbuf;
  size_t utflen, outlen, ibl, obl, k;
  char tbuf[2048];

  cd1 = iconv_open("UTF-8", fromcode);
  if (cd1 == (iconv_t)(-1))
    return -1;

  cd2 = (iconv_t)(-1);
  if (!iconvert_is_utf8(tocode)) {
    char *tocode1;

    /*
     * Try using this non-standard feature of glibc and libiconv.
     * This is deliberately not a config option as people often
     * change their iconv library without rebuilding applications.
     */
    tocode1 = (char *)malloc(strlen(tocode) + 11);
    if (!tocode1)
      goto fail;
    strcpy(tocode1, tocode);
    strcat(tocode1, "//TRANSLIT");
    cd2 = iconv_open(tocode1, "UTF-8");
    free(tocode1);

    /* The second stage always reads UTF-8, so the fallback without
     * transliteration must be opened from UTF-8 as well. */
    if (cd2 == (iconv_t)(-1))
      cd2 = iconv_open(tocode, "UTF-8");
    if (cd2 == (iconv_t)(-1)) {
      iconv_close(cd1);
      return -1;
    }
  }

  utflen = 1; /*fromlen * 2 + 1; XXX */
  utfbuf = (char *)malloc(utflen);
  if (!utfbuf)
    goto fail;

  /* Convert to UTF-8 */
  ib = (char *)from; /* iconv() wants char **, but only reads the input */
  ibl = fromlen;
  ob = utfbuf;
  obl = utflen;
  for (;;) {
    k = iconv(cd1, &ib, &ibl, &ob, &obl);
    assert((!k && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
           (k == (size_t)(-1) &&
            (errno == EILSEQ || errno == EINVAL) && ibl));
    if (!ibl)
      break;
    if (obl < 6) {
      /* Enlarge the buffer */
      utflen *= 2;
      newbuf = (char *)realloc(utfbuf, utflen);
      if (!newbuf)
        goto fail;
      ob = (ob - utfbuf) + newbuf;
      obl = utflen - (ob - utfbuf);
      utfbuf = newbuf;
    }
    else {
      /* Invalid input: drop one byte, emit '#', reset the state */
      ib++, ibl--;
      *ob++ = '#', obl--;
      ret = 2;
      iconv(cd1, 0, 0, 0, 0);
    }
  }

  if (cd2 == (iconv_t)(-1)) {
    /* The target encoding was UTF-8 */
    if (tolen)
      *tolen = ob - utfbuf;
    if (!to) {
      free(utfbuf);
      iconv_close(cd1);
      return ret;
    }
    newbuf = (char *)realloc(utfbuf, (ob - utfbuf) + 1);
    if (!newbuf)
      goto fail;
    ob = (ob - utfbuf) + newbuf;
    *ob = '\0';
    *to = newbuf;
    iconv_close(cd1);
    return ret;
  }

  /*
   * Truncate the buffer to be tidy. Skipped for empty output, because
   * realloc(p, 0) may free p and return NULL, which would be mistaken
   * for an allocation failure and lead to a double free.
   */
  utflen = ob - utfbuf;
  if (utflen) {
    newbuf = (char *)realloc(utfbuf, utflen);
    if (!newbuf)
      goto fail;
    utfbuf = newbuf;
  }

  /* Convert from UTF-8 to discover how long the output is */
  outlen = 0;
  ib = utfbuf;
  ibl = utflen;
  while (ibl) {
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      outlen += ob - tbuf;
      ob = tbuf;
      obl = sizeof(tbuf);
      iconvert_put_replacement(cd2, &ob, &obl);
      iconvert_skip_utf8_char(&ib, &ibl);
    }
    outlen += ob - tbuf;
  }
  /* Flush any shift sequence and return cd2 to its initial state */
  ob = tbuf;
  obl = sizeof(tbuf);
  k = iconv(cd2, 0, 0, &ob, &obl);
  assert(!k);
  outlen += ob - tbuf;

  /* Convert from UTF-8 for real */
  outbuf = (char *)malloc(outlen + 1);
  if (!outbuf)
    goto fail;
  ib = utfbuf;
  ibl = utflen;
  ob = outbuf;
  obl = outlen;
  while (ibl) {
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    /* Nonzero k: irreversible conversions, or an unconvertible char */
    if (k && !ret)
      ret = 1;
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      iconvert_put_replacement(cd2, &ob, &obl);
      iconvert_skip_utf8_char(&ib, &ibl);
    }
  }
  k = iconv(cd2, 0, 0, &ob, &obl);
  assert(!k);
  assert(!obl);
  *ob = '\0';

  free(utfbuf);
  iconv_close(cd1);
  iconv_close(cd2);
  if (tolen)
    *tolen = outlen;
  if (!to) {
    free(outbuf);
    return ret;
  }
  *to = outbuf;
  return ret;

 fail:
  free(utfbuf);
  iconv_close(cd1);
  if (cd2 != (iconv_t)(-1))
    iconv_close(cd2);
  return -2;
}
#endif /* HAVE_ICONV */
1.1 vorbis-tools/share/makemap.c
Index: makemap.c
===================================================================
/*
* Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
/*
 * Print a 256-entry table, formatted as C initialisers, that maps each
 * byte of the 8-bit encoding named by argv[1] to its UCS code point.
 * Bytes the encoding does not define are written as 0xffff.
 *
 * Returns 0 on success and 1 on a usage error, an unknown encoding, a
 * code point that does not fit below 0xffff, or unexpected behaviour
 * from iconv().
 */
int main(int argc, char *argv[])
{
  iconv_t cd;
  char *ib;
  char *ob;
  size_t ibl, obl, k;
  unsigned char c, buf[4];
  int i, wc;

  if (argc != 2) {
    printf("Usage: %s ENCODING\n", argv[0]);
    printf("Output a charset map for the 8-bit ENCODING.\n");
    return 1;
  }

  cd = iconv_open("UCS-4", argv[1]);
  if (cd == (iconv_t)(-1)) {
    perror("iconv_open");
    return 1;
  }

  for (i = 0; i < 256; i++) {
    /* iconv() takes char ** for both buffers; the bytes themselves are
     * kept as unsigned char so that they can be combined safely below. */
    c = (unsigned char)i;
    ib = (char *)&c;
    ibl = 1;
    ob = (char *)buf;
    obl = 4;
    k = iconv(cd, &ib, &ibl, &ob, &obl);
    if (!k && !ibl && !obl) {
      /* Assemble the four output bytes, most significant first, in an
       * unsigned type: shifting an int left by 24 overflows when the
       * top bit of buf[0] is set. */
      unsigned long ucs = ((unsigned long)buf[0] << 24) |
                          ((unsigned long)buf[1] << 16) |
                          ((unsigned long)buf[2] << 8) |
                          (unsigned long)buf[3];

      /* 0xffff is reserved below as the "no mapping" marker. */
      if (ucs >= 0xffff) {
        printf("Dodgy value.\n");
        iconv_close(cd);
        return 1;
      }
      wc = (int)ucs;
    }
    else if (k == (size_t)(-1) && errno == EILSEQ) {
      wc = 0xffff;
      /* Return the descriptor to its initial state after the error. */
      iconv(cd, NULL, NULL, NULL, NULL);
    }
    else {
      printf("Non-standard iconv.\n");
      iconv_close(cd);
      return 1;
    }

    /* Eight entries per line, comma-separated, none after the last. */
    if (i % 8 == 0)
      printf(" ");
    printf("0x%04x", (unsigned)wc);
    if (i == 255)
      printf("\n");
    else if (i % 8 == 7)
      printf(",\n");
    else
      printf(", ");
  }

  iconv_close(cd);
  return 0;
}
1.14 +14 -19 vorbis-tools/vorbiscomment/vcomment.c
Index: vcomment.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/vorbiscomment/vcomment.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- vcomment.c 2001/09/25 08:59:55 1.13
+++ vcomment.c 2001/10/02 03:03:44 1.14
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
+#include <locale.h>
#include "getopt.h"
#include "utf8.h"
@@ -24,7 +25,6 @@
{"help",0,0,'h'},
{"quiet",0,0,'q'},
{"commentfile",1,0,'c'},
- {"encoding", 1,0,'e'},
{NULL,0,0,0}
};
@@ -37,7 +37,6 @@
int commentcount;
char **comments;
int tempoutfile;
- char *encoding;
} param_t;
#define MODE_NONE 0
@@ -47,8 +46,8 @@
/* prototypes */
void usage(void);
-void print_comments(FILE *out, vorbis_comment *vc, char *encoding);
-int add_comment(char *line, vorbis_comment *vc, char *encoding);
+void print_comments(FILE *out, vorbis_comment *vc);
+int add_comment(char *line, vorbis_comment *vc);
param_t *new_param(void);
void parse_options(int argc, char *argv[], param_t *param);
@@ -98,7 +97,7 @@
/* extract and display the comments */
vc = vcedit_comments(state);
- print_comments(param->com, vc, param->encoding);
+ print_comments(param->com, vc);
/* done */
vcedit_clear(state);
@@ -128,7 +127,7 @@
for(i=0; i < param->commentcount; i++)
{
- if(add_comment(param->comments[i], vc, param->encoding) < 0)
+ if(add_comment(param->comments[i], vc) < 0)
fprintf(stderr, "Bad comment: \"%s\"\n", param->comments[i]);
}
@@ -139,7 +138,7 @@
char *buf = (char *)malloc(sizeof(char)*1024);
while (fgets(buf, 1024, param->com))
- if (add_comment(buf, vc, param->encoding) < 0) {
+ if (add_comment(buf, vc) < 0) {
fprintf(stderr,
"bad comment: \"%s\"\n",
buf);
@@ -177,14 +176,14 @@
***********/
-void print_comments(FILE *out, vorbis_comment *vc, char *encoding)
+void print_comments(FILE *out, vorbis_comment *vc)
{
int i;
char *decoded_value;
for (i = 0; i < vc->comments; i++)
{
- if (utf8_decode(vc->user_comments[i], &decoded_value, encoding) == 0)
+ if (utf8_decode(vc->user_comments[i], &decoded_value) >= 0)
{
fprintf(out, "%s\n", decoded_value);
free(decoded_value);
@@ -197,7 +196,7 @@
/**********
Take a line of the form "TAG=value string", parse it, convert the
- value to UTF-8 from the specified encoding, and add it to the
+ value to UTF-8, and add it to the
vorbis_comment structure. Error checking is performed.
Note that this assumes a null-terminated string, which may cause
@@ -205,7 +204,7 @@
***********/
-int add_comment(char *line, vorbis_comment *vc, char *encoding)
+int add_comment(char *line, vorbis_comment *vc)
{
char *mark, *value, *utf8_value;
@@ -234,7 +233,7 @@
value++;
/* convert the value from the native charset to UTF-8 */
- if (utf8_encode(value, &utf8_value, encoding) == 0) {
+ if (utf8_encode(value, &utf8_value) >= 0) {
/* append the comment and return */
vorbis_comment_add_tag(vc, line, utf8_value);
@@ -307,9 +306,6 @@
param->comments=NULL;
param->tempoutfile=0;
- /* character encoding */
- param->encoding = "ISO-8859-1";
-
return param;
}
@@ -326,8 +322,10 @@
{
int ret;
int option_index = 1;
+
+ setlocale(LC_ALL, "");
- while ((ret = getopt_long(argc, argv, "ae:lwhqc:t:",
+ while ((ret = getopt_long(argc, argv, "alwhqc:t:",
long_options, &option_index)) != -1) {
switch (ret) {
case 0:
@@ -342,9 +340,6 @@
break;
case 'a':
param->mode = MODE_APPEND;
- break;
- case 'e':
- param->encoding = strdup(optarg);
break;
case 'h':
usage();
--- >8 ----
List archives: http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body. No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.
More information about the commits
mailing list