[xiph-cvs] cvs commit: vorbis-tools/oggenc 8859-1.map 8859-2.map make_code_map.pl utf8.c utf8.h Makefile.am encode.h oggenc.c

Michael Smith msmith at xiph.org
Mon Jul 2 02:39:11 PDT 2001



msmith      01/07/02 02:39:11

  Modified:    oggenc   Makefile.am encode.h oggenc.c
  Added:       oggenc   8859-1.map 8859-2.map make_code_map.pl utf8.c
                        utf8.h
  Log:
  Implementation (partial, needs win32 support still) of charset conversion
  to UTF8 for oggenc.
  
  Also some documentation updates.

Revision  Changes    Path
1.9       +3 -2      vorbis-tools/oggenc/Makefile.am

Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/Makefile.am,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- Makefile.am	2001/01/25 06:54:52	1.8
+++ Makefile.am	2001/07/02 09:39:10	1.9
@@ -10,10 +10,11 @@
 
 oggenc_LDADD = @VORBISENC_LIBS@ @VORBIS_LIBS@ @OGG_LIBS@
 oggenc_SOURCES = oggenc.c audio.c encode.c getopt.c getopt1.c\
-		platform.c\
+		platform.c utf8.c utf8.h\
                 audio.h encode.h platform.h getopt.h
 
-EXTRA_DIST = oggenc.dsp build_oggenc.bat
+MAP_FILES = 8859-1.map 8859-2.map
+EXTRA_DIST = oggenc.dsp build_oggenc.bat $(MAP_FILES) make_code_map.pl charsetmap.h
 
 debug:
         $(MAKE) all CFLAGS="@DEBUG@"

1.8       +2 -0      vorbis-tools/oggenc/encode.h

Index: encode.h
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/encode.h,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- encode.h	2001/05/27 09:49:35	1.7
+++ encode.h	2001/07/02 09:39:10	1.8
@@ -29,6 +29,8 @@
 
 typedef struct
 {
+	char *encoding;
+
         char **title;
         int title_count;
         char **artist;

1.16      +35 -13    vorbis-tools/oggenc/oggenc.c

Index: oggenc.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/oggenc.c,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -r1.15 -r1.16
--- oggenc.c	2001/06/18 00:32:47	1.15
+++ oggenc.c	2001/07/02 09:39:10	1.16
@@ -19,6 +19,7 @@
 #include "platform.h"
 #include "encode.h"
 #include "audio.h"
+#include "utf8.h"
 
 #define VERSION_STRING "OggEnc v0.7 (libvorbis rc1)\n"
 #define COPYRIGHT "(c) 2000 Michael Smith <msmith at labyrinth.net.au)\n"
@@ -42,6 +43,7 @@
         {"date",1,0,'d'},
         {"tracknum",1,0,'N'},
         {"serial",1,0,'s'},
+	{"encoding",1,0,'e'},
         {NULL,0,0,0}
 };
         
@@ -53,8 +55,8 @@
 
 int main(int argc, char **argv)
 {
-	oe_options opt = {NULL, 0, NULL, 0, NULL, 0, NULL, 0, NULL, 0, NULL, 0, 
-		0, 0,16,44100,2, NULL,NULL,128,0}; /* Default values */
+	oe_options opt = {"ISO-8859-1", NULL, 0, NULL, 0, NULL, 0, NULL, 0, NULL, 
+		0, NULL, 0, 0, 0,16,44100,2, NULL,NULL,128,0}; /* Default values */
         int i;
 
         char **infiles;
@@ -281,6 +283,7 @@
                 " -s, --serial         Specify a serial number for the stream. If encoding\n"
                 "                      multiple files, this will be incremented for each\n"
                 "                      stream after the first.\n"
+		" -e, --encoding       Specify an encoding for the comments given.\n"
                 "\n"
                 " Naming:\n"
                 " -o, --output=fn      Write file to fn (only valid in single-file mode)\n"
@@ -306,12 +309,13 @@
                 "                      once, for example, and have it used for all the files)\n"
                 "\n"
                 "INPUT FILES:\n"
-		" OggEnc input files must currently be 16 bit PCM WAV, AIFF, or AIFF/C files.\n"
-		" Files may be mono or stereo (or more channels) and sampling rates \n"
+		" OggEnc input files must currently be 16 or 8 bit PCM WAV, AIFF, or AIFF/C\n"
+		" files. Files may be mono or stereo (or more channels) and sampling rates \n"
                 " between 8kHz and 56kHz.\n"
-		" You can specify taking the file from stdin by using - as the input filename.\n"
                 " Alternatively, the --raw option may be used to use a raw PCM data file, which\n"
-		" must be 16bit stereo little-endian PCM ('headerless wav').\n"
+		" must be 16bit stereo little-endian PCM ('headerless wav'), unless additional\n"
+		" parameters for raw mode are specified.\n"
+		" You can specify taking the file from stdin by using - as the input filename.\n"
                 " In this mode, output is to stdout unless an outfile filename is specified\n"
                 " with -o\n"
                 "\n"
@@ -386,7 +390,7 @@
         int ret;
         int option_index = 1;
 
-	while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:hl:n:N:o:qrR:s:t:v", 
+	while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:e:hl:n:N:o:qrR:s:t:v", 
                                         long_options, &option_index)) != -1)
         {
                 switch(ret)
@@ -407,6 +411,9 @@
                                 opt->dates = realloc(opt->dates, (++opt->date_count)*sizeof(char *));
                                 opt->dates[opt->date_count - 1] = strdup(optarg);
                                 break;
+			case 'e':
+				opt->encoding = strdup(optarg);
+				break;
                         case 'l':
                                 opt->album = realloc(opt->album, (++opt->album_count)*sizeof(char *));
                                 opt->album[opt->album_count - 1] = strdup(optarg);
@@ -508,6 +515,21 @@
         }
 }
 
+void add_tag(vorbis_comment *vc, oe_options *opt, char *name, char *value)
+{
+	char *utf8;
+	if(utf8_encode(value, &utf8, opt->encoding) == 0)
+	{
+		if(name == NULL)
+			vorbis_comment_add(vc, utf8);
+		else
+			vorbis_comment_add_tag(vc, name, utf8);
+		free(utf8);
+	}
+	else
+		fprintf(stderr, "Couldn't convert comment to UTF8, cannot add\n");
+}
+
 void build_comments(vorbis_comment *vc, oe_options *opt, int filenum, 
                 char **artist, char **album, char **title, char **tracknum, char **date)
 {
@@ -516,7 +538,7 @@
         vorbis_comment_init(vc);
 
         for(i = 0; i < opt->comment_count; i++)
-		vorbis_comment_add(vc, opt->comments[i]);
+		add_tag(vc, opt, NULL, opt->comments[i]);
 
         if(opt->title_count)
         {
@@ -530,7 +552,7 @@
                         i = filenum;
 
                 *title = opt->title[i];
-		vorbis_comment_add_tag(vc, "title", opt->title[i]);
+		add_tag(vc, opt, "title", opt->title[i]);
         }
 
         if(opt->artist_count)
@@ -541,7 +563,7 @@
                         i = filenum;
         
                 *artist = opt->artist[i];
-		vorbis_comment_add_tag(vc, "artist", opt->artist[i]);
+		add_tag(vc, opt, "artist", opt->artist[i]);
         }
 
         if(opt->date_count)
@@ -552,7 +574,7 @@
                         i = filenum;
         
                 *date = opt->dates[i];
-		vorbis_comment_add_tag(vc, "date", opt->dates[i]);
+		add_tag(vc, opt, "date", opt->dates[i]);
         }
         
         if(opt->album_count)
@@ -565,14 +587,14 @@
                         i = filenum;
 
                 *album = opt->album[i];	
-		vorbis_comment_add_tag(vc, "album", opt->album[i]);
+		add_tag(vc, opt, "album", opt->album[i]);
         }
 
         if(filenum < opt->track_count)
         {
                 i = filenum;
                 *tracknum = opt->tracknum[i];
-		vorbis_comment_add_tag(vc, "tracknumber", opt->tracknum[i]);
+		add_tag(vc, opt, "tracknumber", opt->tracknum[i]);
         }
 }
 

1.1                  vorbis-tools/oggenc/8859-1.map

Index: 8859-1.map
===================================================================
#
#	Name:             ISO/IEC 8859-1:1998 to Unicode
#	Unicode version:  3.0
#	Table version:    1.0
#	Table format:     Format A
#	Date:             1999 July 27
#	Authors:          Ken Whistler <kenw at sybase.com>
#
#	Copyright (c) 1991-1999 Unicode, Inc.  All Rights reserved.
#
#	This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
#	No claims are made as to fitness for any particular purpose.  No
#	warranties of any kind are expressed or implied.  The recipient
#	agrees to determine applicability of information provided.  If this
#	file has been provided on optical media by Unicode, Inc., the sole
#	remedy for any claim will be exchange of defective media within 90
#	days of receipt.
#
#	Unicode, Inc. hereby grants the right to freely use the information
#	supplied in this file in the creation of products supporting the
#	Unicode Standard, and to make copies of this file in any form for
#	internal or external distribution as long as this notice remains
#	attached.
#
#	General notes:
#
#	This table contains the data the Unicode Consortium has on how
#       ISO/IEC 8859-1:1998 characters map into Unicode.
#
#	Format:  Three tab-separated columns
#		 Column #1 is the ISO/IEC 8859-1 code (in hex as 0xXX)
#		 Column #2 is the Unicode (in hex as 0xXXXX)
#		 Column #3 the Unicode name (follows a comment sign, '#')
#
#	The entries are in ISO/IEC 8859-1 order.
#
#	Version history
#	1.0 version updates 0.1 version by adding mappings for all
#	control characters.
#
#	Updated versions of this file may be found in:
#		<ftp://ftp.unicode.org/Public/MAPPINGS/>
#
#	Any comments or problems, contact <errata at unicode.org>
#	Please note that <errata at unicode.org> is an archival address;
#	notices will be checked, but do not expect an immediate response.
#
0x00	0x0000	#	NULL
0x01	0x0001	#	START OF HEADING
0x02	0x0002	#	START OF TEXT
0x03	0x0003	#	END OF TEXT
0x04	0x0004	#	END OF TRANSMISSION
0x05	0x0005	#	ENQUIRY
0x06	0x0006	#	ACKNOWLEDGE
0x07	0x0007	#	BELL
0x08	0x0008	#	BACKSPACE
0x09	0x0009	#	HORIZONTAL TABULATION
0x0A	0x000A	#	LINE FEED
0x0B	0x000B	#	VERTICAL TABULATION
0x0C	0x000C	#	FORM FEED
0x0D	0x000D	#	CARRIAGE RETURN
0x0E	0x000E	#	SHIFT OUT
0x0F	0x000F	#	SHIFT IN
0x10	0x0010	#	DATA LINK ESCAPE
0x11	0x0011	#	DEVICE CONTROL ONE
0x12	0x0012	#	DEVICE CONTROL TWO
0x13	0x0013	#	DEVICE CONTROL THREE
0x14	0x0014	#	DEVICE CONTROL FOUR
0x15	0x0015	#	NEGATIVE ACKNOWLEDGE
0x16	0x0016	#	SYNCHRONOUS IDLE
0x17	0x0017	#	END OF TRANSMISSION BLOCK
0x18	0x0018	#	CANCEL
0x19	0x0019	#	END OF MEDIUM
0x1A	0x001A	#	SUBSTITUTE
0x1B	0x001B	#	ESCAPE
0x1C	0x001C	#	FILE SEPARATOR
0x1D	0x001D	#	GROUP SEPARATOR
0x1E	0x001E	#	RECORD SEPARATOR
0x1F	0x001F	#	UNIT SEPARATOR
0x20	0x0020	#	SPACE
0x21	0x0021	#	EXCLAMATION MARK
0x22	0x0022	#	QUOTATION MARK
0x23	0x0023	#	NUMBER SIGN
0x24	0x0024	#	DOLLAR SIGN
0x25	0x0025	#	PERCENT SIGN
0x26	0x0026	#	AMPERSAND
0x27	0x0027	#	APOSTROPHE
0x28	0x0028	#	LEFT PARENTHESIS
0x29	0x0029	#	RIGHT PARENTHESIS
0x2A	0x002A	#	ASTERISK
0x2B	0x002B	#	PLUS SIGN
0x2C	0x002C	#	COMMA
0x2D	0x002D	#	HYPHEN-MINUS
0x2E	0x002E	#	FULL STOP
0x2F	0x002F	#	SOLIDUS
0x30	0x0030	#	DIGIT ZERO
0x31	0x0031	#	DIGIT ONE
0x32	0x0032	#	DIGIT TWO
0x33	0x0033	#	DIGIT THREE
0x34	0x0034	#	DIGIT FOUR
0x35	0x0035	#	DIGIT FIVE
0x36	0x0036	#	DIGIT SIX
0x37	0x0037	#	DIGIT SEVEN
0x38	0x0038	#	DIGIT EIGHT
0x39	0x0039	#	DIGIT NINE
0x3A	0x003A	#	COLON
0x3B	0x003B	#	SEMICOLON
0x3C	0x003C	#	LESS-THAN SIGN
0x3D	0x003D	#	EQUALS SIGN
0x3E	0x003E	#	GREATER-THAN SIGN
0x3F	0x003F	#	QUESTION MARK
0x40	0x0040	#	COMMERCIAL AT
0x41	0x0041	#	LATIN CAPITAL LETTER A
0x42	0x0042	#	LATIN CAPITAL LETTER B
0x43	0x0043	#	LATIN CAPITAL LETTER C
0x44	0x0044	#	LATIN CAPITAL LETTER D
0x45	0x0045	#	LATIN CAPITAL LETTER E
0x46	0x0046	#	LATIN CAPITAL LETTER F
0x47	0x0047	#	LATIN CAPITAL LETTER G
0x48	0x0048	#	LATIN CAPITAL LETTER H
0x49	0x0049	#	LATIN CAPITAL LETTER I
0x4A	0x004A	#	LATIN CAPITAL LETTER J
0x4B	0x004B	#	LATIN CAPITAL LETTER K
0x4C	0x004C	#	LATIN CAPITAL LETTER L
0x4D	0x004D	#	LATIN CAPITAL LETTER M
0x4E	0x004E	#	LATIN CAPITAL LETTER N
0x4F	0x004F	#	LATIN CAPITAL LETTER O
0x50	0x0050	#	LATIN CAPITAL LETTER P
0x51	0x0051	#	LATIN CAPITAL LETTER Q
0x52	0x0052	#	LATIN CAPITAL LETTER R
0x53	0x0053	#	LATIN CAPITAL LETTER S
0x54	0x0054	#	LATIN CAPITAL LETTER T
0x55	0x0055	#	LATIN CAPITAL LETTER U
0x56	0x0056	#	LATIN CAPITAL LETTER V
0x57	0x0057	#	LATIN CAPITAL LETTER W
0x58	0x0058	#	LATIN CAPITAL LETTER X
0x59	0x0059	#	LATIN CAPITAL LETTER Y
0x5A	0x005A	#	LATIN CAPITAL LETTER Z
0x5B	0x005B	#	LEFT SQUARE BRACKET
0x5C	0x005C	#	REVERSE SOLIDUS
0x5D	0x005D	#	RIGHT SQUARE BRACKET
0x5E	0x005E	#	CIRCUMFLEX ACCENT
0x5F	0x005F	#	LOW LINE
0x60	0x0060	#	GRAVE ACCENT
0x61	0x0061	#	LATIN SMALL LETTER A
0x62	0x0062	#	LATIN SMALL LETTER B
0x63	0x0063	#	LATIN SMALL LETTER C
0x64	0x0064	#	LATIN SMALL LETTER D
0x65	0x0065	#	LATIN SMALL LETTER E
0x66	0x0066	#	LATIN SMALL LETTER F
0x67	0x0067	#	LATIN SMALL LETTER G
0x68	0x0068	#	LATIN SMALL LETTER H
0x69	0x0069	#	LATIN SMALL LETTER I
0x6A	0x006A	#	LATIN SMALL LETTER J
0x6B	0x006B	#	LATIN SMALL LETTER K
0x6C	0x006C	#	LATIN SMALL LETTER L
0x6D	0x006D	#	LATIN SMALL LETTER M
0x6E	0x006E	#	LATIN SMALL LETTER N
0x6F	0x006F	#	LATIN SMALL LETTER O
0x70	0x0070	#	LATIN SMALL LETTER P
0x71	0x0071	#	LATIN SMALL LETTER Q
0x72	0x0072	#	LATIN SMALL LETTER R
0x73	0x0073	#	LATIN SMALL LETTER S
0x74	0x0074	#	LATIN SMALL LETTER T
0x75	0x0075	#	LATIN SMALL LETTER U
0x76	0x0076	#	LATIN SMALL LETTER V
0x77	0x0077	#	LATIN SMALL LETTER W
0x78	0x0078	#	LATIN SMALL LETTER X
0x79	0x0079	#	LATIN SMALL LETTER Y
0x7A	0x007A	#	LATIN SMALL LETTER Z
0x7B	0x007B	#	LEFT CURLY BRACKET
0x7C	0x007C	#	VERTICAL LINE
0x7D	0x007D	#	RIGHT CURLY BRACKET
0x7E	0x007E	#	TILDE
0x7F	0x007F	#	DELETE
0x80	0x0080	#	<control>
0x81	0x0081	#	<control>
0x82	0x0082	#	<control>
0x83	0x0083	#	<control>
0x84	0x0084	#	<control>
0x85	0x0085	#	<control>
0x86	0x0086	#	<control>
0x87	0x0087	#	<control>
0x88	0x0088	#	<control>
0x89	0x0089	#	<control>
0x8A	0x008A	#	<control>
0x8B	0x008B	#	<control>
0x8C	0x008C	#	<control>
0x8D	0x008D	#	<control>
0x8E	0x008E	#	<control>
0x8F	0x008F	#	<control>
0x90	0x0090	#	<control>
0x91	0x0091	#	<control>
0x92	0x0092	#	<control>
0x93	0x0093	#	<control>
0x94	0x0094	#	<control>
0x95	0x0095	#	<control>
0x96	0x0096	#	<control>
0x97	0x0097	#	<control>
0x98	0x0098	#	<control>
0x99	0x0099	#	<control>
0x9A	0x009A	#	<control>
0x9B	0x009B	#	<control>
0x9C	0x009C	#	<control>
0x9D	0x009D	#	<control>
0x9E	0x009E	#	<control>
0x9F	0x009F	#	<control>
0xA0	0x00A0	#	NO-BREAK SPACE
0xA1	0x00A1	#	INVERTED EXCLAMATION MARK
0xA2	0x00A2	#	CENT SIGN
0xA3	0x00A3	#	POUND SIGN
0xA4	0x00A4	#	CURRENCY SIGN
0xA5	0x00A5	#	YEN SIGN
0xA6	0x00A6	#	BROKEN BAR
0xA7	0x00A7	#	SECTION SIGN
0xA8	0x00A8	#	DIAERESIS
0xA9	0x00A9	#	COPYRIGHT SIGN
0xAA	0x00AA	#	FEMININE ORDINAL INDICATOR
0xAB	0x00AB	#	LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
0xAC	0x00AC	#	NOT SIGN
0xAD	0x00AD	#	SOFT HYPHEN
0xAE	0x00AE	#	REGISTERED SIGN
0xAF	0x00AF	#	MACRON
0xB0	0x00B0	#	DEGREE SIGN
0xB1	0x00B1	#	PLUS-MINUS SIGN
0xB2	0x00B2	#	SUPERSCRIPT TWO
0xB3	0x00B3	#	SUPERSCRIPT THREE
0xB4	0x00B4	#	ACUTE ACCENT
0xB5	0x00B5	#	MICRO SIGN
0xB6	0x00B6	#	PILCROW SIGN
0xB7	0x00B7	#	MIDDLE DOT
0xB8	0x00B8	#	CEDILLA
0xB9	0x00B9	#	SUPERSCRIPT ONE
0xBA	0x00BA	#	MASCULINE ORDINAL INDICATOR
0xBB	0x00BB	#	RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
0xBC	0x00BC	#	VULGAR FRACTION ONE QUARTER
0xBD	0x00BD	#	VULGAR FRACTION ONE HALF
0xBE	0x00BE	#	VULGAR FRACTION THREE QUARTERS
0xBF	0x00BF	#	INVERTED QUESTION MARK
0xC0	0x00C0	#	LATIN CAPITAL LETTER A WITH GRAVE
0xC1	0x00C1	#	LATIN CAPITAL LETTER A WITH ACUTE
0xC2	0x00C2	#	LATIN CAPITAL LETTER A WITH CIRCUMFLEX
0xC3	0x00C3	#	LATIN CAPITAL LETTER A WITH TILDE
0xC4	0x00C4	#	LATIN CAPITAL LETTER A WITH DIAERESIS
0xC5	0x00C5	#	LATIN CAPITAL LETTER A WITH RING ABOVE
0xC6	0x00C6	#	LATIN CAPITAL LETTER AE
0xC7	0x00C7	#	LATIN CAPITAL LETTER C WITH CEDILLA
0xC8	0x00C8	#	LATIN CAPITAL LETTER E WITH GRAVE
0xC9	0x00C9	#	LATIN CAPITAL LETTER E WITH ACUTE
0xCA	0x00CA	#	LATIN CAPITAL LETTER E WITH CIRCUMFLEX
0xCB	0x00CB	#	LATIN CAPITAL LETTER E WITH DIAERESIS
0xCC	0x00CC	#	LATIN CAPITAL LETTER I WITH GRAVE
0xCD	0x00CD	#	LATIN CAPITAL LETTER I WITH ACUTE
0xCE	0x00CE	#	LATIN CAPITAL LETTER I WITH CIRCUMFLEX
0xCF	0x00CF	#	LATIN CAPITAL LETTER I WITH DIAERESIS
0xD0	0x00D0	#	LATIN CAPITAL LETTER ETH (Icelandic)
0xD1	0x00D1	#	LATIN CAPITAL LETTER N WITH TILDE
0xD2	0x00D2	#	LATIN CAPITAL LETTER O WITH GRAVE
0xD3	0x00D3	#	LATIN CAPITAL LETTER O WITH ACUTE
0xD4	0x00D4	#	LATIN CAPITAL LETTER O WITH CIRCUMFLEX
0xD5	0x00D5	#	LATIN CAPITAL LETTER O WITH TILDE
0xD6	0x00D6	#	LATIN CAPITAL LETTER O WITH DIAERESIS
0xD7	0x00D7	#	MULTIPLICATION SIGN
0xD8	0x00D8	#	LATIN CAPITAL LETTER O WITH STROKE
0xD9	0x00D9	#	LATIN CAPITAL LETTER U WITH GRAVE
0xDA	0x00DA	#	LATIN CAPITAL LETTER U WITH ACUTE
0xDB	0x00DB	#	LATIN CAPITAL LETTER U WITH CIRCUMFLEX
0xDC	0x00DC	#	LATIN CAPITAL LETTER U WITH DIAERESIS
0xDD	0x00DD	#	LATIN CAPITAL LETTER Y WITH ACUTE
0xDE	0x00DE	#	LATIN CAPITAL LETTER THORN (Icelandic)
0xDF	0x00DF	#	LATIN SMALL LETTER SHARP S (German)
0xE0	0x00E0	#	LATIN SMALL LETTER A WITH GRAVE
0xE1	0x00E1	#	LATIN SMALL LETTER A WITH ACUTE
0xE2	0x00E2	#	LATIN SMALL LETTER A WITH CIRCUMFLEX
0xE3	0x00E3	#	LATIN SMALL LETTER A WITH TILDE
0xE4	0x00E4	#	LATIN SMALL LETTER A WITH DIAERESIS
0xE5	0x00E5	#	LATIN SMALL LETTER A WITH RING ABOVE
0xE6	0x00E6	#	LATIN SMALL LETTER AE
0xE7	0x00E7	#	LATIN SMALL LETTER C WITH CEDILLA
0xE8	0x00E8	#	LATIN SMALL LETTER E WITH GRAVE
0xE9	0x00E9	#	LATIN SMALL LETTER E WITH ACUTE
0xEA	0x00EA	#	LATIN SMALL LETTER E WITH CIRCUMFLEX
0xEB	0x00EB	#	LATIN SMALL LETTER E WITH DIAERESIS
0xEC	0x00EC	#	LATIN SMALL LETTER I WITH GRAVE
0xED	0x00ED	#	LATIN SMALL LETTER I WITH ACUTE
0xEE	0x00EE	#	LATIN SMALL LETTER I WITH CIRCUMFLEX
0xEF	0x00EF	#	LATIN SMALL LETTER I WITH DIAERESIS
0xF0	0x00F0	#	LATIN SMALL LETTER ETH (Icelandic)
0xF1	0x00F1	#	LATIN SMALL LETTER N WITH TILDE
0xF2	0x00F2	#	LATIN SMALL LETTER O WITH GRAVE
0xF3	0x00F3	#	LATIN SMALL LETTER O WITH ACUTE
0xF4	0x00F4	#	LATIN SMALL LETTER O WITH CIRCUMFLEX
0xF5	0x00F5	#	LATIN SMALL LETTER O WITH TILDE
0xF6	0x00F6	#	LATIN SMALL LETTER O WITH DIAERESIS
0xF7	0x00F7	#	DIVISION SIGN
0xF8	0x00F8	#	LATIN SMALL LETTER O WITH STROKE
0xF9	0x00F9	#	LATIN SMALL LETTER U WITH GRAVE
0xFA	0x00FA	#	LATIN SMALL LETTER U WITH ACUTE
0xFB	0x00FB	#	LATIN SMALL LETTER U WITH CIRCUMFLEX
0xFC	0x00FC	#	LATIN SMALL LETTER U WITH DIAERESIS
0xFD	0x00FD	#	LATIN SMALL LETTER Y WITH ACUTE
0xFE	0x00FE	#	LATIN SMALL LETTER THORN (Icelandic)
0xFF	0x00FF	#	LATIN SMALL LETTER Y WITH DIAERESIS

1.1                  vorbis-tools/oggenc/8859-2.map

Index: 8859-2.map
===================================================================
#
#	Name:             ISO 8859-2:1999 to Unicode
#	Unicode version:  3.0
#	Table version:    1.0
#	Table format:     Format A
#	Date:             1999 July 27
#	Authors:          Ken Whistler <kenw at sybase.com>
#
#	Copyright (c) 1991-1999 Unicode, Inc.  All Rights reserved.
#
#	This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
#	No claims are made as to fitness for any particular purpose.  No
#	warranties of any kind are expressed or implied.  The recipient
#	agrees to determine applicability of information provided.  If this
#	file has been provided on optical media by Unicode, Inc., the sole
#	remedy for any claim will be exchange of defective media within 90
#	days of receipt.
#
#	Unicode, Inc. hereby grants the right to freely use the information
#	supplied in this file in the creation of products supporting the
#	Unicode Standard, and to make copies of this file in any form for
#	internal or external distribution as long as this notice remains
#	attached.
#
#	General notes:
#
#	This table contains the data the Unicode Consortium has on how
#       ISO/IEC 8859-2:1999 characters map into Unicode.
#
#	Format:  Three tab-separated columns
#		 Column #1 is the ISO/IEC 8859-2 code (in hex as 0xXX)
#		 Column #2 is the Unicode (in hex as 0xXXXX)
#		 Column #3 the Unicode name (follows a comment sign, '#')
#
#	The entries are in ISO/IEC 8859-2 order.
#
#	Version history
#	1.0 version updates 0.1 version by adding mappings for all
#	control characters.
#
#	Updated versions of this file may be found in:
#		<ftp://ftp.unicode.org/Public/MAPPINGS/>
#
#	Any comments or problems, contact <errata at unicode.org>
#	Please note that <errata at unicode.org> is an archival address;
#	notices will be checked, but do not expect an immediate response.
#
0x00	0x0000	#	NULL
0x01	0x0001	#	START OF HEADING
0x02	0x0002	#	START OF TEXT
0x03	0x0003	#	END OF TEXT
0x04	0x0004	#	END OF TRANSMISSION
0x05	0x0005	#	ENQUIRY
0x06	0x0006	#	ACKNOWLEDGE
0x07	0x0007	#	BELL
0x08	0x0008	#	BACKSPACE
0x09	0x0009	#	HORIZONTAL TABULATION
0x0A	0x000A	#	LINE FEED
0x0B	0x000B	#	VERTICAL TABULATION
0x0C	0x000C	#	FORM FEED
0x0D	0x000D	#	CARRIAGE RETURN
0x0E	0x000E	#	SHIFT OUT
0x0F	0x000F	#	SHIFT IN
0x10	0x0010	#	DATA LINK ESCAPE
0x11	0x0011	#	DEVICE CONTROL ONE
0x12	0x0012	#	DEVICE CONTROL TWO
0x13	0x0013	#	DEVICE CONTROL THREE
0x14	0x0014	#	DEVICE CONTROL FOUR
0x15	0x0015	#	NEGATIVE ACKNOWLEDGE
0x16	0x0016	#	SYNCHRONOUS IDLE
0x17	0x0017	#	END OF TRANSMISSION BLOCK
0x18	0x0018	#	CANCEL
0x19	0x0019	#	END OF MEDIUM
0x1A	0x001A	#	SUBSTITUTE
0x1B	0x001B	#	ESCAPE
0x1C	0x001C	#	FILE SEPARATOR
0x1D	0x001D	#	GROUP SEPARATOR
0x1E	0x001E	#	RECORD SEPARATOR
0x1F	0x001F	#	UNIT SEPARATOR
0x20	0x0020	#	SPACE
0x21	0x0021	#	EXCLAMATION MARK
0x22	0x0022	#	QUOTATION MARK
0x23	0x0023	#	NUMBER SIGN
0x24	0x0024	#	DOLLAR SIGN
0x25	0x0025	#	PERCENT SIGN
0x26	0x0026	#	AMPERSAND
0x27	0x0027	#	APOSTROPHE
0x28	0x0028	#	LEFT PARENTHESIS
0x29	0x0029	#	RIGHT PARENTHESIS
0x2A	0x002A	#	ASTERISK
0x2B	0x002B	#	PLUS SIGN
0x2C	0x002C	#	COMMA
0x2D	0x002D	#	HYPHEN-MINUS
0x2E	0x002E	#	FULL STOP
0x2F	0x002F	#	SOLIDUS
0x30	0x0030	#	DIGIT ZERO
0x31	0x0031	#	DIGIT ONE
0x32	0x0032	#	DIGIT TWO
0x33	0x0033	#	DIGIT THREE
0x34	0x0034	#	DIGIT FOUR
0x35	0x0035	#	DIGIT FIVE
0x36	0x0036	#	DIGIT SIX
0x37	0x0037	#	DIGIT SEVEN
0x38	0x0038	#	DIGIT EIGHT
0x39	0x0039	#	DIGIT NINE
0x3A	0x003A	#	COLON
0x3B	0x003B	#	SEMICOLON
0x3C	0x003C	#	LESS-THAN SIGN
0x3D	0x003D	#	EQUALS SIGN
0x3E	0x003E	#	GREATER-THAN SIGN
0x3F	0x003F	#	QUESTION MARK
0x40	0x0040	#	COMMERCIAL AT
0x41	0x0041	#	LATIN CAPITAL LETTER A
0x42	0x0042	#	LATIN CAPITAL LETTER B
0x43	0x0043	#	LATIN CAPITAL LETTER C
0x44	0x0044	#	LATIN CAPITAL LETTER D
0x45	0x0045	#	LATIN CAPITAL LETTER E
0x46	0x0046	#	LATIN CAPITAL LETTER F
0x47	0x0047	#	LATIN CAPITAL LETTER G
0x48	0x0048	#	LATIN CAPITAL LETTER H
0x49	0x0049	#	LATIN CAPITAL LETTER I
0x4A	0x004A	#	LATIN CAPITAL LETTER J
0x4B	0x004B	#	LATIN CAPITAL LETTER K
0x4C	0x004C	#	LATIN CAPITAL LETTER L
0x4D	0x004D	#	LATIN CAPITAL LETTER M
0x4E	0x004E	#	LATIN CAPITAL LETTER N
0x4F	0x004F	#	LATIN CAPITAL LETTER O
0x50	0x0050	#	LATIN CAPITAL LETTER P
0x51	0x0051	#	LATIN CAPITAL LETTER Q
0x52	0x0052	#	LATIN CAPITAL LETTER R
0x53	0x0053	#	LATIN CAPITAL LETTER S
0x54	0x0054	#	LATIN CAPITAL LETTER T
0x55	0x0055	#	LATIN CAPITAL LETTER U
0x56	0x0056	#	LATIN CAPITAL LETTER V
0x57	0x0057	#	LATIN CAPITAL LETTER W
0x58	0x0058	#	LATIN CAPITAL LETTER X
0x59	0x0059	#	LATIN CAPITAL LETTER Y
0x5A	0x005A	#	LATIN CAPITAL LETTER Z
0x5B	0x005B	#	LEFT SQUARE BRACKET
0x5C	0x005C	#	REVERSE SOLIDUS
0x5D	0x005D	#	RIGHT SQUARE BRACKET
0x5E	0x005E	#	CIRCUMFLEX ACCENT
0x5F	0x005F	#	LOW LINE
0x60	0x0060	#	GRAVE ACCENT
0x61	0x0061	#	LATIN SMALL LETTER A
0x62	0x0062	#	LATIN SMALL LETTER B
0x63	0x0063	#	LATIN SMALL LETTER C
0x64	0x0064	#	LATIN SMALL LETTER D
0x65	0x0065	#	LATIN SMALL LETTER E
0x66	0x0066	#	LATIN SMALL LETTER F
0x67	0x0067	#	LATIN SMALL LETTER G
0x68	0x0068	#	LATIN SMALL LETTER H
0x69	0x0069	#	LATIN SMALL LETTER I
0x6A	0x006A	#	LATIN SMALL LETTER J
0x6B	0x006B	#	LATIN SMALL LETTER K
0x6C	0x006C	#	LATIN SMALL LETTER L
0x6D	0x006D	#	LATIN SMALL LETTER M
0x6E	0x006E	#	LATIN SMALL LETTER N
0x6F	0x006F	#	LATIN SMALL LETTER O
0x70	0x0070	#	LATIN SMALL LETTER P
0x71	0x0071	#	LATIN SMALL LETTER Q
0x72	0x0072	#	LATIN SMALL LETTER R
0x73	0x0073	#	LATIN SMALL LETTER S
0x74	0x0074	#	LATIN SMALL LETTER T
0x75	0x0075	#	LATIN SMALL LETTER U
0x76	0x0076	#	LATIN SMALL LETTER V
0x77	0x0077	#	LATIN SMALL LETTER W
0x78	0x0078	#	LATIN SMALL LETTER X
0x79	0x0079	#	LATIN SMALL LETTER Y
0x7A	0x007A	#	LATIN SMALL LETTER Z
0x7B	0x007B	#	LEFT CURLY BRACKET
0x7C	0x007C	#	VERTICAL LINE
0x7D	0x007D	#	RIGHT CURLY BRACKET
0x7E	0x007E	#	TILDE
0x7F	0x007F	#	DELETE
0x80	0x0080	#	<control>
0x81	0x0081	#	<control>
0x82	0x0082	#	<control>
0x83	0x0083	#	<control>
0x84	0x0084	#	<control>
0x85	0x0085	#	<control>
0x86	0x0086	#	<control>
0x87	0x0087	#	<control>
0x88	0x0088	#	<control>
0x89	0x0089	#	<control>
0x8A	0x008A	#	<control>
0x8B	0x008B	#	<control>
0x8C	0x008C	#	<control>
0x8D	0x008D	#	<control>
0x8E	0x008E	#	<control>
0x8F	0x008F	#	<control>
0x90	0x0090	#	<control>
0x91	0x0091	#	<control>
0x92	0x0092	#	<control>
0x93	0x0093	#	<control>
0x94	0x0094	#	<control>
0x95	0x0095	#	<control>
0x96	0x0096	#	<control>
0x97	0x0097	#	<control>
0x98	0x0098	#	<control>
0x99	0x0099	#	<control>
0x9A	0x009A	#	<control>
0x9B	0x009B	#	<control>
0x9C	0x009C	#	<control>
0x9D	0x009D	#	<control>
0x9E	0x009E	#	<control>
0x9F	0x009F	#	<control>
0xA0	0x00A0	#	NO-BREAK SPACE
0xA1	0x0104	#	LATIN CAPITAL LETTER A WITH OGONEK
0xA2	0x02D8	#	BREVE
0xA3	0x0141	#	LATIN CAPITAL LETTER L WITH STROKE
0xA4	0x00A4	#	CURRENCY SIGN
0xA5	0x013D	#	LATIN CAPITAL LETTER L WITH CARON
0xA6	0x015A	#	LATIN CAPITAL LETTER S WITH ACUTE
0xA7	0x00A7	#	SECTION SIGN
0xA8	0x00A8	#	DIAERESIS
0xA9	0x0160	#	LATIN CAPITAL LETTER S WITH CARON
0xAA	0x015E	#	LATIN CAPITAL LETTER S WITH CEDILLA
0xAB	0x0164	#	LATIN CAPITAL LETTER T WITH CARON
0xAC	0x0179	#	LATIN CAPITAL LETTER Z WITH ACUTE
0xAD	0x00AD	#	SOFT HYPHEN
0xAE	0x017D	#	LATIN CAPITAL LETTER Z WITH CARON
0xAF	0x017B	#	LATIN CAPITAL LETTER Z WITH DOT ABOVE
0xB0	0x00B0	#	DEGREE SIGN
0xB1	0x0105	#	LATIN SMALL LETTER A WITH OGONEK
0xB2	0x02DB	#	OGONEK
0xB3	0x0142	#	LATIN SMALL LETTER L WITH STROKE
0xB4	0x00B4	#	ACUTE ACCENT
0xB5	0x013E	#	LATIN SMALL LETTER L WITH CARON
0xB6	0x015B	#	LATIN SMALL LETTER S WITH ACUTE
0xB7	0x02C7	#	CARON
0xB8	0x00B8	#	CEDILLA
0xB9	0x0161	#	LATIN SMALL LETTER S WITH CARON
0xBA	0x015F	#	LATIN SMALL LETTER S WITH CEDILLA
0xBB	0x0165	#	LATIN SMALL LETTER T WITH CARON
0xBC	0x017A	#	LATIN SMALL LETTER Z WITH ACUTE
0xBD	0x02DD	#	DOUBLE ACUTE ACCENT
0xBE	0x017E	#	LATIN SMALL LETTER Z WITH CARON
0xBF	0x017C	#	LATIN SMALL LETTER Z WITH DOT ABOVE
0xC0	0x0154	#	LATIN CAPITAL LETTER R WITH ACUTE
0xC1	0x00C1	#	LATIN CAPITAL LETTER A WITH ACUTE
0xC2	0x00C2	#	LATIN CAPITAL LETTER A WITH CIRCUMFLEX
0xC3	0x0102	#	LATIN CAPITAL LETTER A WITH BREVE
0xC4	0x00C4	#	LATIN CAPITAL LETTER A WITH DIAERESIS
0xC5	0x0139	#	LATIN CAPITAL LETTER L WITH ACUTE
0xC6	0x0106	#	LATIN CAPITAL LETTER C WITH ACUTE
0xC7	0x00C7	#	LATIN CAPITAL LETTER C WITH CEDILLA
0xC8	0x010C	#	LATIN CAPITAL LETTER C WITH CARON
0xC9	0x00C9	#	LATIN CAPITAL LETTER E WITH ACUTE
0xCA	0x0118	#	LATIN CAPITAL LETTER E WITH OGONEK
0xCB	0x00CB	#	LATIN CAPITAL LETTER E WITH DIAERESIS
0xCC	0x011A	#	LATIN CAPITAL LETTER E WITH CARON
0xCD	0x00CD	#	LATIN CAPITAL LETTER I WITH ACUTE
0xCE	0x00CE	#	LATIN CAPITAL LETTER I WITH CIRCUMFLEX
0xCF	0x010E	#	LATIN CAPITAL LETTER D WITH CARON
0xD0	0x0110	#	LATIN CAPITAL LETTER D WITH STROKE
0xD1	0x0143	#	LATIN CAPITAL LETTER N WITH ACUTE
0xD2	0x0147	#	LATIN CAPITAL LETTER N WITH CARON
0xD3	0x00D3	#	LATIN CAPITAL LETTER O WITH ACUTE
0xD4	0x00D4	#	LATIN CAPITAL LETTER O WITH CIRCUMFLEX
0xD5	0x0150	#	LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
0xD6	0x00D6	#	LATIN CAPITAL LETTER O WITH DIAERESIS
0xD7	0x00D7	#	MULTIPLICATION SIGN
0xD8	0x0158	#	LATIN CAPITAL LETTER R WITH CARON
0xD9	0x016E	#	LATIN CAPITAL LETTER U WITH RING ABOVE
0xDA	0x00DA	#	LATIN CAPITAL LETTER U WITH ACUTE
0xDB	0x0170	#	LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
0xDC	0x00DC	#	LATIN CAPITAL LETTER U WITH DIAERESIS
0xDD	0x00DD	#	LATIN CAPITAL LETTER Y WITH ACUTE
0xDE	0x0162	#	LATIN CAPITAL LETTER T WITH CEDILLA
0xDF	0x00DF	#	LATIN SMALL LETTER SHARP S
0xE0	0x0155	#	LATIN SMALL LETTER R WITH ACUTE
0xE1	0x00E1	#	LATIN SMALL LETTER A WITH ACUTE
0xE2	0x00E2	#	LATIN SMALL LETTER A WITH CIRCUMFLEX
0xE3	0x0103	#	LATIN SMALL LETTER A WITH BREVE
0xE4	0x00E4	#	LATIN SMALL LETTER A WITH DIAERESIS
0xE5	0x013A	#	LATIN SMALL LETTER L WITH ACUTE
0xE6	0x0107	#	LATIN SMALL LETTER C WITH ACUTE
0xE7	0x00E7	#	LATIN SMALL LETTER C WITH CEDILLA
0xE8	0x010D	#	LATIN SMALL LETTER C WITH CARON
0xE9	0x00E9	#	LATIN SMALL LETTER E WITH ACUTE
0xEA	0x0119	#	LATIN SMALL LETTER E WITH OGONEK
0xEB	0x00EB	#	LATIN SMALL LETTER E WITH DIAERESIS
0xEC	0x011B	#	LATIN SMALL LETTER E WITH CARON
0xED	0x00ED	#	LATIN SMALL LETTER I WITH ACUTE
0xEE	0x00EE	#	LATIN SMALL LETTER I WITH CIRCUMFLEX
0xEF	0x010F	#	LATIN SMALL LETTER D WITH CARON
0xF0	0x0111	#	LATIN SMALL LETTER D WITH STROKE
0xF1	0x0144	#	LATIN SMALL LETTER N WITH ACUTE
0xF2	0x0148	#	LATIN SMALL LETTER N WITH CARON
0xF3	0x00F3	#	LATIN SMALL LETTER O WITH ACUTE
0xF4	0x00F4	#	LATIN SMALL LETTER O WITH CIRCUMFLEX
0xF5	0x0151	#	LATIN SMALL LETTER O WITH DOUBLE ACUTE
0xF6	0x00F6	#	LATIN SMALL LETTER O WITH DIAERESIS
0xF7	0x00F7	#	DIVISION SIGN
0xF8	0x0159	#	LATIN SMALL LETTER R WITH CARON
0xF9	0x016F	#	LATIN SMALL LETTER U WITH RING ABOVE
0xFA	0x00FA	#	LATIN SMALL LETTER U WITH ACUTE
0xFB	0x0171	#	LATIN SMALL LETTER U WITH DOUBLE ACUTE
0xFC	0x00FC	#	LATIN SMALL LETTER U WITH DIAERESIS
0xFD	0x00FD	#	LATIN SMALL LETTER Y WITH ACUTE
0xFE	0x0163	#	LATIN SMALL LETTER T WITH CEDILLA
0xFF	0x02D9	#	DOT ABOVE

1.1                  vorbis-tools/oggenc/make_code_map.pl

Index: make_code_map.pl
===================================================================
#!/usr/bin/perl
# OggEnc

# This program is distributed under the GNU General Public License, version 2.
# A copy of this license is included with this source.
#
# Copyright © 2001, Daniel Resare <noa at metamatrix.se>

# this script creates a headerfile with charset maps from charset mapping
# files in the format published on unicode.org.
# To add more encodings, simply pull the desired files from
# http://www.unicode.org/Public/ and add encoding name and file name to 
# %maps

use strict;
use warnings;

# Encoding name => mapping file (tables in the format published on
# unicode.org: byte code, Unicode value, '#' comment).
my %maps = ('ISO-8859-1' => '8859-1.map',
            'ISO-8859-2' => '8859-2.map');

print <<EOF;
/* This file was automatically generated by make_code_map.pl
   please don't edit directly
   Daniel Resare <noa\@metamatrix.se>
*/
EOF

print("charset_map maps[] = {");
# Iterate in sorted order so the generated header is identical on every
# run (hash key order is not stable in Perl).
for my $name (sort keys %maps) {
  print("\n\t{\"" . $name . "\",\n\t {");

  open(my $fh, '<', $maps{$name})
    or die "Cannot open $maps{$name}: $!\n";

  # $count is the number of table entries emitted so far for this charset;
  # it drives both the comma separators and the 8-per-line wrapping.
  my $count = 0;
  while (my $line = <$fh>) {
    # Skip comment lines.
    next if $line =~ /^#/;

    my @fields = split ' ', $line;
    # Skip blank or truncated lines: they carry no Unicode column and
    # would otherwise emit an empty array element into the C source.
    next if @fields < 2;

    print "," if $count != 0;
    print "\n\t  " unless $count % 8;
    print $fields[1];
    $count++;
  }
  close($fh);

  print("\n\t }\n\t},");
}
print "\n\t{NULL}\n};\n";

1.1                  vorbis-tools/oggenc/utf8.c

Index: utf8.c
===================================================================
/* OggEnc
 *
 * This program is distributed under the GNU General Public License, version 2.
 * A copy of this license is included with this source.
 *
 * (C) 2001 Michael Smith <msmith at labyrinth.net.au>
 *
 * UTF-8 Conversion routines
 *   Copyright (C) 2001, Daniel Resare <noa at metamatrix.se>
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
/*
 Win32 placeholder for utf8_encode(): no conversion is performed.
 Prints a notice on stderr and always returns 1; FROM, TO and ENCODING
 are not examined and *TO is left untouched.
*/
int utf8_encode(const char *from, char **to, const char *encoding)
{
        static const char notice[] =
                "Sorry, not implemented currently on win32\n";

        fputs(notice, stderr);
        return 1;
}

#else /* End win32. Rest is for real operating systems */

#ifdef HAVE_ICONV
#include <iconv.h>
#include <errno.h>
#endif

#include "utf8.h"
#include "charsetmap.h"

#define BUFSIZE 256

/*
 Converts the string FROM from the encoding specified in ENCODING
 to UTF-8. The resulting string is pointed to by *TO. On success *TO is
 allocated with malloc() and must be released by the caller with free().
 On failure the contents of *TO must not be used.

 Return values:
 0 indicates a successfully converted string.
 1 indicates that the given encoding is not available.
 2 indicates that the given string is bigger than BUFSIZE and can therefore
   not be encoded.
 3 indicates that given string could not be parsed.
 4 indicates that memory for the result could not be allocated.

 When the work is delegated to simple_utf8_encode(), its return value is
 passed through unchanged.
*/
int utf8_encode(char *from, char **to, const char *encoding)
{
#ifdef HAVE_ICONV
        /* One spare byte so the terminator always fits, even when iconv
         * produces exactly BUFSIZE bytes of output. */
        char buffer[BUFSIZE + 1];
        char *from_p, *to_p;
        size_t from_left, to_left, out_len;
        iconv_t cd;
#endif

        if (!strcasecmp(encoding, "UTF-8")) {
                /* ideally some checking of the given string should be done */
                size_t len = strlen(from) + 1;

                *to = malloc(len);
                if(*to == NULL)
                        return 4;
                memcpy(*to, from, len);
                return 0;
        }

#ifdef HAVE_ICONV
        cd = iconv_open("UTF-8", encoding);
        if(cd == (iconv_t)(-1))
        {
                if(errno == EINVAL) {
                        /* if iconv can't encode from this encoding, try
                         * simple_utf8_encode()
                         */
                        return simple_utf8_encode(from, to, encoding);
                }
                /* Any other failure leaves us without a usable conversion
                 * descriptor; report it and stop instead of calling
                 * iconv() on an invalid handle. */
                perror("iconv_open");
                return 1;
        }

        from_left = strlen(from);
        to_left = BUFSIZE;
        from_p = from;
        to_p = buffer;

        if(iconv(cd, &from_p, &from_left, &to_p, &to_left) == (size_t)-1)
        {
                /* Save errno first: iconv_close() may overwrite it. */
                int err = errno;

                iconv_close(cd);
                switch(err)
                {
                case E2BIG:
                        /* if the buffer is too small, try simple_utf8_encode()
                         */
                        return simple_utf8_encode(from, to, encoding);
                case EILSEQ:
                case EINVAL:
                        return 3;
                default:
                        /* Unexpected failure: the buffer holds at best a
                         * partial conversion, so do not report success. */
                        errno = err;
                        perror("iconv");
                        return 3;
                }
        }
        iconv_close(cd);

        /* Number of bytes iconv actually wrote into buffer. */
        out_len = BUFSIZE - to_left;
        *to = malloc(out_len + 1);
        if(*to == NULL)
                return 4;
        memcpy(*to, buffer, out_len);
        (*to)[out_len] = 0;
        return 0;
#else
        return simple_utf8_encode(from, to, encoding);
#endif
}

/*
 Table-driven fallback converter: each byte of FROM is looked up in the
 charset_map named ENCODING and the resulting values are encoded as UTF-8.
 On success *TO receives the string produced by make_utf8_string().

 This implementation has the following limitations: The given charset must
 represent each glyph with exactly one (1) byte. No multi byte or variable
 width charsets are allowed. (UTF-8 itself is passed right through by
 utf8_encode() and is not handled here.) The glyphs in the charsets must
 have a unicode value equal to or less than 0xFFFF (this includes pretty
 much everything). For a complete, free conversion implementation please
 have a look at libiconv.

 Return values:
 0 on success.
 1 if no mapping table exists for ENCODING.
 3 if memory for the conversion could not be allocated.
*/
int simple_utf8_encode(const char *from, char **to, const char *encoding)
{
        unsigned short *unicode;
        charset_map *map;
        char *utf8;
        size_t length, i;

        /* Look the table up before allocating so an unknown encoding
         * cannot leak the scratch array.
         */
        map = get_map(encoding);
        if (map == NULL) {
                return 1;
        }

        length = strlen(from);

        /* One extra element holds the 0 terminator make_utf8_string()
         * scans for.
         */
        unicode = malloc((length + 1) * sizeof *unicode);
        if (unicode == NULL) {
                return 3;
        }

        for (i = 0; i < length; i++) {
                /* Index through unsigned char so bytes >= 0x80 never
                 * become negative subscripts.
                 */
                unsigned char c = (unsigned char)from[i];

                unicode[i] = (unsigned short)map->mapping[c];
        }
        unicode[length] = 0;

        utf8 = make_utf8_string(unicode);
        free(unicode);
        if (utf8 == NULL) {
                return 3;
        }

        *to = utf8;
        return 0;
}
        
/*
 * Finds the conversion table whose name equals ENCODING, compared without
 * regard to case. The global maps[] array is walked until the entry with a
 * NULL name that terminates it.
 *
 * Returns a pointer to the matching entry, or NULL if none matches.
 */
charset_map *get_map(const char *encoding)
{
        charset_map *entry;

        for (entry = maps; entry->name != NULL; entry++) {
                if (strcasecmp(entry->name, encoding) == 0) {
                        return entry;
                }
        }
        return NULL;
}

/*
 * Encodes the 0-terminated array of 16-bit code points UNICODE as UTF-8.
 *
 * Each value below 0x80 becomes one byte, each value below 0x800 two
 * bytes, and everything else three bytes.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL if the allocation fails.
 */
char *make_utf8_string(const unsigned short *unicode)
{
        size_t size = 0, index, out_index = 0;
        unsigned char *out;
        unsigned short c;

        /* first calculate the size of the target string; the thresholds
         * must match the encoding loop below exactly
         */
        for (index = 0; (c = unicode[index]) != 0; index++) {
                if (c < 0x0080) {
                        size += 1;
                } else if (c < 0x0800) {
                        size += 2;
                } else {
                        size += 3;
                }
        }

        /* +1 for the terminating NUL */
        out = malloc(size + 1);
        if (out == NULL) {
                return NULL;
        }

        for (index = 0; (c = unicode[index]) != 0; index++) {
                if (c < 0x0080) {
                        out[out_index++] = (unsigned char)c;
                } else if (c < 0x0800) {
                        out[out_index++] = (unsigned char)(0xc0 | (c >> 6));
                        out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
                } else {
                        out[out_index++] = (unsigned char)(0xe0 | ((c >> 12) & 0x0f));
                        out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
                        out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
                }
        }
        out[out_index] = '\0';

        return (char *)out;
}

#endif

1.1                  vorbis-tools/oggenc/utf8.h

Index: utf8.h
===================================================================
/* OggEnc
 *
 * This program is distributed under the GNU General Public License, version 2.
 * A copy of this license is included with this source.
 *
 * Copyright © 2001, Daniel Resare <noa at metamatrix.se>
 */

#ifndef OGGENC_UTF8_H
#define OGGENC_UTF8_H

/*
 * Conversion table for one single-byte character set: mapping[b] is the
 * value that byte b of the input is translated to before UTF-8 encoding.
 */
typedef struct
{
        char *name;             /* charset name, matched case-insensitively */
        int mapping[256];       /* one entry per possible input byte */
} charset_map;

/* Returns the table named ENCODING, or NULL if there is none. */
charset_map *get_map(const char *encoding);

/* Encodes the 0-terminated code point array UNICODE as a malloc()ed
 * UTF-8 string.
 */
char *make_utf8_string(const unsigned short *unicode);

/* Table-driven conversion of FROM to UTF-8; stores the result in *TO.
 * Returns 0 on success, 1 if ENCODING has no table.
 */
int simple_utf8_encode(const char *from, char **to, const char *encoding);

/* Converts FROM from ENCODING to UTF-8; stores the result in *TO.
 * Returns 0 on success and non-zero on failure.
 */
int utf8_encode(char *from, char **to, const char *encoding);

#endif /* OGGENC_UTF8_H */

--- >8 ----
List archives:  http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body.  No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.



More information about the commits mailing list