[xiph-cvs] cvs commit: vorbis-tools/oggenc 8859-1.map 8859-2.map make_code_map.pl utf8.c utf8.h Makefile.am encode.h oggenc.c
Michael Smith
msmith at xiph.org
Mon Jul 2 02:39:11 PDT 2001
msmith 01/07/02 02:39:11
Modified: oggenc Makefile.am encode.h oggenc.c
Added: oggenc 8859-1.map 8859-2.map make_code_map.pl utf8.c
utf8.h
Log:
Implementation (partial, needs win32 support still) of charset conversion
to UTF8 for oggenc.
Also some documentation updates.
Revision Changes Path
1.9 +3 -2 vorbis-tools/oggenc/Makefile.am
Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/Makefile.am,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- Makefile.am 2001/01/25 06:54:52 1.8
+++ Makefile.am 2001/07/02 09:39:10 1.9
@@ -10,10 +10,11 @@
oggenc_LDADD = @VORBISENC_LIBS@ @VORBIS_LIBS@ @OGG_LIBS@
oggenc_SOURCES = oggenc.c audio.c encode.c getopt.c getopt1.c\
- platform.c\
+ platform.c utf8.c utf8.h\
audio.h encode.h platform.h getopt.h
-EXTRA_DIST = oggenc.dsp build_oggenc.bat
+MAP_FILES = 8859-1.map 8859-2.map
+EXTRA_DIST = oggenc.dsp build_oggenc.bat $(MAP_FILES) make_code_map.pl charsetmap.h
debug:
$(MAKE) all CFLAGS="@DEBUG@"
1.8 +2 -0 vorbis-tools/oggenc/encode.h
Index: encode.h
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/encode.h,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- encode.h 2001/05/27 09:49:35 1.7
+++ encode.h 2001/07/02 09:39:10 1.8
@@ -29,6 +29,8 @@
typedef struct
{
+ char *encoding;
+
char **title;
int title_count;
char **artist;
1.16 +35 -13 vorbis-tools/oggenc/oggenc.c
Index: oggenc.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/oggenc/oggenc.c,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -r1.15 -r1.16
--- oggenc.c 2001/06/18 00:32:47 1.15
+++ oggenc.c 2001/07/02 09:39:10 1.16
@@ -19,6 +19,7 @@
#include "platform.h"
#include "encode.h"
#include "audio.h"
+#include "utf8.h"
#define VERSION_STRING "OggEnc v0.7 (libvorbis rc1)\n"
#define COPYRIGHT "(c) 2000 Michael Smith <msmith at labyrinth.net.au)\n"
@@ -42,6 +43,7 @@
{"date",1,0,'d'},
{"tracknum",1,0,'N'},
{"serial",1,0,'s'},
+ {"encoding",1,0,'e'},
{NULL,0,0,0}
};
@@ -53,8 +55,8 @@
int main(int argc, char **argv)
{
- oe_options opt = {NULL, 0, NULL, 0, NULL, 0, NULL, 0, NULL, 0, NULL, 0,
- 0, 0,16,44100,2, NULL,NULL,128,0}; /* Default values */
+ oe_options opt = {"ISO-8859-1", NULL, 0, NULL, 0, NULL, 0, NULL, 0, NULL,
+ 0, NULL, 0, 0, 0,16,44100,2, NULL,NULL,128,0}; /* Default values */
int i;
char **infiles;
@@ -281,6 +283,7 @@
" -s, --serial Specify a serial number for the stream. If encoding\n"
" multiple files, this will be incremented for each\n"
" stream after the first.\n"
+ " -e, --encoding Specify an encoding for the comments given.\n"
"\n"
" Naming:\n"
" -o, --output=fn Write file to fn (only valid in single-file mode)\n"
@@ -306,12 +309,13 @@
" once, for example, and have it used for all the files)\n"
"\n"
"INPUT FILES:\n"
- " OggEnc input files must currently be 16 bit PCM WAV, AIFF, or AIFF/C files.\n"
- " Files may be mono or stereo (or more channels) and sampling rates \n"
+ " OggEnc input files must currently be 16 or 8 bit PCM WAV, AIFF, or AIFF/C\n"
+ " files. Files may be mono or stereo (or more channels) and sampling rates \n"
" between 8kHz and 56kHz.\n"
- " You can specify taking the file from stdin by using - as the input filename.\n"
" Alternatively, the --raw option may be used to use a raw PCM data file, which\n"
- " must be 16bit stereo little-endian PCM ('headerless wav').\n"
+ " must be 16bit stereo little-endian PCM ('headerless wav'), unless additional\n"
+ " parameters for raw mode are specified.\n"
+ " You can specify taking the file from stdin by using - as the input filename.\n"
" In this mode, output is to stdout unless an outfile filename is specified\n"
" with -o\n"
"\n"
@@ -386,7 +390,7 @@
int ret;
int option_index = 1;
- while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:hl:n:N:o:qrR:s:t:v",
+ while((ret = getopt_long(argc, argv, "a:b:B:c:C:d:e:hl:n:N:o:qrR:s:t:v",
long_options, &option_index)) != -1)
{
switch(ret)
@@ -407,6 +411,9 @@
opt->dates = realloc(opt->dates, (++opt->date_count)*sizeof(char *));
opt->dates[opt->date_count - 1] = strdup(optarg);
break;
+ case 'e':
+ opt->encoding = strdup(optarg);
+ break;
case 'l':
opt->album = realloc(opt->album, (++opt->album_count)*sizeof(char *));
opt->album[opt->album_count - 1] = strdup(optarg);
@@ -508,6 +515,21 @@
}
}
+void add_tag(vorbis_comment *vc, oe_options *opt, char *name, char *value)
+{
+ char *utf8;
+ if(utf8_encode(value, &utf8, opt->encoding) == 0)
+ {
+ if(name == NULL)
+ vorbis_comment_add(vc, utf8);
+ else
+ vorbis_comment_add_tag(vc, name, utf8);
+ free(utf8);
+ }
+ else
+ fprintf(stderr, "Couldn't convert comment to UTF8, cannot add\n");
+}
+
void build_comments(vorbis_comment *vc, oe_options *opt, int filenum,
char **artist, char **album, char **title, char **tracknum, char **date)
{
@@ -516,7 +538,7 @@
vorbis_comment_init(vc);
for(i = 0; i < opt->comment_count; i++)
- vorbis_comment_add(vc, opt->comments[i]);
+ add_tag(vc, opt, NULL, opt->comments[i]);
if(opt->title_count)
{
@@ -530,7 +552,7 @@
i = filenum;
*title = opt->title[i];
- vorbis_comment_add_tag(vc, "title", opt->title[i]);
+ add_tag(vc, opt, "title", opt->title[i]);
}
if(opt->artist_count)
@@ -541,7 +563,7 @@
i = filenum;
*artist = opt->artist[i];
- vorbis_comment_add_tag(vc, "artist", opt->artist[i]);
+ add_tag(vc, opt, "artist", opt->artist[i]);
}
if(opt->date_count)
@@ -552,7 +574,7 @@
i = filenum;
*date = opt->dates[i];
- vorbis_comment_add_tag(vc, "date", opt->dates[i]);
+ add_tag(vc, opt, "date", opt->dates[i]);
}
if(opt->album_count)
@@ -565,14 +587,14 @@
i = filenum;
*album = opt->album[i];
- vorbis_comment_add_tag(vc, "album", opt->album[i]);
+ add_tag(vc, opt, "album", opt->album[i]);
}
if(filenum < opt->track_count)
{
i = filenum;
*tracknum = opt->tracknum[i];
- vorbis_comment_add_tag(vc, "tracknumber", opt->tracknum[i]);
+ add_tag(vc, opt, "tracknumber", opt->tracknum[i]);
}
}
1.1 vorbis-tools/oggenc/8859-1.map
Index: 8859-1.map
===================================================================
#
# Name: ISO/IEC 8859-1:1998 to Unicode
# Unicode version: 3.0
# Table version: 1.0
# Table format: Format A
# Date: 1999 July 27
# Authors: Ken Whistler <kenw at sybase.com>
#
# Copyright (c) 1991-1999 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on optical media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Unicode, Inc. hereby grants the right to freely use the information
# supplied in this file in the creation of products supporting the
# Unicode Standard, and to make copies of this file in any form for
# internal or external distribution as long as this notice remains
# attached.
#
# General notes:
#
# This table contains the data the Unicode Consortium has on how
# ISO/IEC 8859-1:1998 characters map into Unicode.
#
# Format: Three tab-separated columns
# Column #1 is the ISO/IEC 8859-1 code (in hex as 0xXX)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 the Unicode name (follows a comment sign, '#')
#
# The entries are in ISO/IEC 8859-1 order.
#
# Version history
# 1.0 version updates 0.1 version by adding mappings for all
# control characters.
#
# Updated versions of this file may be found in:
# <ftp://ftp.unicode.org/Public/MAPPINGS/>
#
# Any comments or problems, contact <errata at unicode.org>
# Please note that <errata at unicode.org> is an archival address;
# notices will be checked, but do not expect an immediate response.
#
0x00 0x0000 # NULL
0x01 0x0001 # START OF HEADING
0x02 0x0002 # START OF TEXT
0x03 0x0003 # END OF TEXT
0x04 0x0004 # END OF TRANSMISSION
0x05 0x0005 # ENQUIRY
0x06 0x0006 # ACKNOWLEDGE
0x07 0x0007 # BELL
0x08 0x0008 # BACKSPACE
0x09 0x0009 # HORIZONTAL TABULATION
0x0A 0x000A # LINE FEED
0x0B 0x000B # VERTICAL TABULATION
0x0C 0x000C # FORM FEED
0x0D 0x000D # CARRIAGE RETURN
0x0E 0x000E # SHIFT OUT
0x0F 0x000F # SHIFT IN
0x10 0x0010 # DATA LINK ESCAPE
0x11 0x0011 # DEVICE CONTROL ONE
0x12 0x0012 # DEVICE CONTROL TWO
0x13 0x0013 # DEVICE CONTROL THREE
0x14 0x0014 # DEVICE CONTROL FOUR
0x15 0x0015 # NEGATIVE ACKNOWLEDGE
0x16 0x0016 # SYNCHRONOUS IDLE
0x17 0x0017 # END OF TRANSMISSION BLOCK
0x18 0x0018 # CANCEL
0x19 0x0019 # END OF MEDIUM
0x1A 0x001A # SUBSTITUTE
0x1B 0x001B # ESCAPE
0x1C 0x001C # FILE SEPARATOR
0x1D 0x001D # GROUP SEPARATOR
0x1E 0x001E # RECORD SEPARATOR
0x1F 0x001F # UNIT SEPARATOR
0x20 0x0020 # SPACE
0x21 0x0021 # EXCLAMATION MARK
0x22 0x0022 # QUOTATION MARK
0x23 0x0023 # NUMBER SIGN
0x24 0x0024 # DOLLAR SIGN
0x25 0x0025 # PERCENT SIGN
0x26 0x0026 # AMPERSAND
0x27 0x0027 # APOSTROPHE
0x28 0x0028 # LEFT PARENTHESIS
0x29 0x0029 # RIGHT PARENTHESIS
0x2A 0x002A # ASTERISK
0x2B 0x002B # PLUS SIGN
0x2C 0x002C # COMMA
0x2D 0x002D # HYPHEN-MINUS
0x2E 0x002E # FULL STOP
0x2F 0x002F # SOLIDUS
0x30 0x0030 # DIGIT ZERO
0x31 0x0031 # DIGIT ONE
0x32 0x0032 # DIGIT TWO
0x33 0x0033 # DIGIT THREE
0x34 0x0034 # DIGIT FOUR
0x35 0x0035 # DIGIT FIVE
0x36 0x0036 # DIGIT SIX
0x37 0x0037 # DIGIT SEVEN
0x38 0x0038 # DIGIT EIGHT
0x39 0x0039 # DIGIT NINE
0x3A 0x003A # COLON
0x3B 0x003B # SEMICOLON
0x3C 0x003C # LESS-THAN SIGN
0x3D 0x003D # EQUALS SIGN
0x3E 0x003E # GREATER-THAN SIGN
0x3F 0x003F # QUESTION MARK
0x40 0x0040 # COMMERCIAL AT
0x41 0x0041 # LATIN CAPITAL LETTER A
0x42 0x0042 # LATIN CAPITAL LETTER B
0x43 0x0043 # LATIN CAPITAL LETTER C
0x44 0x0044 # LATIN CAPITAL LETTER D
0x45 0x0045 # LATIN CAPITAL LETTER E
0x46 0x0046 # LATIN CAPITAL LETTER F
0x47 0x0047 # LATIN CAPITAL LETTER G
0x48 0x0048 # LATIN CAPITAL LETTER H
0x49 0x0049 # LATIN CAPITAL LETTER I
0x4A 0x004A # LATIN CAPITAL LETTER J
0x4B 0x004B # LATIN CAPITAL LETTER K
0x4C 0x004C # LATIN CAPITAL LETTER L
0x4D 0x004D # LATIN CAPITAL LETTER M
0x4E 0x004E # LATIN CAPITAL LETTER N
0x4F 0x004F # LATIN CAPITAL LETTER O
0x50 0x0050 # LATIN CAPITAL LETTER P
0x51 0x0051 # LATIN CAPITAL LETTER Q
0x52 0x0052 # LATIN CAPITAL LETTER R
0x53 0x0053 # LATIN CAPITAL LETTER S
0x54 0x0054 # LATIN CAPITAL LETTER T
0x55 0x0055 # LATIN CAPITAL LETTER U
0x56 0x0056 # LATIN CAPITAL LETTER V
0x57 0x0057 # LATIN CAPITAL LETTER W
0x58 0x0058 # LATIN CAPITAL LETTER X
0x59 0x0059 # LATIN CAPITAL LETTER Y
0x5A 0x005A # LATIN CAPITAL LETTER Z
0x5B 0x005B # LEFT SQUARE BRACKET
0x5C 0x005C # REVERSE SOLIDUS
0x5D 0x005D # RIGHT SQUARE BRACKET
0x5E 0x005E # CIRCUMFLEX ACCENT
0x5F 0x005F # LOW LINE
0x60 0x0060 # GRAVE ACCENT
0x61 0x0061 # LATIN SMALL LETTER A
0x62 0x0062 # LATIN SMALL LETTER B
0x63 0x0063 # LATIN SMALL LETTER C
0x64 0x0064 # LATIN SMALL LETTER D
0x65 0x0065 # LATIN SMALL LETTER E
0x66 0x0066 # LATIN SMALL LETTER F
0x67 0x0067 # LATIN SMALL LETTER G
0x68 0x0068 # LATIN SMALL LETTER H
0x69 0x0069 # LATIN SMALL LETTER I
0x6A 0x006A # LATIN SMALL LETTER J
0x6B 0x006B # LATIN SMALL LETTER K
0x6C 0x006C # LATIN SMALL LETTER L
0x6D 0x006D # LATIN SMALL LETTER M
0x6E 0x006E # LATIN SMALL LETTER N
0x6F 0x006F # LATIN SMALL LETTER O
0x70 0x0070 # LATIN SMALL LETTER P
0x71 0x0071 # LATIN SMALL LETTER Q
0x72 0x0072 # LATIN SMALL LETTER R
0x73 0x0073 # LATIN SMALL LETTER S
0x74 0x0074 # LATIN SMALL LETTER T
0x75 0x0075 # LATIN SMALL LETTER U
0x76 0x0076 # LATIN SMALL LETTER V
0x77 0x0077 # LATIN SMALL LETTER W
0x78 0x0078 # LATIN SMALL LETTER X
0x79 0x0079 # LATIN SMALL LETTER Y
0x7A 0x007A # LATIN SMALL LETTER Z
0x7B 0x007B # LEFT CURLY BRACKET
0x7C 0x007C # VERTICAL LINE
0x7D 0x007D # RIGHT CURLY BRACKET
0x7E 0x007E # TILDE
0x7F 0x007F # DELETE
0x80 0x0080 # <control>
0x81 0x0081 # <control>
0x82 0x0082 # <control>
0x83 0x0083 # <control>
0x84 0x0084 # <control>
0x85 0x0085 # <control>
0x86 0x0086 # <control>
0x87 0x0087 # <control>
0x88 0x0088 # <control>
0x89 0x0089 # <control>
0x8A 0x008A # <control>
0x8B 0x008B # <control>
0x8C 0x008C # <control>
0x8D 0x008D # <control>
0x8E 0x008E # <control>
0x8F 0x008F # <control>
0x90 0x0090 # <control>
0x91 0x0091 # <control>
0x92 0x0092 # <control>
0x93 0x0093 # <control>
0x94 0x0094 # <control>
0x95 0x0095 # <control>
0x96 0x0096 # <control>
0x97 0x0097 # <control>
0x98 0x0098 # <control>
0x99 0x0099 # <control>
0x9A 0x009A # <control>
0x9B 0x009B # <control>
0x9C 0x009C # <control>
0x9D 0x009D # <control>
0x9E 0x009E # <control>
0x9F 0x009F # <control>
0xA0 0x00A0 # NO-BREAK SPACE
0xA1 0x00A1 # INVERTED EXCLAMATION MARK
0xA2 0x00A2 # CENT SIGN
0xA3 0x00A3 # POUND SIGN
0xA4 0x00A4 # CURRENCY SIGN
0xA5 0x00A5 # YEN SIGN
0xA6 0x00A6 # BROKEN BAR
0xA7 0x00A7 # SECTION SIGN
0xA8 0x00A8 # DIAERESIS
0xA9 0x00A9 # COPYRIGHT SIGN
0xAA 0x00AA # FEMININE ORDINAL INDICATOR
0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
0xAC 0x00AC # NOT SIGN
0xAD 0x00AD # SOFT HYPHEN
0xAE 0x00AE # REGISTERED SIGN
0xAF 0x00AF # MACRON
0xB0 0x00B0 # DEGREE SIGN
0xB1 0x00B1 # PLUS-MINUS SIGN
0xB2 0x00B2 # SUPERSCRIPT TWO
0xB3 0x00B3 # SUPERSCRIPT THREE
0xB4 0x00B4 # ACUTE ACCENT
0xB5 0x00B5 # MICRO SIGN
0xB6 0x00B6 # PILCROW SIGN
0xB7 0x00B7 # MIDDLE DOT
0xB8 0x00B8 # CEDILLA
0xB9 0x00B9 # SUPERSCRIPT ONE
0xBA 0x00BA # MASCULINE ORDINAL INDICATOR
0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
0xBC 0x00BC # VULGAR FRACTION ONE QUARTER
0xBD 0x00BD # VULGAR FRACTION ONE HALF
0xBE 0x00BE # VULGAR FRACTION THREE QUARTERS
0xBF 0x00BF # INVERTED QUESTION MARK
0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE
0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE
0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE
0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS
0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE
0xC6 0x00C6 # LATIN CAPITAL LETTER AE
0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA
0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE
0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE
0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS
0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE
0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE
0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS
0xD0 0x00D0 # LATIN CAPITAL LETTER ETH (Icelandic)
0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE
0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE
0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE
0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE
0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS
0xD7 0x00D7 # MULTIPLICATION SIGN
0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE
0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE
0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE
0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS
0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE
0xDE 0x00DE # LATIN CAPITAL LETTER THORN (Icelandic)
0xDF 0x00DF # LATIN SMALL LETTER SHARP S (German)
0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE
0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE
0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX
0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE
0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS
0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE
0xE6 0x00E6 # LATIN SMALL LETTER AE
0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA
0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE
0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE
0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX
0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS
0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE
0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE
0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX
0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS
0xF0 0x00F0 # LATIN SMALL LETTER ETH (Icelandic)
0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE
0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE
0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE
0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX
0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE
0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS
0xF7 0x00F7 # DIVISION SIGN
0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE
0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE
0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE
0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX
0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS
0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE
0xFE 0x00FE # LATIN SMALL LETTER THORN (Icelandic)
0xFF 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS
1.1 vorbis-tools/oggenc/8859-2.map
Index: 8859-2.map
===================================================================
#
# Name: ISO 8859-2:1999 to Unicode
# Unicode version: 3.0
# Table version: 1.0
# Table format: Format A
# Date: 1999 July 27
# Authors: Ken Whistler <kenw at sybase.com>
#
# Copyright (c) 1991-1999 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on optical media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Unicode, Inc. hereby grants the right to freely use the information
# supplied in this file in the creation of products supporting the
# Unicode Standard, and to make copies of this file in any form for
# internal or external distribution as long as this notice remains
# attached.
#
# General notes:
#
# This table contains the data the Unicode Consortium has on how
# ISO/IEC 8859-2:1999 characters map into Unicode.
#
# Format: Three tab-separated columns
# Column #1 is the ISO/IEC 8859-2 code (in hex as 0xXX)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 the Unicode name (follows a comment sign, '#')
#
# The entries are in ISO/IEC 8859-2 order.
#
# Version history
# 1.0 version updates 0.1 version by adding mappings for all
# control characters.
#
# Updated versions of this file may be found in:
# <ftp://ftp.unicode.org/Public/MAPPINGS/>
#
# Any comments or problems, contact <errata at unicode.org>
# Please note that <errata at unicode.org> is an archival address;
# notices will be checked, but do not expect an immediate response.
#
0x00 0x0000 # NULL
0x01 0x0001 # START OF HEADING
0x02 0x0002 # START OF TEXT
0x03 0x0003 # END OF TEXT
0x04 0x0004 # END OF TRANSMISSION
0x05 0x0005 # ENQUIRY
0x06 0x0006 # ACKNOWLEDGE
0x07 0x0007 # BELL
0x08 0x0008 # BACKSPACE
0x09 0x0009 # HORIZONTAL TABULATION
0x0A 0x000A # LINE FEED
0x0B 0x000B # VERTICAL TABULATION
0x0C 0x000C # FORM FEED
0x0D 0x000D # CARRIAGE RETURN
0x0E 0x000E # SHIFT OUT
0x0F 0x000F # SHIFT IN
0x10 0x0010 # DATA LINK ESCAPE
0x11 0x0011 # DEVICE CONTROL ONE
0x12 0x0012 # DEVICE CONTROL TWO
0x13 0x0013 # DEVICE CONTROL THREE
0x14 0x0014 # DEVICE CONTROL FOUR
0x15 0x0015 # NEGATIVE ACKNOWLEDGE
0x16 0x0016 # SYNCHRONOUS IDLE
0x17 0x0017 # END OF TRANSMISSION BLOCK
0x18 0x0018 # CANCEL
0x19 0x0019 # END OF MEDIUM
0x1A 0x001A # SUBSTITUTE
0x1B 0x001B # ESCAPE
0x1C 0x001C # FILE SEPARATOR
0x1D 0x001D # GROUP SEPARATOR
0x1E 0x001E # RECORD SEPARATOR
0x1F 0x001F # UNIT SEPARATOR
0x20 0x0020 # SPACE
0x21 0x0021 # EXCLAMATION MARK
0x22 0x0022 # QUOTATION MARK
0x23 0x0023 # NUMBER SIGN
0x24 0x0024 # DOLLAR SIGN
0x25 0x0025 # PERCENT SIGN
0x26 0x0026 # AMPERSAND
0x27 0x0027 # APOSTROPHE
0x28 0x0028 # LEFT PARENTHESIS
0x29 0x0029 # RIGHT PARENTHESIS
0x2A 0x002A # ASTERISK
0x2B 0x002B # PLUS SIGN
0x2C 0x002C # COMMA
0x2D 0x002D # HYPHEN-MINUS
0x2E 0x002E # FULL STOP
0x2F 0x002F # SOLIDUS
0x30 0x0030 # DIGIT ZERO
0x31 0x0031 # DIGIT ONE
0x32 0x0032 # DIGIT TWO
0x33 0x0033 # DIGIT THREE
0x34 0x0034 # DIGIT FOUR
0x35 0x0035 # DIGIT FIVE
0x36 0x0036 # DIGIT SIX
0x37 0x0037 # DIGIT SEVEN
0x38 0x0038 # DIGIT EIGHT
0x39 0x0039 # DIGIT NINE
0x3A 0x003A # COLON
0x3B 0x003B # SEMICOLON
0x3C 0x003C # LESS-THAN SIGN
0x3D 0x003D # EQUALS SIGN
0x3E 0x003E # GREATER-THAN SIGN
0x3F 0x003F # QUESTION MARK
0x40 0x0040 # COMMERCIAL AT
0x41 0x0041 # LATIN CAPITAL LETTER A
0x42 0x0042 # LATIN CAPITAL LETTER B
0x43 0x0043 # LATIN CAPITAL LETTER C
0x44 0x0044 # LATIN CAPITAL LETTER D
0x45 0x0045 # LATIN CAPITAL LETTER E
0x46 0x0046 # LATIN CAPITAL LETTER F
0x47 0x0047 # LATIN CAPITAL LETTER G
0x48 0x0048 # LATIN CAPITAL LETTER H
0x49 0x0049 # LATIN CAPITAL LETTER I
0x4A 0x004A # LATIN CAPITAL LETTER J
0x4B 0x004B # LATIN CAPITAL LETTER K
0x4C 0x004C # LATIN CAPITAL LETTER L
0x4D 0x004D # LATIN CAPITAL LETTER M
0x4E 0x004E # LATIN CAPITAL LETTER N
0x4F 0x004F # LATIN CAPITAL LETTER O
0x50 0x0050 # LATIN CAPITAL LETTER P
0x51 0x0051 # LATIN CAPITAL LETTER Q
0x52 0x0052 # LATIN CAPITAL LETTER R
0x53 0x0053 # LATIN CAPITAL LETTER S
0x54 0x0054 # LATIN CAPITAL LETTER T
0x55 0x0055 # LATIN CAPITAL LETTER U
0x56 0x0056 # LATIN CAPITAL LETTER V
0x57 0x0057 # LATIN CAPITAL LETTER W
0x58 0x0058 # LATIN CAPITAL LETTER X
0x59 0x0059 # LATIN CAPITAL LETTER Y
0x5A 0x005A # LATIN CAPITAL LETTER Z
0x5B 0x005B # LEFT SQUARE BRACKET
0x5C 0x005C # REVERSE SOLIDUS
0x5D 0x005D # RIGHT SQUARE BRACKET
0x5E 0x005E # CIRCUMFLEX ACCENT
0x5F 0x005F # LOW LINE
0x60 0x0060 # GRAVE ACCENT
0x61 0x0061 # LATIN SMALL LETTER A
0x62 0x0062 # LATIN SMALL LETTER B
0x63 0x0063 # LATIN SMALL LETTER C
0x64 0x0064 # LATIN SMALL LETTER D
0x65 0x0065 # LATIN SMALL LETTER E
0x66 0x0066 # LATIN SMALL LETTER F
0x67 0x0067 # LATIN SMALL LETTER G
0x68 0x0068 # LATIN SMALL LETTER H
0x69 0x0069 # LATIN SMALL LETTER I
0x6A 0x006A # LATIN SMALL LETTER J
0x6B 0x006B # LATIN SMALL LETTER K
0x6C 0x006C # LATIN SMALL LETTER L
0x6D 0x006D # LATIN SMALL LETTER M
0x6E 0x006E # LATIN SMALL LETTER N
0x6F 0x006F # LATIN SMALL LETTER O
0x70 0x0070 # LATIN SMALL LETTER P
0x71 0x0071 # LATIN SMALL LETTER Q
0x72 0x0072 # LATIN SMALL LETTER R
0x73 0x0073 # LATIN SMALL LETTER S
0x74 0x0074 # LATIN SMALL LETTER T
0x75 0x0075 # LATIN SMALL LETTER U
0x76 0x0076 # LATIN SMALL LETTER V
0x77 0x0077 # LATIN SMALL LETTER W
0x78 0x0078 # LATIN SMALL LETTER X
0x79 0x0079 # LATIN SMALL LETTER Y
0x7A 0x007A # LATIN SMALL LETTER Z
0x7B 0x007B # LEFT CURLY BRACKET
0x7C 0x007C # VERTICAL LINE
0x7D 0x007D # RIGHT CURLY BRACKET
0x7E 0x007E # TILDE
0x7F 0x007F # DELETE
0x80 0x0080 # <control>
0x81 0x0081 # <control>
0x82 0x0082 # <control>
0x83 0x0083 # <control>
0x84 0x0084 # <control>
0x85 0x0085 # <control>
0x86 0x0086 # <control>
0x87 0x0087 # <control>
0x88 0x0088 # <control>
0x89 0x0089 # <control>
0x8A 0x008A # <control>
0x8B 0x008B # <control>
0x8C 0x008C # <control>
0x8D 0x008D # <control>
0x8E 0x008E # <control>
0x8F 0x008F # <control>
0x90 0x0090 # <control>
0x91 0x0091 # <control>
0x92 0x0092 # <control>
0x93 0x0093 # <control>
0x94 0x0094 # <control>
0x95 0x0095 # <control>
0x96 0x0096 # <control>
0x97 0x0097 # <control>
0x98 0x0098 # <control>
0x99 0x0099 # <control>
0x9A 0x009A # <control>
0x9B 0x009B # <control>
0x9C 0x009C # <control>
0x9D 0x009D # <control>
0x9E 0x009E # <control>
0x9F 0x009F # <control>
0xA0 0x00A0 # NO-BREAK SPACE
0xA1 0x0104 # LATIN CAPITAL LETTER A WITH OGONEK
0xA2 0x02D8 # BREVE
0xA3 0x0141 # LATIN CAPITAL LETTER L WITH STROKE
0xA4 0x00A4 # CURRENCY SIGN
0xA5 0x013D # LATIN CAPITAL LETTER L WITH CARON
0xA6 0x015A # LATIN CAPITAL LETTER S WITH ACUTE
0xA7 0x00A7 # SECTION SIGN
0xA8 0x00A8 # DIAERESIS
0xA9 0x0160 # LATIN CAPITAL LETTER S WITH CARON
0xAA 0x015E # LATIN CAPITAL LETTER S WITH CEDILLA
0xAB 0x0164 # LATIN CAPITAL LETTER T WITH CARON
0xAC 0x0179 # LATIN CAPITAL LETTER Z WITH ACUTE
0xAD 0x00AD # SOFT HYPHEN
0xAE 0x017D # LATIN CAPITAL LETTER Z WITH CARON
0xAF 0x017B # LATIN CAPITAL LETTER Z WITH DOT ABOVE
0xB0 0x00B0 # DEGREE SIGN
0xB1 0x0105 # LATIN SMALL LETTER A WITH OGONEK
0xB2 0x02DB # OGONEK
0xB3 0x0142 # LATIN SMALL LETTER L WITH STROKE
0xB4 0x00B4 # ACUTE ACCENT
0xB5 0x013E # LATIN SMALL LETTER L WITH CARON
0xB6 0x015B # LATIN SMALL LETTER S WITH ACUTE
0xB7 0x02C7 # CARON
0xB8 0x00B8 # CEDILLA
0xB9 0x0161 # LATIN SMALL LETTER S WITH CARON
0xBA 0x015F # LATIN SMALL LETTER S WITH CEDILLA
0xBB 0x0165 # LATIN SMALL LETTER T WITH CARON
0xBC 0x017A # LATIN SMALL LETTER Z WITH ACUTE
0xBD 0x02DD # DOUBLE ACUTE ACCENT
0xBE 0x017E # LATIN SMALL LETTER Z WITH CARON
0xBF 0x017C # LATIN SMALL LETTER Z WITH DOT ABOVE
0xC0 0x0154 # LATIN CAPITAL LETTER R WITH ACUTE
0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE
0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
0xC3 0x0102 # LATIN CAPITAL LETTER A WITH BREVE
0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS
0xC5 0x0139 # LATIN CAPITAL LETTER L WITH ACUTE
0xC6 0x0106 # LATIN CAPITAL LETTER C WITH ACUTE
0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA
0xC8 0x010C # LATIN CAPITAL LETTER C WITH CARON
0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE
0xCA 0x0118 # LATIN CAPITAL LETTER E WITH OGONEK
0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS
0xCC 0x011A # LATIN CAPITAL LETTER E WITH CARON
0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE
0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
0xCF 0x010E # LATIN CAPITAL LETTER D WITH CARON
0xD0 0x0110 # LATIN CAPITAL LETTER D WITH STROKE
0xD1 0x0143 # LATIN CAPITAL LETTER N WITH ACUTE
0xD2 0x0147 # LATIN CAPITAL LETTER N WITH CARON
0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE
0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
0xD5 0x0150 # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS
0xD7 0x00D7 # MULTIPLICATION SIGN
0xD8 0x0158 # LATIN CAPITAL LETTER R WITH CARON
0xD9 0x016E # LATIN CAPITAL LETTER U WITH RING ABOVE
0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE
0xDB 0x0170 # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS
0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE
0xDE 0x0162 # LATIN CAPITAL LETTER T WITH CEDILLA
0xDF 0x00DF # LATIN SMALL LETTER SHARP S
0xE0 0x0155 # LATIN SMALL LETTER R WITH ACUTE
0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE
0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX
0xE3 0x0103 # LATIN SMALL LETTER A WITH BREVE
0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS
0xE5 0x013A # LATIN SMALL LETTER L WITH ACUTE
0xE6 0x0107 # LATIN SMALL LETTER C WITH ACUTE
0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA
0xE8 0x010D # LATIN SMALL LETTER C WITH CARON
0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE
0xEA 0x0119 # LATIN SMALL LETTER E WITH OGONEK
0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS
0xEC 0x011B # LATIN SMALL LETTER E WITH CARON
0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE
0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX
0xEF 0x010F # LATIN SMALL LETTER D WITH CARON
0xF0 0x0111 # LATIN SMALL LETTER D WITH STROKE
0xF1 0x0144 # LATIN SMALL LETTER N WITH ACUTE
0xF2 0x0148 # LATIN SMALL LETTER N WITH CARON
0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE
0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX
0xF5 0x0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE
0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS
0xF7 0x00F7 # DIVISION SIGN
0xF8 0x0159 # LATIN SMALL LETTER R WITH CARON
0xF9 0x016F # LATIN SMALL LETTER U WITH RING ABOVE
0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE
0xFB 0x0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE
0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS
0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE
0xFE 0x0163 # LATIN SMALL LETTER T WITH CEDILLA
0xFF 0x02D9 # DOT ABOVE
1.1 vorbis-tools/oggenc/make_code_map.pl
Index: make_code_map.pl
===================================================================
#!/usr/bin/perl
# OggEnc
# This program is distributed under the GNU General Public License, version 2.
# A copy of this license is included with this source.
#
# Copyright © 2001, Daniel Resare <noa at metamatrix.se>
#
# This script creates a header file with charset maps from charset mapping
# files in the format published on unicode.org. The header is written to
# standard output.
# To add more encodings, simply pull the desired files from
# http://www.unicode.org/Public/ and add encoding name and file name to
# %maps
use strict;
use warnings;

my %maps = ('ISO-8859-1' => '8859-1.map',
            'ISO-8859-2' => '8859-2.map');

print <<EOF;
/* This file was automatically generated by make_code_map.pl
please don't edit directly
Daniel Resare <noa\@metamatrix.se>
*/
EOF

print("charset_map maps[] = {");
# Sorted so that the generated header is identical from one run to the next
# (plain hash iteration order is not stable).
for my $encoding (sort keys %maps) {
    my $file = $maps{$encoding};
    print("\n\t{\"" . $encoding . "\",\n\t {");
    open(my $fh, '<', $file) or die "Cannot open $file: $!\n";

    # The C code indexes the table by byte value, so place every entry at the
    # slot named in column 1 instead of relying on its position in the file.
    # Bytes the map file does not mention stay 0x0000.
    my @table = ('0x0000') x 256;
    while (my $line = <$fh>) {
        next if $line =~ /^#/;
        my @fields = split ' ', $line;
        # Ignore blank lines and anything that is not "0xXX 0xXXXX ...".
        next unless @fields >= 2
            && $fields[0] =~ /^0x[0-9A-Fa-f]{1,2}$/
            && $fields[1] =~ /^0x[0-9A-Fa-f]+$/;
        $table[hex $fields[0]] = $fields[1];
    }
    close($fh);

    # Eight values per output line, comma separated.
    for my $i (0 .. $#table) {
        print "," if $i != 0;
        print "\n\t " unless $i % 8;
        print $table[$i];
    }
    print("\n\t }\n\t},");
}
# Sentinel entry: a NULL name marks the end of maps[].
print "\n\t{NULL}\n};\n";
1.1 vorbis-tools/oggenc/utf8.c
Index: utf8.c
===================================================================
/* OggEnc
*
* This program is distributed under the GNU General Public License, version 2.
* A copy of this license is included with this source.
*
* (C) 2001 Michael Smith <msmith at labyrinth.net.au>
*
* UTF-8 Conversion routines
* Copyright (C) 2001, Daniel Resare <noa at metamatrix.se>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
/*
 * win32 placeholder for utf8_encode(): no conversion is performed.
 * Prints a notice on stderr and returns 1; *to is not touched.
 */
int utf8_encode(const char *from, char **to, const char *encoding)
{
    fputs("Sorry, not implemented currently on win32\n", stderr);
    return 1;
}
#else /* End win32. Rest is for real operating systems */
#ifdef HAVE_ICONV
#include <iconv.h>
#include <errno.h>
#endif
#include "utf8.h"
#include "charsetmap.h"
#define BUFSIZE 256
/*
 Converts the NUL-terminated string FROM from the encoding specified in
 ENCODING to UTF-8. On success *TO points to a newly malloc()ed,
 NUL-terminated string that the caller must free(). On failure *TO is left
 untouched by this function.

 When iconv is available it is tried first; if iconv does not know the
 encoding, or the result does not fit in BUFSIZE bytes, the work is handed
 to simple_utf8_encode(). Without iconv, simple_utf8_encode() is always used.

 Return values:
 0 indicates a successfully converted string.
 1 indicates that the given encoding is not available (or that the iconv
   converter could not be opened).
 2 was documented for strings bigger than BUFSIZE but is not produced by
   this function: such strings are passed to simple_utf8_encode() instead.
 3 indicates that the given string could not be converted (it could not be
   parsed, iconv reported an error, or memory ran out).
*/
int utf8_encode(char *from, char **to, const char *encoding)
{
#ifdef HAVE_ICONV
    char buffer[BUFSIZE];
    char *from_p, *to_p, *result;
    size_t from_left, to_left, out_len;
    iconv_t cd;
    int err;
#endif

    if (!strcasecmp(encoding, "UTF-8")) {
        /* ideally some checking of the given string should be done */
        size_t len = strlen(from) + 1;
        char *copy = malloc(len);

        if (copy == NULL)
            return 3;
        memcpy(copy, from, len);
        *to = copy;
        return 0;
    }

#ifdef HAVE_ICONV
    cd = iconv_open("UTF-8", encoding);
    if (cd == (iconv_t)(-1)) {
        if (errno == EINVAL) {
            /* if iconv can't encode from this encoding, try
             * simple_utf8_encode()
             */
            return simple_utf8_encode(from, to, encoding);
        }
        /* Any other failure leaves no usable descriptor: stop here
         * instead of calling iconv() with an invalid handle. */
        perror("iconv_open");
        return 1;
    }

    from_left = strlen(from);
    to_left = BUFSIZE;
    from_p = from;
    to_p = buffer;
    if (iconv(cd, &from_p, &from_left, &to_p, &to_left) == (size_t)-1) {
        /* Save errno first: iconv_close() is allowed to change it. */
        err = errno;
        iconv_close(cd);
        switch (err) {
        case E2BIG:
            /* if the buffer is too small, try simple_utf8_encode()
             */
            return simple_utf8_encode(from, to, encoding);
        case EILSEQ:
        case EINVAL:
            return 3;
        default:
            errno = err;
            perror("iconv");
            /* The buffer contents are not trustworthy after an error. */
            return 3;
        }
    }
    iconv_close(cd);

    /* Copy exactly the bytes iconv produced and terminate the copy, so a
     * completely filled buffer needs no extra byte of its own. */
    out_len = BUFSIZE - to_left;
    result = malloc(out_len + 1);
    if (result == NULL)
        return 3;
    memcpy(result, buffer, out_len);
    result[out_len] = '\0';
    *to = result;
    return 0;
#else
    return simple_utf8_encode(from, to, encoding);
#endif
}
/*
 Table-driven fallback converter: looks ENCODING up with get_map(), maps
 every byte of FROM through that table and encodes the result as UTF-8.
 On success *TO points to a malloc()ed string that the caller must free().

 This implementation has the following limitations: The given charset must
 represent each glyph with exactly one (1) byte. No multi byte or variable
 width charsets are allowed. (An exception to this is UTF-8, which
 utf8_encode() passes right through.) The glyphs in the charsets must have a
 unicode value equal to or less than 0xFFFF (this includes pretty much
 everything). For a complete, free conversion implementation please have a
 look at libiconv.

 Return values:
 0 indicates a successfully converted string.
 1 indicates that no map is known for the given encoding.
 3 indicates that memory for the conversion could not be allocated.
*/
int simple_utf8_encode(const char *from, char **to, const char *encoding)
{
    /* unsigned short is guaranteed to hold at least 16 bits, which is all
     * the 0xFFFF limit above requires */
    unsigned short *unicode;
    charset_map *map;
    size_t len, i;
    char *utf8;

    /* Look the charset up before allocating, so an unknown encoding has
     * nothing to release. */
    map = get_map(encoding);
    if (map == NULL)
        return 1;

    len = strlen(from);
    /* One extra element for the 0 terminator make_utf8_string() stops at. */
    unicode = malloc((len + 1) * sizeof *unicode);
    if (unicode == NULL)
        return 3;

    for (i = 0; i < len; i++) {
        /* Index through unsigned char: plain char may be signed, and a
         * negative index would read outside the 256-entry table. */
        unicode[i] = (unsigned short)map->mapping[(unsigned char)from[i]];
    }
    unicode[len] = 0;

    utf8 = make_utf8_string(unicode);
    free(unicode);
    if (utf8 == NULL)
        return 3;

    *to = utf8;
    return 0;
}
/*
 * Find the charset table whose name matches ENCODING, ignoring case.
 * Walks maps[] up to the entry with a NULL name; returns the matching
 * entry, or NULL when the end is reached without a match.
 */
charset_map *get_map(const char *encoding)
{
    charset_map *entry;

    for (entry = maps; entry->name != NULL; entry++) {
        if (strcasecmp(entry->name, encoding) == 0)
            return entry;
    }
    return NULL;
}
/*
 * Encode a 0-terminated array of 16-bit code units as a UTF-8 string.
 *
 * Values below 0x80 take one byte, values below 0x800 two bytes and
 * everything else three bytes.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free(),
 * or NULL if the allocation fails.
 */
char *make_utf8_string(const unsigned short *unicode)
{
    size_t size = 0, index, out_index = 0;
    unsigned char *out;
    unsigned short c;

    /* first calculate the size of the target string; the thresholds must
     * be the same ones the encoding loop below uses */
    for (index = 0; (c = unicode[index]) != 0; index++) {
        if (c < 0x0080) {
            size += 1;
        } else if (c < 0x0800) {
            size += 2;
        } else {
            size += 3;
        }
    }

    /* +1 for the terminating NUL */
    out = malloc(size + 1);
    if (out == NULL)
        return NULL;

    for (index = 0; (c = unicode[index]) != 0; index++) {
        if (c < 0x0080) {
            out[out_index++] = (unsigned char)c;
        } else if (c < 0x0800) {
            out[out_index++] = (unsigned char)(0xc0 | (c >> 6));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        } else {
            out[out_index++] = (unsigned char)(0xe0 | (c >> 12));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
    }
    out[out_index] = '\0';

    return (char *)out;
}
#endif
1.1 vorbis-tools/oggenc/utf8.h
Index: utf8.h
===================================================================
/* OggEnc
*
* This program is distributed under the GNU General Public License, version 2.
* A copy of this license is included with this source.
*
* Copyright © 2001, Daniel Resare <noa at metamatrix.se>
*/
#ifndef OGGENC_UTF8_H
#define OGGENC_UTF8_H

/*
 * Conversion table for one single-byte charset.
 * name    - charset name; get_map() compares it case-insensitively.
 *           A NULL name marks the end of the table array.
 * mapping - Unicode value for each of the 256 possible byte values,
 *           indexed by the byte.
 */
typedef struct
{
    char* name;
    int mapping[256];
} charset_map;

/* Returns the table whose name matches ENCODING, or NULL if there is none. */
charset_map *get_map(const char *encoding);

/* Encodes the 0-terminated array UNICODE as a malloc()ed UTF-8 string. */
char *make_utf8_string(const unsigned short *unicode);

/* Converts FROM to UTF-8 using the built-in table for ENCODING.
 * Returns 0 on success, 1 if the encoding has no table. */
int simple_utf8_encode(const char *from, char **to, const char *encoding);

/* Converts FROM from ENCODING to UTF-8, storing a malloc()ed string in *TO.
 * Returns 0 on success and a non-zero code on failure. */
int utf8_encode(char *from, char **to, const char *encoding);

#endif /* OGGENC_UTF8_H */
--- >8 ----
List archives: http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body. No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.
More information about the commits
mailing list