[xiph-cvs] cvs commit: vorbis-tools/share charset.h charset.c charset_test.c
Michael Smith
msmith at xiph.org
Fri Oct 19 18:11:02 PDT 2001
msmith 01/10/19 18:11:01
Modified: share charset.c charset_test.c
Added: share charset.h
Log:
Updates and bugfixes, plus extra tests, from Edmund Evans.
Revision Changes Path
1.2 +6 -12 vorbis-tools/share/charset.c
Index: charset.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/charset.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- charset.c 2001/10/02 03:03:42 1.1
+++ charset.c 2001/10/20 01:11:01 1.2
@@ -88,7 +88,7 @@
else if (c < 0xc2)
return -1;
else if (c < 0xe0) {
- if (n >= 2) {
+ if (n >= 2 && (s[1] & 0xc0) == 0x80) {
if (pwc)
*pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
return 2;
@@ -176,7 +176,7 @@
*/
struct charset {
- int min, max;
+ int max;
int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
int (*wctomb)(void *table, char *s, int wc);
void *map;
@@ -192,11 +192,6 @@
return (*charset->wctomb)(charset->map, s, wc);
}
-int charset_min(struct charset *charset)
-{
- return charset->min;
-}
-
int charset_max(struct charset *charset)
{
return charset->max;
@@ -398,21 +393,21 @@
*/
struct charset charset_utf8 = {
- 1, 6,
+ 6,
&mbtowc_utf8,
&wctomb_utf8,
0
};
struct charset charset_iso1 = {
- 1, 1,
+ 1,
&mbtowc_iso1,
&wctomb_iso1,
0
};
struct charset charset_ascii = {
- 1, 1,
+ 1,
&mbtowc_ascii,
&wctomb_ascii,
0
@@ -449,7 +444,6 @@
maps[i].charset = 0;
}
else {
- maps[i].charset->min = 1;
maps[i].charset->max = 1;
maps[i].charset->mbtowc = &mbtowc_8bit;
maps[i].charset->wctomb = &wctomb_8bit;
@@ -488,7 +482,7 @@
if (!charset1 || !charset2 )
return -1;
- tobuf = (char *)malloc((fromlen / charset1->min) * charset2->max + 1);
+ tobuf = (char *)malloc(fromlen * charset2->max + 1);
if (!tobuf)
return -2;
1.2 +37 -4 vorbis-tools/share/charset_test.c
Index: charset_test.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/charset_test.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- charset_test.c 2001/10/02 03:03:42 1.1
+++ charset_test.c 2001/10/20 01:11:01 1.2
@@ -30,13 +30,22 @@
/* Decoder */
- assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
+ assert(charset_mbtowc(charset, 0, 0, 0) == 0);
assert(charset_mbtowc(charset, 0, 0, 1) == 0);
+ assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
+
+ assert(charset_mbtowc(charset, 0, "a", 0) == 0);
+ assert(charset_mbtowc(charset, 0, "", 1) == 0);
+ assert(charset_mbtowc(charset, 0, "b", 1) == 1);
+ assert(charset_mbtowc(charset, 0, "", 2) == 0);
+ assert(charset_mbtowc(charset, 0, "c", 2) == 1);
- assert(charset_mbtowc(charset, &wc, "x", 0) == 0);
- assert(charset_mbtowc(charset, &wc, "x", 1) == 1 && wc == 'x');
- assert(charset_mbtowc(charset, &wc, "x", 2) == 1 && wc == 'x');
+ wc = 'x';
+ assert(charset_mbtowc(charset, &wc, "a", 0) == 0 && wc == 'x');
assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0);
+ assert(charset_mbtowc(charset, &wc, "b", 1) == 1 && wc == 'b');
+ assert(charset_mbtowc(charset, &wc, "", 2) == 0 && wc == 0);
+ assert(charset_mbtowc(charset, &wc, "c", 2) == 1 && wc == 'c');
/* Encoder */
@@ -80,6 +89,30 @@
wc == 1 << 26);
assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 &&
wc == 0x7fffffff);
+
+ assert(charset_mbtowc(charset, &wc, "\302\000", 2) == -1);
+ assert(charset_mbtowc(charset, &wc, "\302\300", 2) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\040\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\340\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\240\000", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\240\300", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\020\200\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\320\200\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\000\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\300\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\200\000", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\200\300", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\077\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\377\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\077\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\377\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\077\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\377\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\077", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\377", 9) == -1);
+
+ assert(charset_mbtowc(charset, &wc, "\376\277\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\377\277\277\277\277\277", 9) == -1);
/* Encoder */
strcpy(s, ".......");
1.1 vorbis-tools/share/charset.h
Index: charset.h
===================================================================
/*
* Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdlib.h>
/*
* These functions are like the C library's mbtowc() and wctomb(),
* but instead of depending on the locale they always work in UTF-8,
* and they use int instead of wchar_t.
*/
int utf8_mbtowc(int *pwc, const char *s, size_t n);
int utf8_wctomb(char *s, int wc);
/*
* This is an object-oriented version of mbtowc() and wctomb().
* The caller first uses charset_find() to get a pointer to struct
* charset, then uses the mbtowc() and wctomb() methods on it.
* The function charset_max() gives the maximum length of a
* multibyte character in that encoding.
* This API is only appropriate for stateless encodings like UTF-8
* or ISO-8859-3, but I have no intention of implementing anything
* other than UTF-8 and 8-bit encodings.
*
* MINOR BUG: If memory allocation fails, charset_find() may return 0,
* and there is no way to distinguish this case from an unknown encoding.
*/
/* Opaque handle: only forward-declared here; the layout lives in charset.c. */
struct charset;
/*
* Look up an encoding by name. Returns 0 when the encoding is unknown
* or when memory could not be allocated.
*/
struct charset *charset_find(const char *code);
/*
* Decode one character from at most N bytes at S using CHARSET's decoder.
* If PWC is non-null the decoded value is stored there. Per the test suite:
* returns 0 when S is null, N is 0 or the character is the null byte,
* -1 for an invalid sequence, otherwise the number of bytes consumed.
*/
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
/*
* Encode WC into S using CHARSET's encoder.
* NOTE(review): presumably returns the number of bytes written, as
* wctomb() does -- confirm against charset.c.
*/
int charset_wctomb(struct charset *charset, char *s, int wc);
/* Maximum length, in bytes, of one multibyte character in CHARSET. */
int charset_max(struct charset *charset);
/*
* Function to convert a buffer from one encoding to another.
* Invalid bytes are replaced by '#', and characters that are
* not available in the target encoding are replaced by '?'.
* Either of TO and TOLEN may be a null pointer if that result is not wanted.
* The input or output may contain null bytes, but the output
* buffer is also null-terminated, so it is all right to
* use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
*
* Return value:
*
* -2 : memory allocation failed
* -1 : unknown encoding
* 0 : data was converted exactly
* 1 : valid data was converted approximately (using '?')
* 2 : input was invalid (but still converted, using '#')
*/
/*
* NOTE(review): charset.c obtains the output buffer with malloc();
* presumably ownership of *TO passes to the caller, who must free() it --
* confirm against the full implementation.
*/
int charset_convert(const char *fromcode, const char *tocode,
const char *from, size_t fromlen,
char **to, size_t *tolen);
--- >8 ----
List archives: http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body. No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.
More information about the commits mailing list