[xiph-cvs] cvs commit: vorbis-tools/share charset.h charset.c charset_test.c

Fri Oct 19 18:11:02 PDT 2001

msmith      01/10/19 18:11:01

  Modified:    share    charset.c charset_test.c
  Added:       share    charset.h
  Log:
  Updates and bugfixes, plus extra tests, from Edmund Evans.

Revision  Changes    Path
1.2       +6 -12     vorbis-tools/share/charset.c

Index: charset.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/charset.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2

--- charset.c	2001/10/02 03:03:42	1.1
+++ charset.c	2001/10/20 01:11:01	1.2
@@ -88,7 +88,7 @@
   else if (c < 0xc2)
     return -1;
   else if (c < 0xe0) {
-    if (n >= 2) {
+    if (n >= 2 && (s[1] & 0xc0) == 0x80) {
       if (pwc)
         *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
       return 2;
@@ -176,7 +176,7 @@
  */
 
 struct charset {
-  int min, max;
+  int max;
   int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
   int (*wctomb)(void *table, char *s, int wc);
   void *map;
@@ -192,11 +192,6 @@
   return (*charset->wctomb)(charset->map, s, wc);
 }
 
-int charset_min(struct charset *charset)
-{
-  return charset->min;
-}
-
 int charset_max(struct charset *charset)
 {
   return charset->max;
@@ -398,21 +393,21 @@
  */
 
 struct charset charset_utf8 = {
-  1, 6,
+  6,
   &mbtowc_utf8,
   &wctomb_utf8,
   0
 };
 
 struct charset charset_iso1 = {
-  1, 1,
+  1,
   &mbtowc_iso1,
   &wctomb_iso1,
   0
 };
 
 struct charset charset_ascii = {
-  1, 1,
+  1,
   &mbtowc_ascii,
   &wctomb_ascii,
   0
@@ -449,7 +444,6 @@
             maps[i].charset = 0;
           }
           else {
-	    maps[i].charset->min = 1;
             maps[i].charset->max = 1;
             maps[i].charset->mbtowc = &mbtowc_8bit;
             maps[i].charset->wctomb = &wctomb_8bit;
@@ -488,7 +482,7 @@
   if (!charset1 || !charset2 )
     return -1;
 
-  tobuf = (char *)malloc((fromlen / charset1->min) * charset2->max + 1);
+  tobuf = (char *)malloc(fromlen * charset2->max + 1);
   if (!tobuf)
     return -2;
 

1.2       +37 -4     vorbis-tools/share/charset_test.c

Index: charset_test.c
===================================================================
RCS file: /usr/local/cvsroot/vorbis-tools/share/charset_test.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- charset_test.c	2001/10/02 03:03:42	1.1
+++ charset_test.c	2001/10/20 01:11:01	1.2
@@ -30,13 +30,22 @@
 
   /* Decoder */
 
-  assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
+  assert(charset_mbtowc(charset, 0, 0, 0) == 0);
   assert(charset_mbtowc(charset, 0, 0, 1) == 0);
+  assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
+
+  assert(charset_mbtowc(charset, 0, "a", 0) == 0);
+  assert(charset_mbtowc(charset, 0, "", 1) == 0);
+  assert(charset_mbtowc(charset, 0, "b", 1) == 1);
+  assert(charset_mbtowc(charset, 0, "", 2) == 0);
+  assert(charset_mbtowc(charset, 0, "c", 2) == 1);
 
-  assert(charset_mbtowc(charset, &wc, "x", 0) == 0);
-  assert(charset_mbtowc(charset, &wc, "x", 1) == 1 && wc == 'x');
-  assert(charset_mbtowc(charset, &wc, "x", 2) == 1 && wc == 'x');
+  wc = 'x';
+  assert(charset_mbtowc(charset, &wc, "a", 0) == 0 && wc == 'x');
   assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0);
+  assert(charset_mbtowc(charset, &wc, "b", 1) == 1 && wc == 'b');
+  assert(charset_mbtowc(charset, &wc, "", 2) == 0 && wc == 0);
+  assert(charset_mbtowc(charset, &wc, "c", 2) == 1 && wc == 'c');
 
   /* Encoder */
 
@@ -80,6 +89,30 @@
          wc == 1 << 26);
   assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 &&
          wc == 0x7fffffff);
+
+  assert(charset_mbtowc(charset, &wc, "\302\000", 2) == -1);
+  assert(charset_mbtowc(charset, &wc, "\302\300", 2) == -1);
+  assert(charset_mbtowc(charset, &wc, "\340\040\200", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\340\340\200", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\340\240\000", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\340\240\300", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\360\020\200\200", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\360\320\200\200", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\360\220\000\200", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\360\220\300\200", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\360\220\200\000", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\360\220\200\300", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\077\277\277\277\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\377\277\277\277\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\277\077\277\277\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\277\377\277\277\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\277\277\277\077\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\277\277\277\377\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\077", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\377", 9) == -1);
+
+  assert(charset_mbtowc(charset, &wc, "\376\277\277\277\277\277", 9) == -1);
+  assert(charset_mbtowc(charset, &wc, "\377\277\277\277\277\277", 9) == -1);
 
   /* Encoder */
   strcpy(s, ".......");

1.1                  vorbis-tools/share/charset.h

Index: charset.h
===================================================================
/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo at rano.org>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdlib.h>

/*
 * These functions are like the C library's mbtowc() and wctomb(),
 * but instead of depending on the locale they always work in UTF-8,
 * and they use int instead of wchar_t.
 *
 * utf8_mbtowc: decode one character from the first N bytes at S and
 *   store its code in *PWC.
 *   NOTE(review): by analogy with mbtowc() the result is presumably
 *   the number of bytes consumed, 0 for a null character and -1 for
 *   an invalid sequence, and PWC may presumably be a null pointer
 *   when the value is not wanted -- confirm against charset.c.
 *
 * utf8_wctomb: encode the character WC as UTF-8 into the buffer S.
 *   NOTE(review): by analogy with wctomb() the result is presumably
 *   the number of bytes written, or -1 if WC cannot be encoded; the
 *   longest UTF-8 sequence handled here is 6 bytes, so S should have
 *   room for at least that many -- confirm against charset.c.
 */

int utf8_mbtowc(int *pwc, const char *s, size_t n);
int utf8_wctomb(char *s, int wc);

/*
 * This is an object-oriented version of mbtowc() and wctomb().
 * The caller first uses charset_find() to get a pointer to struct
 * charset, then uses the mbtowc() and wctomb() methods on it.
 * The function charset_max() gives the maximum length of a
 * multibyte character in that encoding.
 * This API is only appropriate for stateless encodings like UTF-8
 * or ISO-8859-3, but I have no intention of implementing anything
 * other than UTF-8 and 8-bit encodings.
 *
 * MINOR BUG: If memory allocation fails, charset_find() may return 0,
 * and there is no way to distinguish this case from an unknown
 * encoding.
 */

/* Opaque to callers: the definition lives in charset.c. */
struct charset;

/*
 * Look up an encoding by name (CODE).  Returns 0 if the encoding is
 * unknown or, as noted above, if memory could not be allocated.
 */
struct charset *charset_find(const char *code);

/*
 * charset_mbtowc: decode one character of the given encoding from the
 *   first N bytes at S into *PWC.
 * charset_wctomb: encode WC in the given encoding into the buffer S.
 * charset_max: the maximum number of bytes in one multibyte character
 *   (6 for UTF-8, 1 for ISO-8859-1, ASCII and the 8-bit encodings).
 *
 * NOTE(review): the return values of the first two presumably follow
 * the mbtowc()/wctomb() conventions (length, 0 for a null character
 * when decoding, -1 on error) -- confirm against charset.c.
 */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
int charset_wctomb(struct charset *charset, char *s, int wc);
int charset_max(struct charset *charset);

/*
 * Function to convert a buffer from one encoding to another.
 * Invalid bytes are replaced by '#', and characters that are
 * not available in the target encoding are replaced by '?'.
 * Each of TO and TOLEN may be zero if the result is not wanted.
 * The input or output may contain null bytes, but the output
 * buffer is also null-terminated, so it is all right to
 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 *
 * Parameters:
 *
 *  FROMCODE, TOCODE : names of the source and target encodings
 *  FROM, FROMLEN    : input buffer and its length in bytes
 *  TO               : if non-zero, receives a pointer to the output
 *  TOLEN            : if non-zero, receives the output length
 *
 * NOTE(review): the output buffer is obtained with malloc() in
 * charset.c, so the caller presumably owns *TO and must free() it;
 * TOLEN presumably excludes the terminating null -- confirm.
 *
 * Return value:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : valid data was converted approximately (using '?')
 *   2 : input was invalid (but still converted, using '#')
 */

int charset_convert(const char *fromcode, const char *tocode,
                    const char *from, size_t fromlen,
                    char **to, size_t *tolen);

--- >8 ----
List archives:  http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body.  No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.