mbchar.h

00001 /* Multibyte character data type.
00002    Copyright (C) 2001, 2005 Free Software Foundation, Inc.
00003 
00004    This program is free software; you can redistribute it and/or modify
00005    it under the terms of the GNU Lesser General Public License as published by
00006    the Free Software Foundation; either version 2.1, or (at your option)
00007    any later version.
00008 
00009    This program is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012    GNU Lesser General Public License for more details.
00013 
00014    You should have received a copy of the GNU Lesser General Public License
00015    along with this program; if not, write to the Free Software Foundation,
00016    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
00017 
00018 /* Written by Bruno Haible <bruno@clisp.org>.  */
00019 
00020 /* A multibyte character is a short subsequence of a char* string,
00021    representing a single wide character.
00022 
00023    We use multibyte characters instead of wide characters because of
00024    the following goals:
00025    1) correct multibyte handling, i.e. operate according to the LC_CTYPE
00026       locale,
00027    2) ease of maintenance, i.e. the maintainer need not know all details
00028       of the ISO C 99 standard,
00029    3) don't fail grossly if the input is not in the encoding set by the
00030       locale, because often different encodings are in use in the same
00031       countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
00032    4) fast in the case of ASCII characters,
00033    5) portability, i.e. don't make unportable assumptions about wchar_t.
00034 
00035    Multibyte characters are only accessed through the mb* macros.
00036 
00037    mb_ptr (mbc)
00038      returns a pointer to the beginning of the multibyte sequence.
00039 
00040    mb_len (mbc)
00041      returns the number of bytes occupied by the multibyte sequence.
00042      Always > 0.
00043 
00044    mb_iseq (mbc, sc)
00045      returns true if mbc is the standard ASCII character sc.
00046 
00047    mb_isnul (mbc)
00048      returns true if mbc is the nul character.
00049 
00050    mb_cmp (mbc1, mbc2)
00051      returns a positive, zero, or negative value depending on whether mbc1
00052      sorts after, same or before mbc2.
00053 
00054    mb_casecmp (mbc1, mbc2)
00055      returns a positive, zero, or negative value depending on whether mbc1
00056      sorts after, same or before mbc2, modulo upper/lowercase conversion.
00057 
00058    mb_equal (mbc1, mbc2)
00059      returns true if mbc1 and mbc2 are equal.
00060 
00061    mb_caseequal (mbc1, mbc2)
00062      returns true if mbc1 and mbc2 are equal modulo upper/lowercase conversion.
00063 
00064    mb_isalnum (mbc)
00065      returns true if mbc is alphanumeric.
00066 
00067    mb_isalpha (mbc)
00068      returns true if mbc is alphabetic.
00069 
00070    mb_isascii (mbc)
00071      returns true if mbc is plain ASCII.
00072 
00073    mb_isblank (mbc)
00074      returns true if mbc is a blank.
00075 
00076    mb_iscntrl (mbc)
00077      returns true if mbc is a control character.
00078 
00079    mb_isdigit (mbc)
00080      returns true if mbc is a decimal digit.
00081 
00082    mb_isgraph (mbc)
00083      returns true if mbc is a graphic character.
00084 
00085    mb_islower (mbc)
00086      returns true if mbc is lowercase.
00087 
00088    mb_isprint (mbc)
00089      returns true if mbc is a printable character.
00090 
00091    mb_ispunct (mbc)
00092      returns true if mbc is a punctuation character.
00093 
00094    mb_isspace (mbc)
00095      returns true if mbc is a space character.
00096 
00097    mb_isupper (mbc)
00098      returns true if mbc is uppercase.
00099 
00100    mb_isxdigit (mbc)
00101      returns true if mbc is a hexadecimal digit.
00102 
00103    mb_width (mbc)
00104      returns the number of columns on the output device occupied by mbc.
00105      Always >= 0.
00106 
00107    mb_putc (mbc, stream)
00108      outputs mbc on stream, a byte oriented FILE stream opened for output.
00109 
00110    mb_setascii (&mbc, sc)
00111      assigns the standard ASCII character sc to mbc.
00112 
00113    mb_copy (&destmbc, &srcmbc)
00114      copies srcmbc to destmbc.
00115 
00116    Here are the function prototypes of the macros.
00117 
00118    extern const char *  mb_ptr (const mbchar_t mbc);
00119    extern size_t        mb_len (const mbchar_t mbc);
00120    extern bool          mb_iseq (const mbchar_t mbc, char sc);
00121    extern bool          mb_isnul (const mbchar_t mbc);
00122    extern int           mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2);
00123    extern int           mb_casecmp (const mbchar_t mbc1, const mbchar_t mbc2);
00124    extern bool          mb_equal (const mbchar_t mbc1, const mbchar_t mbc2);
00125    extern bool          mb_caseequal (const mbchar_t mbc1, const mbchar_t mbc2);
00126    extern bool          mb_isalnum (const mbchar_t mbc);
00127    extern bool          mb_isalpha (const mbchar_t mbc);
00128    extern bool          mb_isascii (const mbchar_t mbc);
00129    extern bool          mb_isblank (const mbchar_t mbc);
00130    extern bool          mb_iscntrl (const mbchar_t mbc);
00131    extern bool          mb_isdigit (const mbchar_t mbc);
00132    extern bool          mb_isgraph (const mbchar_t mbc);
00133    extern bool          mb_islower (const mbchar_t mbc);
00134    extern bool          mb_isprint (const mbchar_t mbc);
00135    extern bool          mb_ispunct (const mbchar_t mbc);
00136    extern bool          mb_isspace (const mbchar_t mbc);
00137    extern bool          mb_isupper (const mbchar_t mbc);
00138    extern bool          mb_isxdigit (const mbchar_t mbc);
00139    extern int           mb_width (const mbchar_t mbc);
00140    extern void          mb_putc (const mbchar_t mbc, FILE *stream);
00141    extern void          mb_setascii (mbchar_t *new, char sc);
00142    extern void          mb_copy (mbchar_t *new, const mbchar_t *old);
00143  */
00144 
00145 #ifndef _MBCHAR_H
00146 #define _MBCHAR_H 1
00147 
00148 #include <stdbool.h>
00149 #include <string.h>
00150 
00151 /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
00152    <wchar.h>.
00153    BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
00154    <wchar.h>.  */
00155 #include <stdio.h>
00156 #include <time.h>
00157 #include <wchar.h>
00158 
00159 #include <wctype.h>
00160 
/* Capacity, in bytes, of the storage embedded in every struct mbchar.  */
#define MBCHAR_BUF_SIZE 24

/* One multibyte character: the bytes it occupies plus, when they could be
   decoded, the corresponding wide character.

   End of file is not a real character; it is represented by
   bytes == 0 together with wc_valid == false.  */
typedef struct mbchar
{
  /* Address of the first byte of the character.  */
  const char *ptr;
  /* Length of the character in bytes; > 0 for every real character.  */
  size_t bytes;
  /* Whether wc below holds a valid wide character.  */
  bool wc_valid;
  /* The wide character; meaningful only while wc_valid is true.  */
  wchar_t wc;
  /* Room for the bytes themselves (used for file input); ptr refers to
     this array when the character carries its own bytes.  */
  char buf[MBCHAR_BUF_SIZE];
} mbchar_t;
00176 
/* Raw access to the current character: mb_len gives the number of bytes
   it occupies, mb_ptr the address of its first byte.  */
#define mb_len(ch) ((ch).bytes)
#define mb_ptr(ch) ((ch).ptr)
00180 
/* Comparison of a character against a fixed value.
   mb_isnul tests for the nul character, mb_iseq for the character ASCII.
   Both yield false when the wide character value is not valid.  */
#define mb_isnul(ch) ((ch).wc_valid && (ch).wc == 0)
#define mb_iseq(ch, ascii) ((ch).wc_valid && (ch).wc == (ascii))
/* mb_cmp (mbc1, mbc2) yields a negative, zero, or positive int depending on
   whether mbc1 sorts before, the same as, or after mbc2.

   - Two characters with valid wide character values are ordered by those
     values.  The values are compared in their own type rather than
     subtracted after a cast to int: a subtraction can overflow (or be
     truncated) when wchar_t is as wide as int, which would yield a result
     with the wrong sign.  The outcome in this case is -1, 0 or 1.
   - A valid character sorts before an invalid one.
   - Two invalid characters are ordered by their bytes with memcmp; when one
     byte sequence is a prefix of the other, the shorter one sorts first.

   This is a macro: each argument is evaluated several times.  */
#define mb_cmp(mbc1, mbc2) \
  ((mbc1).wc_valid                                                      \
   ? ((mbc2).wc_valid                                                   \
      ? ((mbc1).wc > (mbc2).wc) - ((mbc1).wc < (mbc2).wc)               \
      : -1)                                                             \
   : ((mbc2).wc_valid                                                   \
      ? 1                                                               \
      : (mbc1).bytes == (mbc2).bytes                                    \
        ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes)                 \
        : (mbc1).bytes < (mbc2).bytes                                   \
          ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
          : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
/* mb_casecmp (mbc1, mbc2) is like mb_cmp, but two characters with valid
   wide character values are ordered by their towlower () mappings, i.e.
   modulo upper/lowercase conversion.

   The lowercase values are compared as wchar_t rather than subtracted
   after a cast to int: a subtraction can overflow (or be truncated) when
   wchar_t is as wide as int, which would yield a result with the wrong
   sign.  The outcome in this case is -1, 0 or 1.

   A valid character sorts before an invalid one.  Two invalid characters
   are ordered by their bytes with memcmp; when one byte sequence is a
   prefix of the other, the shorter one sorts first.

   This is a macro: each argument is evaluated several times.  */
#define mb_casecmp(mbc1, mbc2) \
  ((mbc1).wc_valid                                                      \
   ? ((mbc2).wc_valid                                                   \
      ? (((wchar_t) towlower ((mbc1).wc) > (wchar_t) towlower ((mbc2).wc)) \
         - ((wchar_t) towlower ((mbc1).wc) < (wchar_t) towlower ((mbc2).wc))) \
      : -1)                                                             \
   : ((mbc2).wc_valid                                                   \
      ? 1                                                               \
      : (mbc1).bytes == (mbc2).bytes                                    \
        ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes)                 \
        : (mbc1).bytes < (mbc2).bytes                                   \
          ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
          : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
/* mb_equal (lhs, rhs) is true if the two characters are equal.
   When both have a valid wide character value, those values decide;
   otherwise the characters are equal exactly when they consist of the
   same number of identical bytes.  */
#define mb_equal(lhs, rhs) \
  (((lhs).wc_valid && (rhs).wc_valid)                                   \
   ? ((lhs).wc == (rhs).wc)                                             \
   : ((lhs).bytes == (rhs).bytes                                        \
      && memcmp ((lhs).ptr, (rhs).ptr, (lhs).bytes) == 0))
/* mb_caseequal (lhs, rhs) is true if the two characters are equal modulo
   upper/lowercase conversion.
   When both have a valid wide character value, their towlower () mappings
   decide; otherwise the characters are equal exactly when they consist of
   the same number of identical bytes.  */
#define mb_caseequal(lhs, rhs) \
  (((lhs).wc_valid && (rhs).wc_valid)                                   \
   ? (towlower ((lhs).wc) == towlower ((rhs).wc))                       \
   : ((lhs).bytes == (rhs).bytes                                        \
      && memcmp ((lhs).ptr, (rhs).ptr, (lhs).bytes) == 0))
00218 
/* Character classification in the manner of <ctype.h> and <wctype.h>.
   Every predicate is false for a character without a valid wide character
   value; otherwise the matching isw* function (or, for mb_isascii, the
   range 0..127) decides.  */
#define mb_isalnum(ch) ((ch).wc_valid && iswalnum ((ch).wc))
#define mb_isalpha(ch) ((ch).wc_valid && iswalpha ((ch).wc))
#define mb_isascii(ch) \
  ((ch).wc_valid && (ch).wc >= 0 && (ch).wc <= 127)
#define mb_isblank(ch) ((ch).wc_valid && iswblank ((ch).wc))
#define mb_iscntrl(ch) ((ch).wc_valid && iswcntrl ((ch).wc))
#define mb_isdigit(ch) ((ch).wc_valid && iswdigit ((ch).wc))
#define mb_isgraph(ch) ((ch).wc_valid && iswgraph ((ch).wc))
#define mb_islower(ch) ((ch).wc_valid && iswlower ((ch).wc))
#define mb_isprint(ch) ((ch).wc_valid && iswprint ((ch).wc))
#define mb_ispunct(ch) ((ch).wc_valid && iswpunct ((ch).wc))
#define mb_isspace(ch) ((ch).wc_valid && iswspace ((ch).wc))
#define mb_isupper(ch) ((ch).wc_valid && iswupper ((ch).wc))
#define mb_isxdigit(ch) ((ch).wc_valid && iswxdigit ((ch).wc))
00234 
/* Column width, an extra in the manner of <wchar.h>.  */

/* Width assumed for an unprintable character: it is shown as a small box
   occupying one column.  */
#define MB_UNPRINTABLE_WIDTH 1

/* Return the number of columns needed for the wide character WC.
   The result of wcwidth () is used when it is not negative.  Otherwise
   the character is unprintable, and the result is arbitrarily 0 for a
   control character and MB_UNPRINTABLE_WIDTH for anything else.  */
static inline int
mb_width_aux (wint_t wc)
{
  int columns = wcwidth (wc);

  if (columns >= 0)
    return columns;
  if (iswcntrl (wc))
    return 0;
  return MB_UNPRINTABLE_WIDTH;
}

/* mb_width (ch) is the number of columns occupied by ch, always >= 0.
   A character without a valid wide character value counts as
   unprintable.  */
#define mb_width(ch) \
  (!(ch).wc_valid ? MB_UNPRINTABLE_WIDTH : mb_width_aux ((ch).wc))
00251 
/* Output.
   mb_putc (ch, out) writes the bytes of ch to the stream out with a single
   fwrite () call, whose result is the value of the expression.  */
#define mb_putc(ch, out)  fwrite ((ch).ptr, 1, (ch).bytes, (out))
00254 
/* Assignment.
   mb_setascii (dst, ascii) makes *dst the one-byte character ascii: the
   byte is stored in the character's own buffer, ptr is pointed at that
   buffer, and the wide character value is set from the stored byte and
   marked valid.  */
#define mb_setascii(dst, ascii) \
  ((dst)->wc_valid = true, (dst)->bytes = 1, (dst)->ptr = (dst)->buf, \
   (dst)->wc = (dst)->buf[0] = (ascii))
00259 
00260 /* Copying a character.  */
00261 static inline void
00262 mb_copy (mbchar_t *new, const mbchar_t *old)
00263 {
00264   if (old->ptr == &old->buf[0])
00265     {
00266       memcpy (&new->buf[0], &old->buf[0], old->bytes);
00267       new->ptr = &new->buf[0];
00268     }
00269   else
00270     new->ptr = old->ptr;
00271   new->bytes = old->bytes;
00272   if ((new->wc_valid = old->wc_valid))
00273     new->wc = old->wc;
00274 }
00275 
00276 
00277 /* is_basic(c) tests whether the single-byte character c is in the
00278    ISO C "basic character set".
00279    This is a convenience function, and is in this file only to share code
00280    between mbiter_multi.h and mbfile_multi.h.  */
00281 #if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
00282     && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
00283     && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
00284     && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
00285     && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
00286     && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
00287     && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
00288     && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
00289     && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
00290     && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
00291     && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
00292     && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
00293     && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
00294     && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
00295     && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
00296     && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
00297     && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
00298     && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
00299     && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
00300     && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
00301     && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
00302     && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
00303     && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)
00304 /* The character set is ISO-646, not EBCDIC. */
00305 # define IS_BASIC_ASCII 1
00306 
00307 extern unsigned int is_basic_table[];
00308 
00309 static inline bool
00310 is_basic (char c)
00311 {
00312   return (is_basic_table [(unsigned char) c >> 5] >> ((unsigned char) c & 31))
00313          & 1;
00314 }
00315 
00316 #else
00317 
/* Return true if the single-byte character C is in the ISO C "basic
   character set".  This variant names every member as a character
   constant, so it does not depend on the numeric values of the
   characters.  */
static inline bool
is_basic (char c)
{
  bool basic = false;

  switch (c)
    {
    /* White space.  */
    case ' ': case '\t': case '\v': case '\f':
    /* Decimal digits.  */
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    /* Uppercase letters.  */
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    /* Lowercase letters.  */
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z':
    /* Punctuation.  */
    case '!': case '"': case '#': case '%': case '&': case '\'':
    case '(': case ')': case '*': case '+': case ',': case '-':
    case '.': case '/': case ':': case ';': case '<': case '=':
    case '>': case '?': case '[': case '\\': case ']': case '^':
    case '_': case '{': case '|': case '}': case '~':
      basic = true;
      break;
    default:
      break;
    }

  return basic;
}
00349 
00350 #endif
00351 
00352 #endif /* _MBCHAR_H */

Generated on Thu Jan 24 16:50:54 2008 for WvStreams by  doxygen 1.5.4