00001 /* Iterating through multibyte strings: macros for multi-byte encodings. 00002 Copyright (C) 2001, 2005 Free Software Foundation, Inc. 00003 00004 This program is free software; you can redistribute it and/or modify 00005 it under the terms of the GNU Lesser General Public License as published by 00006 the Free Software Foundation; either version 2.1, or (at your option) 00007 any later version. 00008 00009 This program is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 GNU Lesser General Public License for more details. 00013 00014 You should have received a copy of the GNU Lesser General Public License 00015 along with this program; if not, write to the Free Software Foundation, 00016 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 00017 00018 /* Written by Bruno Haible <bruno@clisp.org>. */ 00019 00020 /* The macros in this file implement forward iteration through a 00021 multi-byte string, without knowing its length a-priori. 00022 00023 With these macros, an iteration loop that looks like 00024 00025 char *iter; 00026 for (iter = buf; *iter != '\0'; iter++) 00027 { 00028 do_something (*iter); 00029 } 00030 00031 becomes 00032 00033 mbui_iterator_t iter; 00034 for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter)) 00035 { 00036 do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter))); 00037 } 00038 00039 The benefit of these macros over plain use of mbrtowc is: 00040 - Handling of invalid multibyte sequences is possible without 00041 making the code more complicated, while still preserving the 00042 invalid multibyte sequences. 00043 00044 Compared to mbiter.h, the macros here don't need to know the string's 00045 length a-priori. The downside is that at each step, the look-ahead 00046 that guards against overrunning the terminating '\0' is more expensive. 
00047 The mbui_* macros are therefore suitable when there is a high probability 00048 that only the first few multibyte characters need to be inspected. 00049 Whereas the mbi_* macros are better if usually the iteration runs 00050 through the entire string. 00051 00052 mbui_iterator_t 00053 is a type usable for variable declarations. 00054 00055 mbui_init (iter, startptr) 00056 initializes the iterator, starting at startptr. 00057 00058 mbui_avail (iter) 00059 returns true if there are more multibyte characters available before 00060 the end of string is reached. In this case, mbui_cur (iter) is 00061 initialized to the next multibyte character. 00062 00063 mbui_advance (iter) 00064 advances the iterator by one multibyte character. 00065 00066 mbui_cur (iter) 00067 returns the current multibyte character, of type mbchar_t. All the 00068 macros defined in mbchar.h can be used on it. 00069 00070 mbui_cur_ptr (iter) 00071 returns a pointer to the beginning of the current multibyte character. 00072 00073 mbui_reloc (iter, ptrdiff) 00074 relocates iterator when the string is moved by ptrdiff bytes. 00075 00076 Here are the function prototypes of the macros. 00077 00078 extern void mbui_init (mbui_iterator_t iter, const char *startptr); 00079 extern bool mbui_avail (mbui_iterator_t iter); 00080 extern void mbui_advance (mbui_iterator_t iter); 00081 extern mbchar_t mbui_cur (mbui_iterator_t iter); 00082 extern const char * mbui_cur_ptr (mbui_iterator_t iter); 00083 extern void mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff); 00084 */ 00085 00086 #ifndef _MBUITER_H 00087 #define _MBUITER_H 1 00088 00089 #include <assert.h> 00090 #include <stdbool.h> 00091 #include <stdlib.h> 00092 00093 /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before 00094 <wchar.h>. 00095 BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before 00096 <wchar.h>. 
*/ 00097 #include <stdio.h> 00098 #include <time.h> 00099 #include <wchar.h> 00100 00101 #include "mbchar.h" 00102 #include "strnlen1.h" 00103 00104 struct mbuiter_multi 00105 { 00106 bool in_shift; /* true if next byte may not be interpreted as ASCII */ 00107 mbstate_t state; /* if in_shift: current shift state */ 00108 bool next_done; /* true if mbui_avail has already filled the following */ 00109 struct mbchar cur; /* the current character: 00110 const char *cur.ptr pointer to current character 00111 The following are only valid after mbui_avail. 00112 size_t cur.bytes number of bytes of current character 00113 bool cur.wc_valid true if wc is a valid wide character 00114 wchar_t cur.wc if wc_valid: the current character 00115 */ 00116 }; 00117 00118 static inline void 00119 mbuiter_multi_next (struct mbuiter_multi *iter) 00120 { 00121 if (iter->next_done) 00122 return; 00123 if (iter->in_shift) 00124 goto with_shift; 00125 /* Handle most ASCII characters quickly, without calling mbrtowc(). */ 00126 if (is_basic (*iter->cur.ptr)) 00127 { 00128 /* These characters are part of the basic character set. ISO C 99 00129 guarantees that their wide character code is identical to their 00130 char code. */ 00131 iter->cur.bytes = 1; 00132 iter->cur.wc = *iter->cur.ptr; 00133 iter->cur.wc_valid = true; 00134 } 00135 else 00136 { 00137 assert (mbsinit (&iter->state)); 00138 iter->in_shift = true; 00139 with_shift: 00140 iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, 00141 strnlen1 (iter->cur.ptr, MB_CUR_MAX), 00142 &iter->state); 00143 if (iter->cur.bytes == (size_t) -1) 00144 { 00145 /* An invalid multibyte sequence was encountered. */ 00146 iter->cur.bytes = 1; 00147 iter->cur.wc_valid = false; 00148 /* Whether to set iter->in_shift = false and reset iter->state 00149 or not is not very important; the string is bogus anyway. */ 00150 } 00151 else if (iter->cur.bytes == (size_t) -2) 00152 { 00153 /* An incomplete multibyte character at the end. 
*/ 00154 iter->cur.bytes = strlen (iter->cur.ptr); 00155 iter->cur.wc_valid = false; 00156 /* Whether to set iter->in_shift = false and reset iter->state 00157 or not is not important; the string end is reached anyway. */ 00158 } 00159 else 00160 { 00161 if (iter->cur.bytes == 0) 00162 { 00163 /* A null wide character was encountered. */ 00164 iter->cur.bytes = 1; 00165 assert (*iter->cur.ptr == '\0'); 00166 assert (iter->cur.wc == 0); 00167 } 00168 iter->cur.wc_valid = true; 00169 00170 /* When in the initial state, we can go back treating ASCII 00171 characters more quickly. */ 00172 if (mbsinit (&iter->state)) 00173 iter->in_shift = false; 00174 } 00175 } 00176 iter->next_done = true; 00177 } 00178 00179 static inline void 00180 mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff) 00181 { 00182 iter->cur.ptr += ptrdiff; 00183 } 00184 00185 /* Iteration macros. */ 00186 typedef struct mbuiter_multi mbui_iterator_t; 00187 #define mbui_init(iter, startptr) \ 00188 ((iter).cur.ptr = (startptr), \ 00189 (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \ 00190 (iter).next_done = false) 00191 #define mbui_avail(iter) \ 00192 (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur)) 00193 #define mbui_advance(iter) \ 00194 ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false) 00195 00196 /* Access to the current character. */ 00197 #define mbui_cur(iter) (iter).cur 00198 #define mbui_cur_ptr(iter) (iter).cur.ptr 00199 00200 /* Relocation. */ 00201 #define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&iter, ptrdiff) 00202 00203 #endif /* _MBUITER_H */