Main Page | Class Hierarchy | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

utf16.h

Go to the documentation of this file.
/*
*******************************************************************************
*
*   Copyright (C) 1999-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/

/*
 * Macros for reading, writing and stepping over UTF-16 code units,
 * including surrogate pairs.
 *
 * Some macros below expand to UTF_IS_SURROGATE(), UTF_IS_UNICODE_CHAR(),
 * UTF_ERROR_VALUE or UChar. None of these is defined in this header; they
 * must be visible at the point where such a macro is used.
 *
 * Parameter conventions used throughout:
 *   s       array of 16-bit code units
 *   i       offset into s; an lvalue that most macros modify
 *   c       lvalue that receives (or expression that supplies) a code point
 *   start   lowest valid offset, length is the limit (one past the last unit)
 *   strict  if non-zero, irregular input is replaced by UTF_ERROR_VALUE
 *
 * The multi-statement macros expand to a brace-enclosed block, not to a
 * single expression. Arguments may be evaluated more than once, so do not
 * pass expressions with side effects. Because the expansion is a plain
 * { ... } block, an invocation followed by ';' directly between an unbraced
 * "if" and its "else" does not compile; brace the "if" body in that case.
 */
#ifndef __UTF16_H__
#define __UTF16_H__

/* single-code point definitions -------------------------------------------- */

/* handle surrogate pairs */

/* True if uchar is in 0xd800..0xdbff (first/lead surrogate). */
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)

/* True if uchar is in 0xdc00..0xdfff (second/trail surrogate). */
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)

/*
 * For a value c that is already known to be a surrogate:
 * bit 0x400 is clear for a first surrogate and set for a second one.
 */
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)

/* get the UTF-32 value directly from the surrogate pseudo-characters */
/* Chosen so that the pair (0xd800, 0xdc00) maps to code point 0x10000. */
#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

/* Combine a first and a second surrogate into one supplementary code point. */
#define UTF16_GET_PAIR_VALUE(first, second) \
    (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)

/* get the first and second surrogates for a supplementary code point */

/*
 * First surrogate for a supplementary code point (0x10000..0x10ffff).
 * 0xd7c0 is 0xd800-(0x10000>>10).
 */
#define UTF_FIRST_SURROGATE(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))

/* Second surrogate for a supplementary code point: low 10 bits plus 0xdc00. */
#define UTF_SECOND_SURROGATE(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))

/* Alias for UTF_FIRST_SURROGATE(). */
#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary)

/* Alias for UTF_SECOND_SURROGATE(). */
#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary)

/* classes of code unit values */
#define UTF16_IS_SINGLE(uchar) (!UTF_IS_SURROGATE(uchar))
#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)

/* number of code units per code point */
#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
#define UTF16_MAX_CHAR_LENGTH 2

/* average number of code units compared to UTF-16 */
#define UTF16_ARRAY_SIZE(size) (size)

/*
 * Get a single code point from an offset that points to any
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 *
 * This could be used for iteration together with
 * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
 * but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
 * UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
 *
 * The unsafe version reads the neighboring code unit of any surrogate
 * without checking bounds or that it is the matching kind of surrogate.
 */
#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
        } else { \
            (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
        } \
    } \
}

/*
 * Checked version of UTF16_GET_CHAR_UNSAFE(): neighbors are only read
 * inside start..length-1. An unmatched surrogate is left in c unchanged
 * unless strict is non-zero, in which case c becomes UTF_ERROR_VALUE.
 * Does not modify i.
 */
#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
                (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
            } else if(strict) {\
                /* unmatched first surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } else { \
            if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
                (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
            } else if(strict) {\
                /* unmatched second surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        (c)=UTF_ERROR_VALUE; \
    } \
}

/* definitions with forward iteration --------------------------------------- */

/*
 * all the macros that go forward assume that
 * the initial offset is 0<=i<length;
 * they update the offset
 */

/* fast versions, no error-checking */

/*
 * Get a single code point from an offset that points to the first
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 */
#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
    } \
}

/*
 * Write code point c at s[i] as one or two code units and advance i.
 * No check of the buffer limit or of the range of c.
 */
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}

/* Advance i by one code point: two units after a first surrogate, else one. */
#define UTF16_FWD_1_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
        ++(i); \
    } \
}

/* Advance i by n code points; does nothing if n<=0. */
#define UTF16_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        UTF16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/*
 * Set a random-access offset and adjust it so that
 * it points to the beginning of a Unicode character.
 * The offset that is passed in points to
 * any code unit of a code point
 * and will point to the first code unit after
 * the macro invocation.
 * Never increments the offset.
 */
#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
        --(i); \
    } \
}

/* safe versions with error-checking and optional regularity-checking */

/*
 * Like UTF16_NEXT_CHAR_UNSAFE(), but the second code unit is only read
 * and consumed if i<length and it is a second surrogate. With strict
 * non-zero, unmatched surrogates and values rejected by
 * UTF_IS_UNICODE_CHAR() yield UTF_ERROR_VALUE.
 */
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        uint16_t __c2; \
        if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
        } else if(strict) {\
            /* unmatched first surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        /* unmatched second surrogate or other non-character */ \
        (c)=UTF_ERROR_VALUE; \
    } \
}

/*
 * Like UTF16_APPEND_CHAR_UNSAFE(), but writes a single UTF_ERROR_VALUE
 * unit if c>0x10ffff or if a surrogate pair would not fit below length.
 * The space for a single unit at s[i] is not checked: the caller must
 * ensure i<length.
 */
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff) { \
        if((i)+1<(length)) { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } else /* not enough space */ { \
            (s)[(i)++]=UTF_ERROR_VALUE; \
        } \
    } else /* c>0x10ffff, write error value */ { \
        (s)[(i)++]=UTF_ERROR_VALUE; \
    } \
}

/* Advance i by one code point; a pair is skipped only if complete and in bounds. */
#define UTF16_FWD_1_SAFE(s, i, length) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
}

/* Advance i by up to n code points, stopping when i reaches length. */
#define UTF16_FWD_N_SAFE(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        UTF16_FWD_1_SAFE(s, i, length); \
        --__N; \
    } \
}

/* Move i back to the first surrogate only if s[i-1], s[i] form a pair and i>start. */
#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}

/* definitions with backward iteration -------------------------------------- */

/*
 * all the macros that go backward assume that
 * the valid buffer range starts at offset 0
 * and that the initial offset is 0<i<=length;
 * they update the offset
 */

/* fast versions, no error-checking */

/*
 * Get a single code point from an offset that points behind the last
 * of the code units that belong to that code point.
 * Assume 0<i<=length (the macro reads s[i-1] first).
 */
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
    } \
}

/* Move i back by one code point: two units before a second surrogate, else one. */
#define UTF16_BACK_1_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
        --(i); \
    } \
}

/* Move i back by n code points; does nothing if n<=0. */
#define UTF16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        UTF16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/*
 * Set a random-access offset and adjust it so that
 * it points after the end of a Unicode character.
 * The offset that is passed in points behind
 * any code unit of a code point
 * and will point behind the last code unit after
 * the macro invocation.
 * Never decrements the offset.
 */
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        ++(i); \
    } \
}

/* safe versions with error-checking and optional regularity-checking */

/*
 * Like UTF16_PREV_CHAR_UNSAFE(), but the preceding code unit is only read
 * and consumed if i>start and it is a first surrogate. With strict
 * non-zero, unmatched surrogates and values rejected by
 * UTF_IS_UNICODE_CHAR() yield UTF_ERROR_VALUE.
 */
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        uint16_t __c2; \
        if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
        } else if(strict) {\
            /* unmatched second surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        /* unmatched first surrogate or other non-character */ \
        (c)=UTF_ERROR_VALUE; \
    } \
}

/* Move i back by one code point; a pair is skipped only if complete and above start. */
#define UTF16_BACK_1_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}

/* Move i back by up to n code points, stopping when i reaches start. */
#define UTF16_BACK_N_SAFE(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        UTF16_BACK_1_SAFE(s, start, i); \
        --__N; \
    } \
}

/* Advance i past the second surrogate only if start<i<length and s[i-1], s[i] form a pair. */
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
}

#endif

Generated on Fri Aug 13 09:53:50 2004 for ICU 2.1 by doxygen 1.3.7