nux-0.9.48
|
00001 /* 00002 * Copyright 2010 Inalogic® Inc. 00003 * 00004 * This program is free software: you can redistribute it and/or modify it 00005 * under the terms of the GNU Lesser General Public License, as 00006 * published by the Free Software Foundation; either version 2.1 or 3.0 00007 * of the License. 00008 * 00009 * This program is distributed in the hope that it will be useful, but 00010 * WITHOUT ANY WARRANTY; without even the implied warranties of 00011 * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR 00012 * PURPOSE. See the applicable version of the GNU Lesser General Public 00013 * License for more details. 00014 * 00015 * You should have received a copy of both the GNU Lesser General Public 00016 * License along with this program. If not, see <http://www.gnu.org/licenses/> 00017 * 00018 * Authored by: Jay Taoko <jaytaoko@inalogic.com> 00019 * 00020 */ 00021 00022 00023 #ifndef NUNI_H 00024 #define NUNI_H 00025 00026 /* 00027 * Copyright 2001-2004 Unicode, Inc. 00028 * 00029 * Disclaimer 00030 * 00031 * This source code is provided as is by Unicode, Inc. No claims are 00032 * made as to fitness for any particular purpose. No warranties of any 00033 * kind are expressed or implied. The recipient agrees to determine 00034 * applicability of information provided. If this file has been 00035 * purchased on magnetic or optical media from Unicode, Inc., the 00036 * sole remedy for any claim will be exchange of defective media 00037 * within 90 days of receipt. 00038 * 00039 * Limitations on Rights to Redistribute This Code 00040 * 00041 * Unicode, Inc. hereby grants the right to freely use the information 00042 * supplied in this file in the creation of products supporting the 00043 * Unicode Standard, and to make copies of this file in any form 00044 * for internal or external distribution as long as this notice 00045 * remains attached. 00046 */ 00047 00048 /* --------------------------------------------------------------------- 00049 00050 Conversions between UTF32, UTF-16, and UTF-8. Header file. 00051 00052 Several funtions are included here, forming a complete set of 00053 conversions between the three formats. UTF-7 is not included 00054 here, but is handled in a separate source file. 00055 00056 Each of these routines takes pointers to input buffers and output 00057 buffers. The input buffers are const. 00058 00059 Each routine converts the text between *sourceStart and sourceEnd, 00060 putting the result into the buffer between *targetStart and 00061 targetEnd. Note: the end pointers are *after* the last item: e.g. 00062 *(sourceEnd - 1) is the last item. 00063 00064 The return result indicates whether the conversion was successful, 00065 and if not, whether the problem was in the source or target buffers. 00066 (Only the first encountered problem is indicated.) 00067 00068 After the conversion, *sourceStart and *targetStart are both 00069 updated to point to the end of last text successfully converted in 00070 the respective buffers. 00071 00072 Input parameters: 00073 sourceStart - pointer to a pointer to the source buffer. 00074 The contents of this are modified on return so that 00075 it points at the next thing to be converted. 00076 targetStart - similarly, pointer to pointer to the target buffer. 00077 sourceEnd, targetEnd - respectively pointers to the ends of the 00078 two buffers, for overflow checking only. 00079 00080 These conversion functions take a ConversionFlags argument. When this 00081 flag is set to strict, both irregular sequences and isolated surrogates 00082 will cause an error. When the flag is set to lenient, both irregular 00083 sequences and isolated surrogates are converted. 00084 00085 Whether the flag is strict or lenient, all illegal sequences will cause 00086 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, 00087 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code 00088 must check for illegal sequences. 00089 00090 When the flag is set to lenient, characters over 0x10FFFF are converted 00091 to the replacement character; otherwise (when the flag is set to strict) 00092 they constitute an error. 00093 00094 Output parameters: 00095 The value "sourceIllegal" is returned from some routines if the input 00096 sequence is malformed. When "sourceIllegal" is returned, the source 00097 value will point to the illegal value that caused the problem. E.g., 00098 in UTF-8 when a sequence is malformed, it points to the start of the 00099 malformed sequence. 00100 00101 Author: Mark E. Davis, 1994. 00102 Rev History: Rick McGowan, fixes & updates May 2001. 00103 Fixes & updates, Sept 2001. 00104 00105 ------------------------------------------------------------------------ */ 00106 00107 /* --------------------------------------------------------------------- 00108 The following 4 definitions are compiler-specific. 00109 The C standard does not guarantee that wchar_t has at least 00110 16 bits, so wchar_t is no less portable than unsigned short! 00111 All should be unsigned values to avoid sign extension during 00112 bit mask & shift operations. 00113 ------------------------------------------------------------------------ */ 00114 00115 //typedef unsigned long t_UTF32; /* at least 32 bits */ 00116 //typedef unsigned short t_UTF16; /* at least 16 bits */ 00117 //typedef unsigned char t_UTF8; /* typically 8 bits */ 00118 //typedef unsigned char Boolean; /* 0 or 1 */ 00119 00120 00121 namespace nux 00122 { 00123 00124 /* Some fundamental constants */ 00125 #define UNI_REPLACEMENT_CHAR (t_UTF32)0x0000FFFD 00126 #define UNI_MAX_BMP (t_UTF32)0x0000FFFF 00127 #define UNI_MAX_UTF16 (t_UTF32)0x0010FFFF 00128 #define UNI_MAX_UTF32 (t_UTF32)0x7FFFFFFF 00129 #define UNI_MAX_LEGAL_UTF32 (t_UTF32)0x0010FFFF 00130 00131 typedef enum 00132 { 00133 conversionOK = 0, /* conversion successful */ 00134 sourceExhausted, /* partial character in source, but hit end */ 00135 targetExhausted, /* insuff. room in target for conversion */ 00136 sourceIllegal /* source sequence is illegal/malformed */ 00137 } ConversionResult; 00138 00139 typedef enum 00140 { 00141 strictConversion = 0, 00142 lenientConversion 00143 } ConversionFlags; 00144 00145 ConversionResult ConvertUTF8toUTF16 ( 00146 const t_UTF8 **sourceStart, const t_UTF8 *sourceEnd, 00147 t_UTF16 **targetStart, t_UTF16 *targetEnd, ConversionFlags flags); 00148 00149 ConversionResult ConvertUTF16toUTF8 ( 00150 const t_UTF16 **sourceStart, const t_UTF16 *sourceEnd, 00151 t_UTF8 **targetStart, t_UTF8 *targetEnd, ConversionFlags flags); 00152 00153 ConversionResult ConvertUTF8toUTF32 ( 00154 const t_UTF8 **sourceStart, const t_UTF8 *sourceEnd, 00155 t_UTF32 **targetStart, t_UTF32 *targetEnd, ConversionFlags flags); 00156 00157 ConversionResult ConvertUTF32toUTF8 ( 00158 const t_UTF32 **sourceStart, const t_UTF32 *sourceEnd, 00159 t_UTF8 **targetStart, t_UTF8 *targetEnd, ConversionFlags flags); 00160 00161 ConversionResult ConvertUTF16toUTF32 ( 00162 const t_UTF16 **sourceStart, const t_UTF16 *sourceEnd, 00163 t_UTF32 **targetStart, t_UTF32 *targetEnd, ConversionFlags flags); 00164 00165 ConversionResult ConvertUTF32toUTF16 ( 00166 const t_UTF32 **sourceStart, const t_UTF32 *sourceEnd, 00167 t_UTF16 **targetStart, t_UTF16 *targetEnd, ConversionFlags flags); 00168 00169 bool isLegalUTF8Sequence (const t_UTF8 *source, const t_UTF8 *sourceEnd); 00170 00171 00172 /* intended to work the same as g_utf8_validate */ 00173 bool tr_utf8_validate ( const char *str, int max_len, const char **end ); 00174 00175 } 00176 00177 #endif /* NUNI_H */