00001
00002
00003
00004
00005
00006
00007
00008
00009
#ifndef NORMLZR_H
00010
#define NORMLZR_H
00011
00012
#include "unicode/utypes.h"
00013
#include "unicode/unistr.h"
00014
#include "unicode/chariter.h"
00015
#include "unicode/unorm.h"
00016
00017
struct UCharIterator;
00018
typedef struct UCharIterator UCharIterator;
00019
00020 U_NAMESPACE_BEGIN
00111 class U_COMMON_API Normalizer
00112 {
00113
public:
00119
enum {
00120 DONE=0xffff
00121 };
00122
00123
00124
00135 Normalizer(
const UnicodeString& str,
UNormalizationMode mode);
00136
00148 Normalizer(
const UChar* str, int32_t length,
UNormalizationMode mode);
00149
00160 Normalizer(
const CharacterIterator& iter,
UNormalizationMode mode);
00161
00166 Normalizer(
const Normalizer& copy);
00167
00172 ~Normalizer();
00173
00174
00175
00176
00177
00178
00196
static void normalize(
const UnicodeString& source,
00197
UNormalizationMode mode, int32_t options,
00198
UnicodeString& result,
00199 UErrorCode &status);
00200
00222
static void compose(
const UnicodeString& source,
00223 UBool compat, int32_t options,
00224
UnicodeString& result,
00225 UErrorCode &status);
00226
00249
static void decompose(
const UnicodeString& source,
00250 UBool compat, int32_t options,
00251
UnicodeString& result,
00252 UErrorCode &status);
00253
00272
static UNormalizationCheckResult
00273 quickCheck(
const UnicodeString &source,
UNormalizationMode mode, UErrorCode &status);
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
static UnicodeString &
00305 concatenate(
UnicodeString &left,
UnicodeString &right,
00306
UnicodeString &result,
00307
UNormalizationMode mode, int32_t options,
00308 UErrorCode &errorCode);
00309
00310
00311
00312
00313
00322
UChar32 current(
void);
00323
00332
UChar32 first(
void);
00333
00342
UChar32 last(
void);
00343
00352
UChar32 next(
void);
00353
00362
UChar32 previous(
void);
00363
00383
UChar32 setIndex(int32_t index);
00384
00394
void setIndexOnly(int32_t index);
00395
00401
void reset(
void);
00402
00417 int32_t getIndex(
void)
const;
00418
00427 int32_t startIndex(
void)
const;
00428
00439 int32_t endIndex(
void)
const;
00440
00449 UBool operator==(
const Normalizer& that)
const;
00450
00459
inline UBool operator!=(
const Normalizer& that)
const;
00460
00467 Normalizer* clone(
void)
const;
00468
00475 int32_t hashCode(
void)
const;
00476
00477
00478
00479
00480
00496
void setMode(
UNormalizationMode newMode);
00497
00508
UNormalizationMode getUMode(
void)
const;
00509
00526
void setOption(int32_t option,
00527 UBool value);
00528
00539 UBool getOption(int32_t option)
const;
00540
00549
void setText(
const UnicodeString& newText,
00550 UErrorCode &status);
00551
00560
void setText(
const CharacterIterator& newText,
00561 UErrorCode &status);
00562
00572
void setText(
const UChar* newText,
00573 int32_t length,
00574 UErrorCode &status);
00581
void getText(
UnicodeString& result);
00582
00583
00584
00585
00586
00591
enum {
00592 COMPAT_BIT = 1,
00593 DECOMP_BIT = 2,
00594 COMPOSE_BIT = 4,
00595 FCD_BIT = 8
00596 };
00597
00602 enum EMode {
00616 NO_OP = 0,
00617
00633 COMPOSE = COMPOSE_BIT,
00634
00650 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
00651
00667 DECOMP = DECOMP_BIT,
00668
00684 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT,
00685
00689 FCD = FCD_BIT
00690 };
00691
00693
enum {
00712 IGNORE_HANGUL = 0x001
00713 };
00714
00725 Normalizer(
const UnicodeString& str,
00726 EMode mode);
00727
00746 Normalizer(
const UnicodeString& str,
00747 EMode mode,
00748 int32_t opt);
00749
00761 Normalizer(
const UChar* str,
00762 int32_t length,
00763 EMode mode);
00764
00780 Normalizer(
const UChar* str,
00781 int32_t length,
00782 EMode mode,
00783 int32_t option);
00784
00795 Normalizer(
const CharacterIterator& iter,
00796 EMode mode);
00797
00813 Normalizer(
const CharacterIterator& iter,
00814 EMode mode,
00815 int32_t opt);
00816
00837
inline static void
00838 normalize(
const UnicodeString& source,
00839 EMode mode,
00840 int32_t options,
00841
UnicodeString& result,
00842 UErrorCode &status);
00843
00860
inline static UNormalizationCheckResult
00861 quickCheck(
const UnicodeString& source,
00862 EMode mode,
00863 UErrorCode& status);
00864
00872
inline static UNormalizationMode getUNormalizationMode(EMode mode,
00873 UErrorCode& status);
00874
00882
inline static EMode getNormalizerEMode(UNormalizationMode mode,
00883 UErrorCode& status);
00884
00911
inline void setMode(EMode newMode);
00912
00919
inline EMode getMode(
void) const;
00920
00921 private:
00922
00923
00924
00925
00926
00927
00928 UBool nextNormalize();
00929 UBool previousNormalize();
00930
00931
void init(
CharacterIterator *iter);
00932
void clearBuffer(
void);
00933
00934
00935
00936 inline static UNormalizationMode getUMode(EMode mode);
00937
00938
00939
00940
00941
00942 UNormalizationMode fUMode;
00943 int32_t fOptions;
00944
00945
00946 UCharIterator *text;
00947
00948
00949
00950 int32_t currentIndex, nextIndex;
00951
00952
00953
UnicodeString buffer;
00954 int32_t bufferPos;
00955 };
00956
00957
00958
00959
00960
00961 inline UBool
00962 Normalizer::operator!= (const Normalizer& other)
const
00963
{
return ! operator==(other); }
00964
00965
inline void
00966 Normalizer::normalize(
const UnicodeString& source,
00967 EMode mode, int32_t options,
00968
UnicodeString& result,
00969 UErrorCode &status) {
00970 normalize(source, getUNormalizationMode(mode, status), options, result, status);
00971 }
00972
00973
/**
 * EMode overload of quickCheck(): translates the mode and delegates to the
 * UNormalizationMode overload.
 *
 * @param source input text
 * @param mode   normalization mode as an EMode
 * @param status in/out error code; shared by the mode conversion and the
 *               delegated call
 * @return whatever the UNormalizationMode overload returns
 */
inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       EMode mode,
                       UErrorCode &status)
{
    const UNormalizationMode umode = getUNormalizationMode(mode, status);
    return quickCheck(source, umode, status);
}
00979
00980
inline void
00981 Normalizer::setMode(EMode newMode) {
00982 UErrorCode status =
U_ZERO_ERROR;
00983 fUMode = getUNormalizationMode(newMode, status);
00984 }
00985
00986
/**
 * Returns the stored normalization mode expressed as an EMode.
 *
 * The conversion's error code is local and is discarded; for a stored mode
 * with no EMode counterpart the result is whatever getNormalizerEMode()
 * returns in that case.
 *
 * @return the EMode corresponding to fUMode
 */
inline Normalizer::EMode
Normalizer::getMode() const
{
    UErrorCode conversionStatus = U_ZERO_ERROR;
    const EMode legacyMode = getNormalizerEMode(fUMode, conversionStatus);
    return legacyMode;
}
00991
00992 inline UNormalizationMode Normalizer::getUNormalizationMode(
00993 Normalizer::EMode mode, UErrorCode &status)
00994 {
00995
if (
U_SUCCESS(status))
00996 {
00997
switch (mode)
00998 {
00999
case Normalizer::NO_OP :
01000
return UNORM_NONE;
01001
case Normalizer::COMPOSE :
01002
return UNORM_NFC;
01003
case Normalizer::COMPOSE_COMPAT :
01004
return UNORM_NFKC;
01005
case Normalizer::DECOMP :
01006
return UNORM_NFD;
01007
case Normalizer::DECOMP_COMPAT :
01008
return UNORM_NFKD;
01009
case Normalizer::FCD:
01010
return UNORM_FCD;
01011
default :
01012 status =
U_ILLEGAL_ARGUMENT_ERROR;
01013 }
01014 }
01015
return UNORM_DEFAULT;
01016 }
01017
01018
/**
 * Converts an EMode to the corresponding UNormalizationMode without using an
 * error code.
 *
 * Mapping: NO_OP -> UNORM_NONE, COMPOSE -> UNORM_NFC,
 * COMPOSE_COMPAT -> UNORM_NFKC, DECOMP -> UNORM_NFD,
 * DECOMP_COMPAT -> UNORM_NFKD, FCD -> UNORM_FCD.
 *
 * @param mode the value to convert
 * @return the mapped mode, or UNORM_DEFAULT for any other value
 */
inline UNormalizationMode
Normalizer::getUMode(Normalizer::EMode mode)
{
    if (mode == NO_OP) {
        return UNORM_NONE;
    }
    if (mode == COMPOSE) {
        return UNORM_NFC;
    }
    if (mode == COMPOSE_COMPAT) {
        return UNORM_NFKC;
    }
    if (mode == DECOMP) {
        return UNORM_NFD;
    }
    if (mode == DECOMP_COMPAT) {
        return UNORM_NFKD;
    }
    if (mode == FCD) {
        return UNORM_FCD;
    }
    // Anything else falls back to the default mode.
    return UNORM_DEFAULT;
}
01037
01038 inline Normalizer::EMode Normalizer::getNormalizerEMode(
01039 UNormalizationMode mode, UErrorCode &status)
01040 {
01041
if (
U_SUCCESS(status))
01042 {
01043
switch (mode)
01044 {
01045
case UNORM_NONE :
01046
return Normalizer::NO_OP;
01047
case UNORM_NFD :
01048
return Normalizer::DECOMP;
01049
case UNORM_NFKD :
01050
return Normalizer::DECOMP_COMPAT;
01051
case UNORM_NFC :
01052
return Normalizer::COMPOSE;
01053
case UNORM_NFKC :
01054
return Normalizer::COMPOSE_COMPAT;
01055
case UNORM_FCD:
01056
return Normalizer::FCD;
01057
default :
01058 status =
U_ILLEGAL_ARGUMENT_ERROR;
01059 }
01060 }
01061
return Normalizer::DECOMP_COMPAT;
01062 }
01063
01064 U_NAMESPACE_END
01065
#endif // NORMLZR_H