Main Page | Class Hierarchy | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

uchar.h File Reference

C API: Unicode Char. More...

#include "unicode/utypes.h"

Go to the source code of this file.

Defines

#define U_UNICODE_VERSION   "3.1.1"
 Unicode version number, default for the current ICU version.

#define UCHAR_MIN_VALUE   0
 The lowest Unicode code point value.

#define UCHAR_MAX_VALUE   0x10ffff
 The highest Unicode code point value (scalar value) according to The Unicode Standard.

#define U_MASK(x)   ((uint32_t)1<<(x))
 Get a single-bit bit set (a flag) from a bit number 0..31.

#define U_GC_CN_MASK   U_MASK(U_GENERAL_OTHER_TYPES)
 U_GC_XX_MASK constants are bit flags corresponding to Unicode general category values.

#define U_GC_LU_MASK   U_MASK(U_UPPERCASE_LETTER)
#define U_GC_LL_MASK   U_MASK(U_LOWERCASE_LETTER)
#define U_GC_LT_MASK   U_MASK(U_TITLECASE_LETTER)
#define U_GC_LM_MASK   U_MASK(U_MODIFIER_LETTER)
#define U_GC_LO_MASK   U_MASK(U_OTHER_LETTER)
#define U_GC_MN_MASK   U_MASK(U_NON_SPACING_MARK)
#define U_GC_ME_MASK   U_MASK(U_ENCLOSING_MARK)
#define U_GC_MC_MASK   U_MASK(U_COMBINING_SPACING_MARK)
#define U_GC_ND_MASK   U_MASK(U_DECIMAL_DIGIT_NUMBER)
#define U_GC_NL_MASK   U_MASK(U_LETTER_NUMBER)
#define U_GC_NO_MASK   U_MASK(U_OTHER_NUMBER)
#define U_GC_ZS_MASK   U_MASK(U_SPACE_SEPARATOR)
#define U_GC_ZL_MASK   U_MASK(U_LINE_SEPARATOR)
#define U_GC_ZP_MASK   U_MASK(U_PARAGRAPH_SEPARATOR)
#define U_GC_CC_MASK   U_MASK(U_CONTROL_CHAR)
#define U_GC_CF_MASK   U_MASK(U_FORMAT_CHAR)
#define U_GC_CO_MASK   U_MASK(U_PRIVATE_USE_CHAR)
#define U_GC_CS_MASK   U_MASK(U_SURROGATE)
#define U_GC_PD_MASK   U_MASK(U_DASH_PUNCTUATION)
#define U_GC_PS_MASK   U_MASK(U_START_PUNCTUATION)
#define U_GC_PE_MASK   U_MASK(U_END_PUNCTUATION)
#define U_GC_PC_MASK   U_MASK(U_CONNECTOR_PUNCTUATION)
#define U_GC_PO_MASK   U_MASK(U_OTHER_PUNCTUATION)
#define U_GC_SM_MASK   U_MASK(U_MATH_SYMBOL)
#define U_GC_SC_MASK   U_MASK(U_CURRENCY_SYMBOL)
#define U_GC_SK_MASK   U_MASK(U_MODIFIER_SYMBOL)
#define U_GC_SO_MASK   U_MASK(U_OTHER_SYMBOL)
#define U_GC_PI_MASK   U_MASK(U_INITIAL_PUNCTUATION)
#define U_GC_PF_MASK   U_MASK(U_FINAL_PUNCTUATION)
#define U_GC_L_MASK   (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK)
#define U_GC_M_MASK   (U_GC_MN_MASK|U_GC_ME_MASK|U_GC_MC_MASK)
#define U_GC_N_MASK   (U_GC_ND_MASK|U_GC_NL_MASK|U_GC_NO_MASK)
#define U_GC_Z_MASK   (U_GC_ZS_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK)
#define U_GC_C_MASK   (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK)
#define U_GC_P_MASK
#define U_GC_S_MASK   (U_GC_SM_MASK|U_GC_SC_MASK|U_GC_SK_MASK|U_GC_SO_MASK)
#define U_GET_GC_MASK(c)   U_MASK(u_charType(c))
 Get a single-bit bit set for the general category of a character.

#define U_FOLD_CASE_DEFAULT   0
 Option value for case folding: use all mappings defined in CaseFolding.txt.

#define U_FOLD_CASE_EXCLUDE_SPECIAL_I   1
 Option value for case folding: exclude the mappings for dotted I and dotless i marked with 'I' in CaseFolding.txt.

#define u_charScript   ublock_getCode

Typedefs

typedef enum UProperty UProperty
 ICU 2.1

typedef enum UCharCategory UCharCategory
typedef enum UCharDirection UCharDirection
typedef enum UBlockCode UBlockCode
 ICU 2.0

typedef enum UCellWidth UCellWidth
 

typedef enum UCharNameChoice UCharNameChoice
 

typedef UBool U_CALLCONV UCharEnumTypeRange (const void *context, UChar32 start, UChar32 limit, UCharCategory type)
 Callback from u_enumCharTypes(), is called for each contiguous range of code points c (where start<=c<limit) with the same Unicode general category ("character type").

typedef UBool UEnumCharNamesFn (void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length)
 Type of a callback function for u_enumCharNames() that gets called for each Unicode character with the code point value and the character name.

typedef UBlockCode UCharScript

Enumerations

enum  UProperty {
  UCHAR_ALPHABETIC, UCHAR_BINARY_START = UCHAR_ALPHABETIC, UCHAR_ASCII_HEX_DIGIT, UCHAR_BIDI_CONTROL,
  UCHAR_BIDI_MIRRORED, UCHAR_DASH, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, UCHAR_DEPRECATED,
  UCHAR_DIACRITIC, UCHAR_EXTENDER, UCHAR_FULL_COMPOSITION_EXCLUSION, UCHAR_GRAPHEME_BASE,
  UCHAR_GRAPHEME_EXTEND, UCHAR_GRAPHEME_LINK, UCHAR_HEX_DIGIT, UCHAR_HYPHEN,
  UCHAR_ID_CONTINUE, UCHAR_ID_START, UCHAR_IDEOGRAPHIC, UCHAR_IDS_BINARY_OPERATOR,
  UCHAR_IDS_TRINARY_OPERATOR, UCHAR_JOIN_CONTROL, UCHAR_LOGICAL_ORDER_EXCEPTION, UCHAR_LOWERCASE,
  UCHAR_MATH, UCHAR_NONCHARACTER_CODE_POINT, UCHAR_QUOTATION_MARK, UCHAR_RADICAL,
  UCHAR_SOFT_DOTTED, UCHAR_TERMINAL_PUNCTUATION, UCHAR_UNIFIED_IDEOGRAPH, UCHAR_UPPERCASE,
  UCHAR_WHITE_SPACE, UCHAR_XID_CONTINUE, UCHAR_XID_START, UCHAR_BINARY_LIMIT
}
 Selection constants for Unicode properties. More...

enum  UCharCategory {
  U_UNASSIGNED = 0, U_GENERAL_OTHER_TYPES = 0, U_UPPERCASE_LETTER = 1, U_LOWERCASE_LETTER = 2,
  U_TITLECASE_LETTER = 3, U_MODIFIER_LETTER = 4, U_OTHER_LETTER = 5, U_NON_SPACING_MARK = 6,
  U_ENCLOSING_MARK = 7, U_COMBINING_SPACING_MARK = 8, U_DECIMAL_DIGIT_NUMBER = 9, U_LETTER_NUMBER = 10,
  U_OTHER_NUMBER = 11, U_SPACE_SEPARATOR = 12, U_LINE_SEPARATOR = 13, U_PARAGRAPH_SEPARATOR = 14,
  U_CONTROL_CHAR = 15, U_FORMAT_CHAR = 16, U_PRIVATE_USE_CHAR = 17, U_SURROGATE = 18,
  U_DASH_PUNCTUATION = 19, U_START_PUNCTUATION = 20, U_END_PUNCTUATION = 21, U_CONNECTOR_PUNCTUATION = 22,
  U_OTHER_PUNCTUATION = 23, U_MATH_SYMBOL = 24, U_CURRENCY_SYMBOL = 25, U_MODIFIER_SYMBOL = 26,
  U_OTHER_SYMBOL = 27, U_INITIAL_PUNCTUATION = 28, U_FINAL_PUNCTUATION = 29, U_CHAR_CATEGORY_COUNT
}
 Data for enumerated Unicode general category types. More...

enum  UCharDirection {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}
 This specifies the language directional property of a character set. More...

enum  UBlockCode {
  UBLOCK_BASIC_LATIN = 1, U_BASIC_LATIN = 1, UBLOCK_LATIN_1_SUPPLEMENT = 2, U_LATIN_1_SUPPLEMENT = 2,
  UBLOCK_LATIN_EXTENDED_A = 3, U_LATIN_EXTENDED_A = 3, UBLOCK_LATIN_EXTENDED_B = 4, U_LATIN_EXTENDED_B = 4,
  UBLOCK_IPA_EXTENSIONS = 5, U_IPA_EXTENSIONS = 5, UBLOCK_SPACING_MODIFIER_LETTERS = 6, U_SPACING_MODIFIER_LETTERS = 6,
  UBLOCK_COMBINING_DIACRITICAL_MARKS = 7, U_COMBINING_DIACRITICAL_MARKS = 7, UBLOCK_GREEK = 8, U_GREEK = 8,
  UBLOCK_CYRILLIC = 9, U_CYRILLIC = 9, UBLOCK_ARMENIAN = 10, U_ARMENIAN = 10,
  UBLOCK_HEBREW = 11, U_HEBREW = 11, UBLOCK_ARABIC = 12, U_ARABIC = 12,
  UBLOCK_SYRIAC = 13, U_SYRIAC = 13, UBLOCK_THAANA = 14, U_THAANA = 14,
  UBLOCK_DEVANAGARI = 15, U_DEVANAGARI = 15, UBLOCK_BENGALI = 16, U_BENGALI = 16,
  UBLOCK_GURMUKHI = 17, U_GURMUKHI = 17, UBLOCK_GUJARATI = 18, U_GUJARATI = 18,
  UBLOCK_ORIYA = 19, U_ORIYA = 19, UBLOCK_TAMIL = 20, U_TAMIL = 20,
  UBLOCK_TELUGU = 21, U_TELUGU = 21, UBLOCK_KANNADA = 22, U_KANNADA = 22,
  UBLOCK_MALAYALAM = 23, U_MALAYALAM = 23, UBLOCK_SINHALA = 24, U_SINHALA = 24,
  UBLOCK_THAI = 25, U_THAI = 25, UBLOCK_LAO = 26, U_LAO = 26,
  UBLOCK_TIBETAN = 27, U_TIBETAN = 27, UBLOCK_MYANMAR = 28, U_MYANMAR = 28,
  UBLOCK_GEORGIAN = 29, U_GEORGIAN = 29, UBLOCK_HANGUL_JAMO = 30, U_HANGUL_JAMO = 30,
  UBLOCK_ETHIOPIC = 31, U_ETHIOPIC = 31, UBLOCK_CHEROKEE = 32, U_CHEROKEE = 32,
  UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, U_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, UBLOCK_OGHAM = 34, U_OGHAM = 34,
  UBLOCK_RUNIC = 35, U_RUNIC = 35, UBLOCK_KHMER = 36, U_KHMER = 36,
  UBLOCK_MONGOLIAN = 37, U_MONGOLIAN = 37, UBLOCK_LATIN_EXTENDED_ADDITIONAL = 38, U_LATIN_EXTENDED_ADDITIONAL = 38,
  UBLOCK_GREEK_EXTENDED = 39, U_GREEK_EXTENDED = 39, UBLOCK_GENERAL_PUNCTUATION = 40, U_GENERAL_PUNCTUATION = 40,
  UBLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, U_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, UBLOCK_CURRENCY_SYMBOLS = 42, U_CURRENCY_SYMBOLS = 42,
  UBLOCK_COMBINING_MARKS_FOR_SYMBOLS = 43, U_COMBINING_MARKS_FOR_SYMBOLS = 43, UBLOCK_LETTERLIKE_SYMBOLS = 44, U_LETTERLIKE_SYMBOLS = 44,
  UBLOCK_NUMBER_FORMS = 45, U_NUMBER_FORMS = 45, UBLOCK_ARROWS = 46, U_ARROWS = 46,
  UBLOCK_MATHEMATICAL_OPERATORS = 47, U_MATHEMATICAL_OPERATORS = 47, UBLOCK_MISCELLANEOUS_TECHNICAL = 48, U_MISCELLANEOUS_TECHNICAL = 48,
  UBLOCK_CONTROL_PICTURES = 49, U_CONTROL_PICTURES = 49, UBLOCK_OPTICAL_CHARACTER_RECOGNITION = 50, U_OPTICAL_CHARACTER_RECOGNITION = 50,
  UBLOCK_ENCLOSED_ALPHANUMERICS = 51, U_ENCLOSED_ALPHANUMERICS = 51, UBLOCK_BOX_DRAWING = 52, U_BOX_DRAWING = 52,
  UBLOCK_BLOCK_ELEMENTS = 53, U_BLOCK_ELEMENTS = 53, UBLOCK_GEOMETRIC_SHAPES = 54, U_GEOMETRIC_SHAPES = 54,
  UBLOCK_MISCELLANEOUS_SYMBOLS = 55, U_MISCELLANEOUS_SYMBOLS = 55, UBLOCK_DINGBATS = 56, U_DINGBATS = 56,
  UBLOCK_BRAILLE_PATTERNS = 57, U_BRAILLE_PATTERNS = 57, UBLOCK_CJK_RADICALS_SUPPLEMENT = 58, U_CJK_RADICALS_SUPPLEMENT = 58,
  UBLOCK_KANGXI_RADICALS = 59, U_KANGXI_RADICALS = 59, UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, U_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
  UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION = 61, U_CJK_SYMBOLS_AND_PUNCTUATION = 61, UBLOCK_HIRAGANA = 62, U_HIRAGANA = 62,
  UBLOCK_KATAKANA = 63, U_KATAKANA = 63, UBLOCK_BOPOMOFO = 64, U_BOPOMOFO = 64,
  UBLOCK_HANGUL_COMPATIBILITY_JAMO = 65, U_HANGUL_COMPATIBILITY_JAMO = 65, UBLOCK_KANBUN = 66, U_KANBUN = 66,
  UBLOCK_BOPOMOFO_EXTENDED = 67, U_BOPOMOFO_EXTENDED = 67, UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, U_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
  UBLOCK_CJK_COMPATIBILITY = 69, U_CJK_COMPATIBILITY = 69, UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, U_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
  UBLOCK_CJK_UNIFIED_IDEOGRAPHS = 71, U_CJK_UNIFIED_IDEOGRAPHS = 71, UBLOCK_YI_SYLLABLES = 72, U_YI_SYLLABLES = 72,
  UBLOCK_YI_RADICALS = 73, U_YI_RADICALS = 73, UBLOCK_HANGUL_SYLLABLES = 74, U_HANGUL_SYLLABLES = 74,
  UBLOCK_HIGH_SURROGATES = 75, U_HIGH_SURROGATES = 75, UBLOCK_HIGH_PRIVATE_USE_SURROGATES = 76, U_HIGH_PRIVATE_USE_SURROGATES = 76,
  UBLOCK_LOW_SURROGATES = 77, U_LOW_SURROGATES = 77, UBLOCK_PRIVATE_USE = 78, UBLOCK_PRIVATE_USE_AREA = UBLOCK_PRIVATE_USE,
  U_PRIVATE_USE_AREA = 78, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS = 79, U_CJK_COMPATIBILITY_IDEOGRAPHS = 79, UBLOCK_ALPHABETIC_PRESENTATION_FORMS = 80,
  U_ALPHABETIC_PRESENTATION_FORMS = 80, UBLOCK_ARABIC_PRESENTATION_FORMS_A = 81, U_ARABIC_PRESENTATION_FORMS_A = 81, UBLOCK_COMBINING_HALF_MARKS = 82,
  U_COMBINING_HALF_MARKS = 82, UBLOCK_CJK_COMPATIBILITY_FORMS = 83, U_CJK_COMPATIBILITY_FORMS = 83, UBLOCK_SMALL_FORM_VARIANTS = 84,
  U_SMALL_FORM_VARIANTS = 84, UBLOCK_ARABIC_PRESENTATION_FORMS_B = 85, U_ARABIC_PRESENTATION_FORMS_B = 85, UBLOCK_SPECIALS = 86,
  U_SPECIALS = 86, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, U_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, UBLOCK_OLD_ITALIC = 88,
  UBLOCK_GOTHIC = 89, UBLOCK_DESERET = 90, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS = 91, UBLOCK_MUSICAL_SYMBOLS = 92,
  UBLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, UBLOCK_TAGS = 96,
  UBLOCK_COUNT = 97, U_SCRIPT_COUNT = UBLOCK_COUNT, UBLOCK_INVALID_CODE = -1, U_CHAR_SCRIPT_COUNT = UBLOCK_COUNT,
  U_NO_SCRIPT = UBLOCK_COUNT
}
 Constants for Unicode blocks, generated from the Unicode data file Blocks.txt. These are the same values as Unicode::EUnicodeScript. ICU 2.0. More...

enum  UCellWidth {
  U_ZERO_WIDTH = 0, U_HALF_WIDTH = 1, U_FULL_WIDTH = 2, U_NEUTRAL_WIDTH = 3,
  U_CELL_WIDTH_COUNT
}
 Values returned by the u_charCellWidth() function. More...

enum  UCharNameChoice { U_UNICODE_CHAR_NAME, U_UNICODE_10_CHAR_NAME, U_EXTENDED_CHAR_NAME, U_CHAR_NAME_CHOICE_COUNT }
 Selector constants for u_charName(). More...


Functions

U_CAPI UBool U_EXPORT2 u_hasBinaryProperty (UChar32 c, UProperty which)
 Check a binary Unicode property for a code point.

U_CAPI UBool U_EXPORT2 u_isUAlphabetic (UChar32 c)
 Check if a code point has the Alphabetic Unicode property.

U_CAPI UBool U_EXPORT2 u_isULowercase (UChar32 c)
 Check if a code point has the Lowercase Unicode property.

U_CAPI UBool U_EXPORT2 u_isUUppercase (UChar32 c)
 Check if a code point has the Uppercase Unicode property.

U_CAPI UBool U_EXPORT2 u_isUWhiteSpace (UChar32 c)
 Check if a code point has the White_Space Unicode property.

U_CAPI UBool U_EXPORT2 u_islower (UChar32 c)
 Determines whether the specified UChar is a lowercase character according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isupper (UChar32 c)
 Determines whether the specified character is an uppercase character according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_istitle (UChar32 c)
 Determines whether the specified character is a titlecase character according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isdigit (UChar32 c)
 Determines whether the specified character is a digit according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isalnum (UChar32 c)
 Determines whether the specified character is an alphanumeric character (letter or digit) according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isdefined (UChar32 c)
 Determines whether the specified numeric value is actually a defined character according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isalpha (UChar32 c)
 Determines whether the specified character is a letter according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isspace (UChar32 c)
 Determines if the specified character is a space character or not.

U_CAPI UBool U_EXPORT2 u_isWhitespace (UChar32 c)
 Determines if the specified character is white space according to ICU.

U_CAPI UBool U_EXPORT2 u_iscntrl (UChar32 c)
 Determines whether the specified character is a control character or not.

U_CAPI UBool U_EXPORT2 u_isprint (UChar32 c)
 Determines whether the specified character is a printable character according to UnicodeData.txt.

U_CAPI UBool U_EXPORT2 u_isbase (UChar32 c)
 Determines whether the specified character is of the base form according to UnicodeData.txt.

U_CAPI UCharDirection U_EXPORT2 u_charDirection (UChar32 c)
 Returns the linguistic direction property of a character.

U_CAPI UBool U_EXPORT2 u_isMirrored (UChar32 c)
 Determines whether the character has the "mirrored" property.

U_CAPI UChar32 U_EXPORT2 u_charMirror (UChar32 c)
 Maps the specified character to a "mirror-image" character.

U_CAPI uint16_t U_EXPORT2 u_charCellWidth (UChar32 c)
 Returns a value indicating the display-cell width of the character when used in Asian text, according to the Unicode standard.

U_CAPI int8_t U_EXPORT2 u_charType (UChar32 c)
 Returns a value indicating a character category.

U_CAPI void U_EXPORT2 u_enumCharTypes (UCharEnumTypeRange *enumRange, const void *context)
 Enumerate efficiently all code points with their Unicode general categories.

U_CAPI uint8_t U_EXPORT2 u_getCombiningClass (UChar32 c)
 Returns the combining class of the code point as specified in UnicodeData.txt.

U_CAPI int32_t U_EXPORT2 u_charDigitValue (UChar32 c)
 Retrieves the decimal numeric value of a digit character.

U_CAPI UBlockCode U_EXPORT2 ublock_getCode (UChar32 ch)
 Returns the Unicode allocation block that contains the character.

U_CAPI int32_t U_EXPORT2 u_charName (UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode)
 Retrieve the name of a Unicode character.

U_CAPI UChar32 U_EXPORT2 u_charFromName (UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode)
 Find a Unicode character by its name and return its code point value.

U_CAPI void U_EXPORT2 u_enumCharNames (UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode)
 Enumerate all assigned Unicode characters between the start and limit code points (start inclusive, limit exclusive) and call a function for each, passing the code point value and the character name.

U_CAPI UBool U_EXPORT2 u_isIDStart (UChar32 c)
 A convenience method for determining if a Unicode character is allowed as the first character of a Unicode identifier.

U_CAPI UBool U_EXPORT2 u_isIDPart (UChar32 c)
 A convenience method for determining if a Unicode character may be part of a Unicode identifier other than the starting character.

U_CAPI UBool U_EXPORT2 u_isIDIgnorable (UChar32 c)
 A convenience method for determining if a Unicode character should be regarded as an ignorable character in a Unicode identifier.

U_CAPI UBool U_EXPORT2 u_isJavaIDStart (UChar32 c)
 A convenience method for determining if a Unicode character is allowed as the first character in a Java identifier.

U_CAPI UBool U_EXPORT2 u_isJavaIDPart (UChar32 c)
 A convenience method for determining if a Unicode character may be part of a Java identifier other than the starting character.

U_CAPI UChar32 U_EXPORT2 u_tolower (UChar32 c)
 The given character is mapped to its lowercase equivalent according to UnicodeData.txt; if the character has no lowercase equivalent, the character itself is returned.

U_CAPI UChar32 U_EXPORT2 u_toupper (UChar32 c)
 The given character is mapped to its uppercase equivalent according to UnicodeData.txt; if the character has no uppercase equivalent, the character itself is returned.

U_CAPI UChar32 U_EXPORT2 u_totitle (UChar32 c)
 The given character is mapped to its titlecase equivalent according to UnicodeData.txt.

U_CAPI UChar32 U_EXPORT2 u_foldCase (UChar32 c, uint32_t options)
 The given character is mapped to its case folding equivalent according to UnicodeData.txt and CaseFolding.txt; if the character has no case folding equivalent, the character itself is returned.

U_CAPI int32_t U_EXPORT2 u_digit (UChar32 ch, int8_t radix)
 Returns the numeric value of the character ch in the specified radix.

U_CAPI UChar32 U_EXPORT2 u_forDigit (int32_t digit, int8_t radix)
 Determines the character representation for a specific digit in the specified radix.

U_CAPI void U_EXPORT2 u_charAge (UChar32 c, UVersionInfo versionArray)
 Get the "age" of the code point.

U_CAPI void U_EXPORT2 u_getUnicodeVersion (UVersionInfo info)
 Gets the Unicode version information.


Detailed Description

C API: Unicode Char.

Unicode C API

The Unicode C API allows you to query the properties associated with individual Unicode character values.

The Unicode character information, provided implicitly by the Unicode character encoding standard, includes information about the script (for example, symbols or control characters) to which the character belongs, as well as semantic information such as whether a character is a digit or uppercase, lowercase, or uncased.


Define Documentation

#define u_charScript   ublock_getCode
 

Deprecated:
Use ublock_getCode instead. Remove after Aug, 2002

#define U_FOLD_CASE_DEFAULT   0
 

Option value for case folding: use all mappings defined in CaseFolding.txt.

ICU 1.8

#define U_FOLD_CASE_EXCLUDE_SPECIAL_I   1
 

Option value for case folding: exclude the mappings for dotted I and dotless i marked with 'I' in CaseFolding.txt.

ICU 1.8

#define U_GC_CN_MASK   U_MASK(U_GENERAL_OTHER_TYPES)
 

U_GC_XX_MASK constants are bit flags corresponding to Unicode general category values.

For each category, the nth bit is set if the numeric value of the corresponding UCharCategory constant is n.

There are also some U_GC_Y_MASK constants for groups of general categories like L& for all letter categories.

See also:
u_charType

U_GET_GC_MASK

UCharCategory ICU 2.1

#define U_GC_P_MASK
 

Value:

(U_GC_PD_MASK|U_GC_PS_MASK|U_GC_PE_MASK|U_GC_PC_MASK|U_GC_PO_MASK|U_GC_PI_MASK|U_GC_PF_MASK)

#define U_GET_GC_MASK(c)   U_MASK(u_charType(c))
 

Get a single-bit bit set for the general category of a character.

This bit set can be compared bitwise with U_GC_SM_MASK, U_GC_L_MASK, etc. Same as U_MASK(u_charType(c)).

See also:
u_charType

UCharCategory

U_GC_CN_MASK ICU 2.1

#define U_MASK(x)   ((uint32_t)1<<(x))
 

Get a single-bit bit set (a flag) from a bit number 0..31.

ICU 2.1

#define U_UNICODE_VERSION   "3.1.1"
 

Unicode version number, default for the current ICU version.

The actual Unicode Character Database (UCD) data is stored in uprops.dat and may be generated from UCD files from a different Unicode version. Call u_getUnicodeVersion to get the actual Unicode version of the data.

See also:
u_getUnicodeVersion

#define UCHAR_MAX_VALUE   0x10ffff
 

The highest Unicode code point value (scalar value) according to The Unicode Standard.

This is a 21-bit value (20.1 bits, rounded up). For a single character, UChar32 is a simple type that can hold any code point value.

#define UCHAR_MIN_VALUE   0
 

The lowest Unicode code point value.

Code points are non-negative.


Typedef Documentation

typedef UBool U_CALLCONV UCharEnumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type)
 

Callback from u_enumCharTypes(), is called for each contiguous range of code points c (where start<=c<limit) with the same Unicode general category ("character type").

The callback function can stop the enumeration by returning FALSE.

Parameters:
context an opaque pointer, as passed into u_enumCharTypes()
start the first code point in a contiguous range with the same general category
limit one past the last code point in a contiguous range with the same general category
type the general category for all code points in [start..limit[
Returns:
FALSE to stop the enumeration
ICU 2.1
See also:
UCharCategory

u_enumCharTypes

typedef UBlockCode UCharScript
 

Deprecated:
Use the enum UBlockCode instead. Remove after Aug, 2002

typedef UBool UEnumCharNamesFn(void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length)
 

Type of a callback function for u_enumCharNames() that gets called for each Unicode character with the code point value and the character name.

If such a function returns FALSE, then the enumeration is stopped.

Parameters:
context The context pointer that was passed to u_enumCharNames().
code The Unicode code point for the character with this name.
nameChoice Selector for which kind of names is enumerated.
name The character's name, zero-terminated.
length The length of the name.
Returns:
TRUE if the enumeration should continue, FALSE to stop it.
See also:
UCharNameChoice

u_enumCharNames


Enumeration Type Documentation

enum UBlockCode
 

Constants for Unicode blocks, generated from the Unicode data file Blocks.txt. These are the same values as Unicode::EUnicodeScript. ICU 2.0.

Enumeration values:
UBLOCK_BASIC_LATIN  ICU 2.0
U_BASIC_LATIN 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LATIN_1_SUPPLEMENT  ICU 2.0
U_LATIN_1_SUPPLEMENT 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LATIN_EXTENDED_A  ICU 2.0
U_LATIN_EXTENDED_A 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LATIN_EXTENDED_B  ICU 2.0
U_LATIN_EXTENDED_B 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_IPA_EXTENSIONS  ICU 2.0
U_IPA_EXTENSIONS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_SPACING_MODIFIER_LETTERS  ICU 2.0
U_SPACING_MODIFIER_LETTERS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_COMBINING_DIACRITICAL_MARKS  ICU 2.0
U_COMBINING_DIACRITICAL_MARKS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GREEK  ICU 2.0
U_GREEK 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CYRILLIC  ICU 2.0
U_CYRILLIC 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ARMENIAN  ICU 2.0
U_ARMENIAN 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HEBREW  ICU 2.0
U_HEBREW 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ARABIC  ICU 2.0
U_ARABIC 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_SYRIAC  ICU 2.0
U_SYRIAC 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_THAANA  ICU 2.0
U_THAANA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_DEVANAGARI  ICU 2.0
U_DEVANAGARI 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_BENGALI  ICU 2.0
U_BENGALI 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GURMUKHI  ICU 2.0
U_GURMUKHI 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GUJARATI  ICU 2.0
U_GUJARATI 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ORIYA  ICU 2.0
U_ORIYA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_TAMIL  ICU 2.0
U_TAMIL 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_TELUGU  ICU 2.0
U_TELUGU 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_KANNADA  ICU 2.0
U_KANNADA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_MALAYALAM  ICU 2.0
U_MALAYALAM 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_SINHALA  ICU 2.0
U_SINHALA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_THAI  ICU 2.0
U_THAI 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LAO  ICU 2.0
U_LAO 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_TIBETAN  ICU 2.0
U_TIBETAN 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_MYANMAR  ICU 2.0
U_MYANMAR 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GEORGIAN  ICU 2.0
U_GEORGIAN 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HANGUL_JAMO  ICU 2.0
U_HANGUL_JAMO 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ETHIOPIC  ICU 2.0
U_ETHIOPIC 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CHEROKEE  ICU 2.0
U_CHEROKEE 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS  ICU 2.0
U_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_OGHAM  ICU 2.0
U_OGHAM 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_RUNIC  ICU 2.0
U_RUNIC 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_KHMER  ICU 2.0
U_KHMER 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_MONGOLIAN  ICU 2.0
U_MONGOLIAN 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LATIN_EXTENDED_ADDITIONAL  ICU 2.0
U_LATIN_EXTENDED_ADDITIONAL 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GREEK_EXTENDED  ICU 2.0
U_GREEK_EXTENDED 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GENERAL_PUNCTUATION  ICU 2.0
U_GENERAL_PUNCTUATION 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS  ICU 2.0
U_SUPERSCRIPTS_AND_SUBSCRIPTS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CURRENCY_SYMBOLS  ICU 2.0
U_CURRENCY_SYMBOLS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_COMBINING_MARKS_FOR_SYMBOLS  ICU 2.0
U_COMBINING_MARKS_FOR_SYMBOLS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LETTERLIKE_SYMBOLS  ICU 2.0
U_LETTERLIKE_SYMBOLS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_NUMBER_FORMS  ICU 2.0
U_NUMBER_FORMS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ARROWS  ICU 2.0
U_ARROWS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_MATHEMATICAL_OPERATORS  ICU 2.0
U_MATHEMATICAL_OPERATORS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_MISCELLANEOUS_TECHNICAL  ICU 2.0
U_MISCELLANEOUS_TECHNICAL 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CONTROL_PICTURES  ICU 2.0
U_CONTROL_PICTURES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_OPTICAL_CHARACTER_RECOGNITION  ICU 2.0
U_OPTICAL_CHARACTER_RECOGNITION 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ENCLOSED_ALPHANUMERICS  ICU 2.0
U_ENCLOSED_ALPHANUMERICS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_BOX_DRAWING  ICU 2.0
U_BOX_DRAWING 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_BLOCK_ELEMENTS  ICU 2.0
U_BLOCK_ELEMENTS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_GEOMETRIC_SHAPES  ICU 2.0
U_GEOMETRIC_SHAPES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_MISCELLANEOUS_SYMBOLS  ICU 2.0
U_MISCELLANEOUS_SYMBOLS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_DINGBATS  ICU 2.0
U_DINGBATS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_BRAILLE_PATTERNS  ICU 2.0
U_BRAILLE_PATTERNS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_RADICALS_SUPPLEMENT  ICU 2.0
U_CJK_RADICALS_SUPPLEMENT 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_KANGXI_RADICALS  ICU 2.0
U_KANGXI_RADICALS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS  ICU 2.0
U_IDEOGRAPHIC_DESCRIPTION_CHARACTERS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION  ICU 2.0
U_CJK_SYMBOLS_AND_PUNCTUATION 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HIRAGANA  ICU 2.0
U_HIRAGANA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_KATAKANA  ICU 2.0
U_KATAKANA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_BOPOMOFO  ICU 2.0
U_BOPOMOFO 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HANGUL_COMPATIBILITY_JAMO  ICU 2.0
U_HANGUL_COMPATIBILITY_JAMO 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_KANBUN  ICU 2.0
U_KANBUN 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_BOPOMOFO_EXTENDED  ICU 2.0
U_BOPOMOFO_EXTENDED 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS  ICU 2.0
U_ENCLOSED_CJK_LETTERS_AND_MONTHS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_COMPATIBILITY  ICU 2.0
U_CJK_COMPATIBILITY 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A  ICU 2.0
U_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_UNIFIED_IDEOGRAPHS  ICU 2.0
U_CJK_UNIFIED_IDEOGRAPHS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_YI_SYLLABLES  ICU 2.0
U_YI_SYLLABLES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_YI_RADICALS  ICU 2.0
U_YI_RADICALS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HANGUL_SYLLABLES  ICU 2.0
U_HANGUL_SYLLABLES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HIGH_SURROGATES  ICU 2.0
U_HIGH_SURROGATES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HIGH_PRIVATE_USE_SURROGATES  ICU 2.0
U_HIGH_PRIVATE_USE_SURROGATES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_LOW_SURROGATES  ICU 2.0
U_LOW_SURROGATES 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_PRIVATE_USE  ICU 2.0
UBLOCK_PRIVATE_USE_AREA 
Deprecated:
Use UBLOCK_PRIVATE_USE. Remove after Aug, 2002
U_PRIVATE_USE_AREA 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS  ICU 2.0
U_CJK_COMPATIBILITY_IDEOGRAPHS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ALPHABETIC_PRESENTATION_FORMS  ICU 2.0
U_ALPHABETIC_PRESENTATION_FORMS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ARABIC_PRESENTATION_FORMS_A  ICU 2.0
U_ARABIC_PRESENTATION_FORMS_A 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_COMBINING_HALF_MARKS  ICU 2.0
U_COMBINING_HALF_MARKS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_CJK_COMPATIBILITY_FORMS  ICU 2.0
U_CJK_COMPATIBILITY_FORMS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_SMALL_FORM_VARIANTS  ICU 2.0
U_SMALL_FORM_VARIANTS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_ARABIC_PRESENTATION_FORMS_B  ICU 2.0
U_ARABIC_PRESENTATION_FORMS_B 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_SPECIALS  ICU 2.0
U_SPECIALS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS  ICU 2.0
U_HALFWIDTH_AND_FULLWIDTH_FORMS 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_OLD_ITALIC  ICU 2.0
UBLOCK_GOTHIC  ICU 2.0
UBLOCK_DESERET  ICU 2.0
UBLOCK_BYZANTINE_MUSICAL_SYMBOLS  ICU 2.0
UBLOCK_MUSICAL_SYMBOLS  ICU 2.0
UBLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS  ICU 2.0
UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B  ICU 2.0
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT  ICU 2.0
UBLOCK_TAGS  ICU 2.0
UBLOCK_COUNT  ICU 2.0
U_SCRIPT_COUNT 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
UBLOCK_INVALID_CODE  ICU 2.0
U_CHAR_SCRIPT_COUNT 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002
U_NO_SCRIPT 
Deprecated:
Use the enum that begins with UBLOCK. Remove after Aug, 2002

enum UCellWidth
 

Values returned by the u_charCellWidth() function.

Enumeration values:
U_ZERO_WIDTH 
U_HALF_WIDTH 
U_FULL_WIDTH 
U_NEUTRAL_WIDTH 
U_CELL_WIDTH_COUNT 

enum UCharCategory
 

Data for enumerated Unicode general category types.

See http://www.unicode.org/Public/UNIDATA/UnicodeData.html .

Enumeration values:
U_UNASSIGNED  Non-category for unassigned and non-character code points.

U_GENERAL_OTHER_TYPES  Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) ICU 2.0.
U_UPPERCASE_LETTER  Lu .
U_LOWERCASE_LETTER  Ll .
U_TITLECASE_LETTER  Lt .
U_MODIFIER_LETTER  Lm .
U_OTHER_LETTER  Lo .
U_NON_SPACING_MARK  Mn .
U_ENCLOSING_MARK  Me .
U_COMBINING_SPACING_MARK  Mc .
U_DECIMAL_DIGIT_NUMBER  Nd .
U_LETTER_NUMBER  Nl .
U_OTHER_NUMBER  No .
U_SPACE_SEPARATOR  Zs .
U_LINE_SEPARATOR  Zl .
U_PARAGRAPH_SEPARATOR  Zp .
U_CONTROL_CHAR  Cc .
U_FORMAT_CHAR  Cf .
U_PRIVATE_USE_CHAR  Co .
U_SURROGATE  Cs .
U_DASH_PUNCTUATION  Pd .
U_START_PUNCTUATION  Ps .
U_END_PUNCTUATION  Pe .
U_CONNECTOR_PUNCTUATION  Pc .
U_OTHER_PUNCTUATION  Po .
U_MATH_SYMBOL  Sm .
U_CURRENCY_SYMBOL  Sc .
U_MODIFIER_SYMBOL  Sk .
U_OTHER_SYMBOL  So .
U_INITIAL_PUNCTUATION  Pi .
U_FINAL_PUNCTUATION  Pf .
U_CHAR_CATEGORY_COUNT  One higher than the last enum UCharCategory constant.

enum UCharDirection
 

This specifies the language directional property of a character set.

Enumeration values:
U_LEFT_TO_RIGHT  L .
U_RIGHT_TO_LEFT  R .
U_EUROPEAN_NUMBER  EN .
U_EUROPEAN_NUMBER_SEPARATOR  ES .
U_EUROPEAN_NUMBER_TERMINATOR  ET .
U_ARABIC_NUMBER  AN .
U_COMMON_NUMBER_SEPARATOR  CS .
U_BLOCK_SEPARATOR  B .
U_SEGMENT_SEPARATOR  S .
U_WHITE_SPACE_NEUTRAL  WS .
U_OTHER_NEUTRAL  ON .
U_LEFT_TO_RIGHT_EMBEDDING  LRE .
U_LEFT_TO_RIGHT_OVERRIDE  LRO .
U_RIGHT_TO_LEFT_ARABIC  AL .
U_RIGHT_TO_LEFT_EMBEDDING  RLE .
U_RIGHT_TO_LEFT_OVERRIDE  RLO .
U_POP_DIRECTIONAL_FORMAT  PDF .
U_DIR_NON_SPACING_MARK  NSM .
U_BOUNDARY_NEUTRAL  BN .
U_CHAR_DIRECTION_COUNT 

enum UCharNameChoice
 

Selector constants for u_charName().

u_charName() returns the "modern" name of a Unicode character; or the name that was defined in Unicode version 1.0, before the Unicode standard merged with ISO-10646; or an "extended" name that gives each Unicode code point a unique name.

See also:
u_charName

enum UProperty
 

Selection constants for Unicode properties.

These constants are used in functions like u_hasBinaryProperty to select one of the Unicode properties.

The properties APIs are intended to reflect Unicode properties as defined in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). For details about the properties see http://www.unicode.org/ . For names of Unicode properties see the UCD file PropertyAliases.txt.

Important: If ICU is built with UCD files from Unicode versions below 3.2, then properties marked with "new in Unicode 3.2" are either unavailable or only partially available. Check u_getUnicodeVersion to be sure.

See also:
u_getUnicodeVersion ICU 2.1
Enumeration values:
UCHAR_ALPHABETIC  Binary property Alphabetic.

Same as u_isUAlphabetic, different from u_isalpha. Lu+Ll+Lt+Lm+Lo+Other_Alphabetic ICU 2.1

UCHAR_BINARY_START  First constant for binary Unicode properties.

ICU 2.1

UCHAR_ASCII_HEX_DIGIT  Binary property ASCII_Hex_Digit.

0-9 A-F a-f ICU 2.1

UCHAR_BIDI_CONTROL  Binary property Bidi_Control.

Format controls which have specific functions in the Bidi Algorithm. ICU 2.1

UCHAR_BIDI_MIRRORED  Binary property Bidi_Mirrored.

Characters that may change display in RTL text. Same as u_isMirrored. See Bidi Algorithm, UTR 9. ICU 2.1

UCHAR_DASH  Binary property Dash.

Variations of dashes. ICU 2.1

UCHAR_DEFAULT_IGNORABLE_CODE_POINT  Binary property Default_Ignorable_Code_Point (new in Unicode 3.2).

Ignorable in most processing. Cf+Cc+Cs+Other_Default_Ignorable_Code_Point-White_Space ICU 2.1

UCHAR_DEPRECATED  Binary property Deprecated (new in Unicode 3.2).

The usage of deprecated characters is strongly discouraged. ICU 2.1

UCHAR_DIACRITIC  Binary property Diacritic.

Characters that linguistically modify the meaning of another character to which they apply. ICU 2.1

UCHAR_EXTENDER  Binary property Extender.

Extend the value or shape of a preceding alphabetic character, e.g., length and iteration marks. ICU 2.1

UCHAR_FULL_COMPOSITION_EXCLUSION  Binary property Full_Composition_Exclusion.

CompositionExclusions.txt+Singleton Decompositions+ Non-Starter Decompositions. ICU 2.1

UCHAR_GRAPHEME_BASE  Binary property Grapheme_Base (new in Unicode 3.2).

For programmatic determination of grapheme cluster boundaries. [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend ICU 2.1

UCHAR_GRAPHEME_EXTEND  Binary property Grapheme_Extend (new in Unicode 3.2).

For programmatic determination of grapheme cluster boundaries. Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link ICU 2.1

UCHAR_GRAPHEME_LINK  Binary property Grapheme_Link (new in Unicode 3.2).

For programmatic determination of grapheme cluster boundaries. ICU 2.1

UCHAR_HEX_DIGIT  Binary property Hex_Digit.

Characters commonly used for hexadecimal numbers. ICU 2.1

UCHAR_HYPHEN  Binary property Hyphen.

Dashes used to mark connections between pieces of words, plus the Katakana middle dot. ICU 2.1

UCHAR_ID_CONTINUE  Binary property ID_Continue.

Characters that can continue an identifier. ID_Start+Mn+Mc+Nd+Pc ICU 2.1

UCHAR_ID_START  Binary property ID_Start.

Characters that can start an identifier. Lu+Ll+Lt+Lm+Lo+Nl ICU 2.1

UCHAR_IDEOGRAPHIC  Binary property Ideographic.

CJKV ideographs. ICU 2.1

UCHAR_IDS_BINARY_OPERATOR  Binary property IDS_Binary_Operator (new in Unicode 3.2).

For programmatic determination of Ideographic Description Sequences. ICU 2.1

UCHAR_IDS_TRINARY_OPERATOR  Binary property IDS_Trinary_Operator (new in Unicode 3.2).

For programmatic determination of Ideographic Description Sequences. ICU 2.1

UCHAR_JOIN_CONTROL  Binary property Join_Control.

Format controls for cursive joining and ligation. ICU 2.1

UCHAR_LOGICAL_ORDER_EXCEPTION  Binary property Logical_Order_Exception (new in Unicode 3.2).

Characters that do not use logical order and require special handling in most processing. ICU 2.1

UCHAR_LOWERCASE  Binary property Lowercase.

Same as u_isULowercase, different from u_islower. Ll+Other_Lowercase ICU 2.1

UCHAR_MATH  Binary property Math.

Sm+Other_Math ICU 2.1

UCHAR_NONCHARACTER_CODE_POINT  Binary property Noncharacter_Code_Point.

Code points that are explicitly defined as illegal for the encoding of characters. ICU 2.1

UCHAR_QUOTATION_MARK  Binary property Quotation_Mark.

ICU 2.1

UCHAR_RADICAL  Binary property Radical (new in Unicode 3.2).

For programmatic determination of Ideographic Description Sequences. ICU 2.1

UCHAR_SOFT_DOTTED  Binary property Soft_Dotted (new in Unicode 3.2).

Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear. ICU 2.1

UCHAR_TERMINAL_PUNCTUATION  Binary property Terminal_Punctuation.

Punctuation characters that generally mark the end of textual units. ICU 2.1

UCHAR_UNIFIED_IDEOGRAPH  Binary property Unified_Ideograph (new in Unicode 3.2).

For programmatic determination of Ideographic Description Sequences. ICU 2.1

UCHAR_UPPERCASE  Binary property Uppercase.

Same as u_isUUppercase, different from u_isupper. Lu+Other_Uppercase ICU 2.1

UCHAR_WHITE_SPACE  Binary property White_Space.

Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. Space characters+TAB+CR+LF-ZWSP-ZWNBSP ICU 2.1

UCHAR_XID_CONTINUE  Binary property XID_Continue.

ID_Continue modified to allow closure under normalization forms NFKC and NFKD. ICU 2.1

UCHAR_XID_START  Binary property XID_Start.

ID_Start modified to allow closure under normalization forms NFKC and NFKD. ICU 2.1

UCHAR_BINARY_LIMIT  One more than the last constant for binary Unicode properties.

ICU 2.1


Function Documentation

U_CAPI void U_EXPORT2 u_charAge UChar32  c,
UVersionInfo  versionArray
 

Get the "age" of the code point.

The "age" is the Unicode version when the code point was first designated (as a non-character or for Private Use) or assigned a character. This can be useful to avoid emitting code points to receiving processes that do not accept newer characters. The data is from the UCD file DerivedAge.txt.

Parameters:
c The code point.
versionArray The Unicode version number array, to be filled in.
ICU 2.1

U_CAPI uint16_t U_EXPORT2 u_charCellWidth UChar32  c  ) 
 

Returns a value indicating the display-cell width of the character when used in Asian text, according to the Unicode standard (see p. 6-130 of The Unicode Standard, Version 2.0).

The results for various characters are as follows:

U_ZERO_WIDTH: Characters which are considered to take up no display-cell space: control characters format characters line and paragraph separators non-spacing marks combining Hangul jungseong combining Hangul jongseong unassigned Unicode values

U_HALF_WIDTH: Characters which take up half a cell in standard Asian text: all characters in the General Scripts Area except combining Hangul choseong and the characters called out specifically above as ZERO_WIDTH alphabetic and Arabic presentation forms halfwidth CJK punctuation halfwidth Katakana halfwidth Hangul Jamo halfwidth forms, arrows, and shapes

U_FULL_WIDTH: Characters which take up a full cell in standard Asian text: combining Hangul choseong all characters in the CJK Phonetics and Symbols Area all characters in the CJK Ideographs Area all characters in the Hangul Syllables Area CJK compatibility ideographs CJK compatibility forms small form variants fullwidth ASCII fullwidth punctuation and currency signs

U_NEUTRAL_WIDTH: Characters whose cell width is context-dependent: all characters in the Symbols Area, except those specifically called out above all characters in the Surrogates Area all characters in the Private Use Area

For Korean text, this algorithm should work properly with properly normalized Korean text. Precomposed Hangul syllables and non-combining jamo are all considered full-width characters. For combining jamo, we treat choseong (initial consonants) as double-width characters and jungseong (vowels) and jongseong (final consonants) as non-spacing marks. This will work right in text that uses the precomposed choseong characters instead of two choseong characters in a row, and which uses the choseong filler character at the beginning of syllables that don't have an initial consonant. The results may be slightly off with Korean text following different conventions.

U_CAPI int32_t U_EXPORT2 u_charDigitValue UChar32  c  ) 
 

Retrieves the decimal numeric value of a digit character.

Parameters:
c the digit character for which to get the numeric value
Returns:
the numeric value of c in decimal radix. This method returns -1 if c is not a valid digit character.

U_CAPI UCharDirection U_EXPORT2 u_charDirection UChar32  c  ) 
 

Returns the linguistic direction property of a character.

Returns the linguistic direction property of a character. For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional property.

See also:
UCharDirection

U_CAPI UChar32 U_EXPORT2 u_charFromName UCharNameChoice  nameChoice,
const char *  name,
UErrorCode *  pErrorCode
 

Find a Unicode character by its name and return its code point value.

The name is matched exactly and completely. If the name does not correspond to a code point, pErrorCode is set to U_INVALID_CHAR_FOUND. A Unicode 1.0 name is matched only if it differs from the modern name. Unicode names are all uppercase. Extended names are lowercase followed by an uppercase hexadecimal number, and within angle brackets.

Parameters:
nameChoice Selector for which name to match.
name The name to match.
pErrorCode Pointer to a UErrorCode variable
Returns:
The Unicode value of the code point with the given name, or an undefined value if there is no such code point.
See also:
UCharNameChoice

u_charName

u_enumCharNames

U_CAPI UChar32 U_EXPORT2 u_charMirror UChar32  c  ) 
 

Maps the specified character to a "mirror-image" character.

For characters with the "mirrored" property, implementations sometimes need a "poor man's" mapping to another Unicode character (code point) such that the default glyph may serve as the mirror-image of the default glyph of the specified character. This is useful for text conversion to and from codepages with visual order, and for displays without glyph selection capabilities.

Parameters:
c the character (code point, Unicode scalar value) to be mapped
Returns:
another Unicode code point that may serve as a mirror-image substitute, or c itself if there is no such mapping or c does not have the "mirrored" property

U_CAPI int32_t U_EXPORT2 u_charName UChar32  code,
UCharNameChoice  nameChoice,
char *  buffer,
int32_t  bufferLength,
UErrorCode *  pErrorCode
 

Retrieve the name of a Unicode character.

Depending on nameChoice, the character name written into the buffer is the "modern" name or the name that was defined in Unicode version 1.0. The name contains only "invariant" characters like A-Z, 0-9, space, and '-'. Unicode 1.0 names are only retrieved if they are different from the modern names and if the data file contains the data for them. gennames may or may not be called with a command line option to include 1.0 names in unames.dat.

Parameters:
code The character (code point) for which to get the name. It must be 0<=code<=0x10ffff.
nameChoice Selector for which name to get.
buffer Destination address for copying the name. The name will always be zero-terminated. If there is no name, then the buffer will be set to the empty string.
bufferLength ==sizeof(buffer)
pErrorCode Pointer to a UErrorCode variable; check for U_SUCCESS() after u_charName() returns.
Returns:
The length of the name, or 0 if there is no name for this character. If the bufferLength is less than or equal to the length, then the buffer contains the truncated name and the returned length indicates the full length of the name. The length does not include the zero-termination.
See also:
UCharNameChoice

u_charFromName

u_enumCharNames

U_CAPI int8_t U_EXPORT2 u_charType UChar32  c  ) 
 

Returns a value indicating a character category.

The categories are taken from the Unicode Character Database (UCD) in UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
a value of type int, the character category.
See also:
UCharCategory

U_CAPI int32_t U_EXPORT2 u_digit UChar32  ch,
int8_t  radix
 

Returns the numeric value of the character ch in the specified radix.

If the radix is not in the range 2 <= radix <= 36 or if the value of ch is not a valid digit in the specified radix, -1 is returned. A character is a valid digit if at least one of the following is true:

  • The method u_isdigit is true of the character and the Unicode decimal digit value of the character (or its single-character decomposition) is less than the specified radix. In this case the decimal digit value is returned.
  • The character is one of the uppercase Latin letters 'A' through 'Z' and its code is less than radix + 'A' - 10. In this case, ch - 'A' + 10 is returned.
  • The character is one of the lowercase Latin letters 'a' through 'z' and its code is less than radix + 'a' - 10. In this case, ch - 'a' + 10 is returned.

Parameters:
ch the character to be converted.
radix the radix.
Returns:
the numeric value represented by the character in the specified radix.
See also:
u_forDigit

u_charDigitValue

u_isdigit ICU 2.0

U_CAPI void U_EXPORT2 u_enumCharNames UChar32  start,
UChar32  limit,
UEnumCharNamesFn fn,
void *  context,
UCharNameChoice  nameChoice,
UErrorCode *  pErrorCode
 

Enumerate all assigned Unicode characters between the start and limit code points (start inclusive, limit exclusive) and call a function for each, passing the code point value and the character name.

For Unicode 1.0 names, only those are enumerated that differ from the modern names.

Parameters:
start The first code point in the enumeration range.
limit One more than the last code point in the enumeration range (the first one after the range).
fn The function that is to be called for each character name.
context An arbitrary pointer that is passed to the function.
nameChoice Selector for which kind of names to enumerate.
pErrorCode Pointer to a UErrorCode variable
See also:
UCharNameChoice

UEnumCharNamesFn

u_charName

u_charFromName

U_CAPI void U_EXPORT2 u_enumCharTypes UCharEnumTypeRange *  enumRange,
const void *  context
 

Enumerate efficiently all code points with their Unicode general categories.

This is useful for building data structures (e.g., UnicodeSet's), for enumerating all assigned code points (type!=U_UNASSIGNED), etc.

For each contiguous range of code points with a given general category ("character type"), the UCharEnumTypeRange function is called. Adjacent ranges have different types. The Unicode Standard guarantees that the numeric value of the type is 0..31.

Parameters:
enumRange a pointer to a function that is called for each contiguous range of code points with the same general category
context an opaque pointer that is passed on to the callback function
ICU 2.1
See also:
UCharCategory

UCharEnumTypeRange

U_CAPI UChar32 U_EXPORT2 u_foldCase UChar32  c,
uint32_t  options
 

The given character is mapped to its case folding equivalent according to UnicodeData.txt and CaseFolding.txt; if the character has no case folding equivalent, the character itself is returned.

Only "simple", single-code point case folding mappings are used. "Full" mappings are used by u_strFoldCase().

Parameters:
c the character to be converted
options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
Returns:
the case folding equivalent of the character, if any; otherwise the character itself. ICU 1.8

U_CAPI UChar32 U_EXPORT2 u_forDigit int32_t  digit,
int8_t  radix
 

Determines the character representation for a specific digit in the specified radix.

If the value of radix is not a valid radix, or the value of digit is not a valid digit in the specified radix, the null character (U+0000) is returned.

The radix argument is valid if it is greater than or equal to 2 and less than or equal to 36. The digit argument is valid if 0 <= digit < radix.

If the digit is less than 10, then '0' + digit is returned. Otherwise, the value 'a' + digit - 10 is returned.

Parameters:
digit the number to convert to a character.
radix the radix.
Returns:
the char representation of the specified digit in the specified radix.
See also:
u_digit

u_charDigitValue

u_isdigit ICU 2.0

U_CAPI uint8_t U_EXPORT2 u_getCombiningClass UChar32  c  ) 
 

Returns the combining class of the code point as specified in UnicodeData.txt.

Parameters:
c the code point of the character
Returns:
the combining class of the character

U_CAPI void U_EXPORT2 u_getUnicodeVersion UVersionInfo  info  ) 
 

Gets the Unicode version information.

The version array stores the version information for the Unicode standard that is currently used by ICU. For example, release "1.3.31.2" is then represented as 0x01031F02.

Parameters:
info the version # information, the result will be filled in

U_CAPI UBool U_EXPORT2 u_hasBinaryProperty UChar32  c,
UProperty  which
 

Check a binary Unicode property for a code point.

Unicode, especially in version 3.2, defines many more properties than the original set in UnicodeData.txt. This API is intended to reflect Unicode properties as defined in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). For details about the properties see http://www.unicode.org/ . For names of Unicode properties see the UCD file PropertyAliases.txt.

The properties APIs are intended to reflect Unicode properties as defined in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). For details about the properties see http://www.unicode.org/ . For names of Unicode properties see the UCD file PropertyAliases.txt.

Important: If ICU is built with UCD files from Unicode versions below 3.2, then properties marked with "new in Unicode 3.2" are either unavailable or only partially available.

Parameters:
c Code point to test.
which UProperty selector constant, identifies which binary property to check. Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT.
Returns:
TRUE or FALSE according to the binary Unicode property value for c. Also FALSE if which is out of bounds or if the Unicode version does not have data for the property at all, or not for this code point.
See also:
UProperty

u_getUnicodeVersion ICU 2.1

U_CAPI UBool U_EXPORT2 u_isalnum UChar32  c  ) 
 

Determines whether the specified character is an alphanumeric character (letter or digit) according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character is a letter or a digit; false otherwise.

U_CAPI UBool U_EXPORT2 u_isalpha UChar32  c  ) 
 

Determines whether the specified character is a letter according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character is a letter; false otherwise.
See also:
u_isdigit

u_isalnum

U_CAPI UBool U_EXPORT2 u_isbase UChar32  c  ) 
 

Determines whether the specified character is of the base form according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the Unicode character is of the base form; false otherwise.
See also:
u_isalpha

u_isdigit

U_CAPI UBool U_EXPORT2 u_iscntrl UChar32  c  ) 
 

Determines whether the specified character is a control character or not.

A control character is one of the following:

  • ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
  • U_CONTROL_CHAR (Cc)
  • U_FORMAT_CHAR (Cf)
  • U_LINE_SEPARATOR (Zl)
  • U_PARAGRAPH_SEPARATOR (Zp)

Parameters:
c the character to be tested
Returns:
true if the Unicode character is a control character; false otherwise.
See also:
u_isprint

U_CAPI UBool U_EXPORT2 u_isdefined UChar32  c  ) 
 

Determines whether the specified numeric value is actually a defined character according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character has a defined Unicode meaning; false otherwise.
See also:
u_isdigit

u_isalpha

u_isalnum

u_isupper

u_islower

u_istitle

U_CAPI UBool U_EXPORT2 u_isdigit UChar32  c  ) 
 

Determines whether the specified character is a digit according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character is a digit; false otherwise.

U_CAPI UBool U_EXPORT2 u_isIDIgnorable UChar32  c  ) 
 

A convenience method for determining if a Unicode character should be regarded as an ignorable character in a Unicode identifier.

The following Unicode characters are ignorable in a Unicode identifier:
0x0000 through 0x0008, 0x000E through 0x001B, and 0x007F through 0x009F: ISO control characters that are not whitespace
0x200C through 0x200F: join controls
0x202A through 0x202E: bidirectional controls
0x206A through 0x206F: format controls
0xFEFF: zero-width no-break space

Parameters:
c the Unicode character.
Returns:
TRUE if the character is ignorable in a Unicode identifier; FALSE otherwise.
See also:
u_isIDPart

U_CAPI UBool U_EXPORT2 u_isIDPart UChar32  c  ) 
 

A convenience method for determining if a Unicode character may be part of a Unicode identifier other than the starting character.

A character may be part of a Unicode identifier if and only if it is one of the following:

  • a letter
  • a connecting punctuation character (such as "_").
  • a digit
  • a numeric letter (such as a Roman numeral character)
  • a combining mark
  • a non-spacing mark
  • an ignorable control character

Parameters:
c the Unicode character.
Returns:
TRUE if the character may be part of a Unicode identifier; FALSE otherwise.
See also:
u_isIDIgnorable

u_isIDStart

U_CAPI UBool U_EXPORT2 u_isIDStart UChar32  c  ) 
 

A convenience method for determining if a Unicode character is allowed to start a Unicode identifier.

A character may start a Unicode identifier if and only if it is a letter.

Parameters:
c the Unicode character.
Returns:
TRUE if the character may start a Unicode identifier; FALSE otherwise.
See also:
u_isalpha

u_isIDPart

U_CAPI UBool U_EXPORT2 u_isJavaIDPart UChar32  c  ) 
 

A convenience method for determining if a Unicode character may be part of a Java identifier other than the starting character.

A character may be part of a Java identifier if and only if it is one of the following:

  • a letter
  • a currency symbol (such as "$")
  • a connecting punctuation character (such as "_").
  • a digit
  • a numeric letter (such as a Roman numeral character)
  • a combining mark
  • a non-spacing mark
  • an ignorable control character

Parameters:
c the Unicode character.
Returns:
TRUE if the character may be part of a Java identifier; FALSE otherwise.
See also:
u_isIDIgnorable

u_isJavaIDStart

u_isalpha

u_isdigit

u_isIDPart

U_CAPI UBool U_EXPORT2 u_isJavaIDStart UChar32  c  ) 
 

A convenience method for determining if a Unicode character is allowed as the first character in a Java identifier.

A character may start a Java identifier if and only if it is one of the following:

  • a letter
  • a currency symbol (such as "$")
  • a connecting punctuation symbol (such as "_").

Parameters:
c the Unicode character.
Returns:
TRUE if the character may start a Java identifier; FALSE otherwise.
See also:
u_isJavaIDPart

u_isalpha

u_isIDStart

U_CAPI UBool U_EXPORT2 u_islower UChar32  c  ) 
 

Determines whether the specified UChar is a lowercase character according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character is lowercase; false otherwise.
See also:
UNICODE_VERSION

u_isupper

u_istitle

u_islower

U_CAPI UBool U_EXPORT2 u_isMirrored UChar32  c  ) 
 

Determines whether the character has the "mirrored" property.

This property is set for characters that are commonly used in Right-To-Left contexts and need to be displayed with a "mirrored" glyph.

Parameters:
c the character (code point, Unicode scalar value) to be tested
Returns:
TRUE if the character has the "mirrored" property

U_CAPI UBool U_EXPORT2 u_isprint UChar32  c  ) 
 

Determines whether the specified character is a printable character according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the Unicode character is a printable character; false otherwise.
See also:
u_iscntrl

U_CAPI UBool U_EXPORT2 u_isspace UChar32  c  ) 
 

Determines if the specified character is a space character or not.

Parameters:
c the character to be tested
Returns:
true if the character is a space character; false otherwise.

U_CAPI UBool U_EXPORT2 u_istitle UChar32  c  ) 
 

Determines whether the specified character is a titlecase character according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character is titlecase; false otherwise.
See also:
u_isupper

u_islower

u_totitle

U_CAPI UBool U_EXPORT2 u_isUAlphabetic UChar32  c  ) 
 

Check if a code point has the Alphabetic Unicode property.

Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC). This is different from u_isalpha!

See also:
UCHAR_ALPHABETIC

u_isalpha

u_hasBinaryProperty ICU 2.1

U_CAPI UBool U_EXPORT2 u_isULowercase UChar32  c  ) 
 

Check if a code point has the Lowercase Unicode property.

Same as u_hasBinaryProperty(c, UCHAR_LOWERCASE). This is different from u_islower!

See also:
UCHAR_LOWERCASE

u_islower

u_hasBinaryProperty ICU 2.1

U_CAPI UBool U_EXPORT2 u_isupper UChar32  c  ) 
 

Determines whether the specified character is an uppercase character according to UnicodeData.txt.

Parameters:
c the character to be tested
Returns:
true if the character is uppercase; false otherwise.
See also:
u_islower

u_istitle

u_tolower

U_CAPI UBool U_EXPORT2 u_isUUppercase UChar32  c  ) 
 

Check if a code point has the Uppercase Unicode property.

Same as u_hasBinaryProperty(c, UCHAR_UPPERCASE). This is different from u_isupper!

See also:
UCHAR_UPPERCASE

u_isupper

u_hasBinaryProperty ICU 2.1

U_CAPI UBool U_EXPORT2 u_isUWhiteSpace UChar32  c  ) 
 

Check if a code point has the White_Space Unicode property.

Same as u_hasBinaryProperty(c, UCHAR_WHITE_SPACE). This is different from both u_isspace and u_isWhitespace!

See also:
UCHAR_WHITE_SPACE

u_isWhitespace

u_isspace

u_hasBinaryProperty ICU 2.1

U_CAPI UBool U_EXPORT2 u_isWhitespace UChar32  c  ) 
 

Determines if the specified character is white space according to ICU.

A character is considered to be an ICU whitespace character if and only if it satisfies one of the following criteria:

  • It is a Unicode space separator (category "Zs"), but is not a no-break space (\u00A0 or \uFEFF).
  • It is a Unicode line separator (category "Zl").
  • It is a Unicode paragraph separator (category "Zp").
  • It is \u0009, HORIZONTAL TABULATION.
  • It is \u000A, LINE FEED.
  • It is \u000B, VERTICAL TABULATION.
  • It is \u000C, FORM FEED.
  • It is \u000D, CARRIAGE RETURN.
  • It is \u001C, FILE SEPARATOR.
  • It is \u001D, GROUP SEPARATOR.
  • It is \u001E, RECORD SEPARATOR.
  • It is \u001F, UNIT SEPARATOR.
Note: This method corresponds to the Java method java.lang.Character.isWhitespace().

Parameters:
c the character to be tested.
Returns:
true if the character is an ICU whitespace character; false otherwise.
See also:
u_isspace

U_CAPI UChar32 U_EXPORT2 u_tolower UChar32  c  ) 
 

The given character is mapped to its lowercase equivalent according to UnicodeData.txt; if the character has no lowercase equivalent, the character itself is returned.

A character has a lowercase equivalent if and only if a lowercase mapping is specified for the character in the UnicodeData.txt attribute table.

u_tolower() only deals with the general letter case conversion. For language specific case conversion behavior, use u_strToLower(). For example, the case conversion for dot-less i and dotted I in Turkish, or for final sigma in Greek.

Parameters:
c the character to be converted
Returns:
the lowercase equivalent of the character, if any; otherwise the character itself.

U_CAPI UChar32 U_EXPORT2 u_totitle UChar32  c  ) 
 

The given character is mapped to its titlecase equivalent according to UnicodeData.txt.

There are only four Unicode characters that are truly titlecase forms that are distinct from uppercase forms. As a rule, if a character has no true titlecase equivalent, its uppercase equivalent is returned.

A character has a titlecase equivalent if and only if a titlecase mapping is specified for the character in the UnicodeData.txt data.

Parameters:
c the character to be converted
Returns:
the titlecase equivalent of the character, if any; otherwise the character itself.

U_CAPI UChar32 U_EXPORT2 u_toupper UChar32  c  ) 
 

The given character is mapped to its uppercase equivalent according to UnicodeData.txt; if the character has no uppercase equivalent, the character itself is returned.

u_toupper() only deals with the general letter case conversion. For language specific case conversion behavior, use u_strToUpper(). For example, the case conversion for dot-less i and dotted I in Turkish, or ess-zed (i.e., "sharp S") in German.

Parameters:
c the character to be converted
Returns:
the uppercase equivalent of the character, if any; otherwise the character itself.

U_CAPI UBlockCode U_EXPORT2 ublock_getCode UChar32  ch  ) 
 

Returns the Unicode allocation block that contains the character.

See also:
UBlockCode ICU 2.0


Generated on Fri Aug 13 09:53:50 2004 for ICU 2.1 by doxygen 1.3.7