#include "unicode/ucnv_err.h"
#include "unicode/uenum.h"
Go to the source code of this file.
Defines | |
#define | UCNV_MAX_CONVERTER_NAME_LENGTH 60 |
Maximum length of a converter name including the terminating NULL. | |
#define | UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) |
Maximum length of a converter name including path and terminating NULL. | |
#define | UCNV_SI 0x0F |
Shift in for EBCDIC_STATEFUL and iso2022 states. | |
#define | UCNV_SO 0x0E |
Shift out for EBCDIC_STATEFUL and iso2022 states. | |
#define | UCNV_OPTION_SEP_CHAR ',' |
Character that separates converter names from options and options from each other. | |
#define | UCNV_OPTION_SEP_STRING "," |
String version of UCNV_OPTION_SEP_CHAR. | |
#define | UCNV_VALUE_SEP_CHAR '=' |
Character that separates a converter option from its value. | |
#define | UCNV_VALUE_SEP_STRING "=" |
String version of UCNV_VALUE_SEP_CHAR. | |
#define | UCNV_LOCALE_OPTION_STRING ",locale=" |
Converter option for specifying a locale. | |
#define | UCNV_VERSION_OPTION_STRING ",version=" |
Converter option for specifying a version selector (0..9) for some converters. | |
#define | UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" |
Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages. | |
#define | U_CNV_SAFECLONE_BUFFERSIZE 1024 |
Definition of a buffer size that is designed to be large enough for converters to be cloned with ucnv_safeClone(). | |
#define | UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) (((int32_t)(length)+10)*(int32_t)(maxCharSize)) |
Calculates the size of a buffer for conversion from Unicode to a charset. | |
Typedefs | |
typedef USet | USet |
typedef UConverterToUnicodeArgs * | args |
typedef UConverterToUnicodeArgs const char * | codeUnits |
typedef UConverterToUnicodeArgs const char int32_t | length |
typedef UConverterToUnicodeArgs const char int32_t UConverterCallbackReason | reason |
typedef UConverterToUnicodeArgs const char int32_t UConverterCallbackReason UErrorCode * | pErrorCode |
typedef UConverterFromUnicodeArgs * | args |
typedef UConverterFromUnicodeArgs const UChar * | codeUnits |
typedef UConverterFromUnicodeArgs const UChar int32_t | length |
typedef UConverterFromUnicodeArgs const UChar int32_t UChar32 | codePoint |
typedef UConverterFromUnicodeArgs const UChar int32_t UChar32 UConverterCallbackReason | reason |
typedef UConverterFromUnicodeArgs const UChar int32_t UChar32 UConverterCallbackReason UErrorCode * | pErrorCode |
Enumerations | |
enum | UConverterType { UCNV_UNSUPPORTED_CONVERTER = -1, UCNV_SBCS = 0, UCNV_DBCS = 1, UCNV_MBCS = 2, UCNV_LATIN_1 = 3, UCNV_UTF8 = 4, UCNV_UTF16_BigEndian = 5, UCNV_UTF16_LittleEndian = 6, UCNV_UTF32_BigEndian = 7, UCNV_UTF32_LittleEndian = 8, UCNV_EBCDIC_STATEFUL = 9, UCNV_ISO_2022 = 10, UCNV_LMBCS_1 = 11, UCNV_LMBCS_2, UCNV_LMBCS_3, UCNV_LMBCS_4, UCNV_LMBCS_5, UCNV_LMBCS_6, UCNV_LMBCS_8, UCNV_LMBCS_11, UCNV_LMBCS_16, UCNV_LMBCS_17, UCNV_LMBCS_18, UCNV_LMBCS_19, UCNV_LMBCS_LAST = UCNV_LMBCS_19, UCNV_HZ, UCNV_SCSU, UCNV_ISCII, UCNV_US_ASCII, UCNV_UTF7, UCNV_BOCU1, UCNV_UTF16, UCNV_UTF32, UCNV_CESU8, UCNV_IMAP_MAILBOX, UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES } |
Enum for specifying basic types of converters. More... | |
enum | UConverterPlatform { UCNV_UNKNOWN = -1, UCNV_IBM = 0 } |
Enum for specifying which platform a converter ID refers to. More... | |
enum | UConverterUnicodeSet { UCNV_ROUNDTRIP_SET, UCNV_SET_COUNT } |
Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet(). More... | |
Functions | |
typedef | void (U_EXPORT2 *UConverterToUCallback)(const void *context |
Function pointer for error callback in the codepage to unicode direction. | |
U_STABLE int U_EXPORT2 | ucnv_compareNames (const char *name1, const char *name2) |
Do a fuzzy compare of two converter/alias names. | |
U_STABLE UConverter *U_EXPORT2 | ucnv_open (const char *converterName, UErrorCode *err) |
Creates a UConverter object with the names specified as a C string. | |
U_STABLE UConverter *U_EXPORT2 | ucnv_openU (const UChar *name, UErrorCode *err) |
Creates a Unicode converter with the names specified as unicode string. | |
U_STABLE UConverter *U_EXPORT2 | ucnv_openCCSID (int32_t codepage, UConverterPlatform platform, UErrorCode *err) |
Creates a UConverter object from a CCSID number and platform pair. | |
U_STABLE UConverter *U_EXPORT2 | ucnv_openPackage (const char *packageName, const char *converterName, UErrorCode *err) |
U_STABLE UConverter *U_EXPORT2 | ucnv_safeClone (const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) |
Thread safe converter cloning operation. | |
U_STABLE void U_EXPORT2 | ucnv_close (UConverter *converter) |
Deletes the unicode converter and releases resources associated with just this instance. | |
U_STABLE void U_EXPORT2 | ucnv_getSubstChars (const UConverter *converter, char *subChars, int8_t *len, UErrorCode *err) |
Fills in the output parameter, subChars, with the substitution characters as multiple bytes. | |
U_STABLE void U_EXPORT2 | ucnv_setSubstChars (UConverter *converter, const char *subChars, int8_t len, UErrorCode *err) |
Sets the substitution chars when converting from unicode to a codepage. | |
U_STABLE void U_EXPORT2 | ucnv_getInvalidChars (const UConverter *converter, char *errBytes, int8_t *len, UErrorCode *err) |
Fills in the output parameter, errBytes, with the error characters from the last failing conversion. | |
U_STABLE void U_EXPORT2 | ucnv_getInvalidUChars (const UConverter *converter, UChar *errUChars, int8_t *len, UErrorCode *err) |
Fills in the output parameter, errUChars, with the error characters from the last failing conversion. | |
U_STABLE void U_EXPORT2 | ucnv_reset (UConverter *converter) |
Resets the state of a converter to the default state. | |
U_STABLE void U_EXPORT2 | ucnv_resetToUnicode (UConverter *converter) |
Resets the to-Unicode part of a converter state to the default state. | |
U_STABLE void U_EXPORT2 | ucnv_resetFromUnicode (UConverter *converter) |
Resets the from-Unicode part of a converter state to the default state. | |
U_STABLE int8_t U_EXPORT2 | ucnv_getMaxCharSize (const UConverter *converter) |
Returns the maximum number of bytes that are output per UChar in conversion from Unicode using this converter. | |
U_STABLE int8_t U_EXPORT2 | ucnv_getMinCharSize (const UConverter *converter) |
Returns the minimum byte length for characters in this codepage. | |
U_STABLE int32_t U_EXPORT2 | ucnv_getDisplayName (const UConverter *converter, const char *displayLocale, UChar *displayName, int32_t displayNameCapacity, UErrorCode *err) |
Returns the display name of the converter passed in based on the Locale passed in. | |
U_STABLE const char *U_EXPORT2 | ucnv_getName (const UConverter *converter, UErrorCode *err) |
Gets the internal, canonical name of the converter (zero-terminated). | |
U_STABLE int32_t U_EXPORT2 | ucnv_getCCSID (const UConverter *converter, UErrorCode *err) |
Gets a codepage number associated with the converter. | |
U_STABLE UConverterPlatform U_EXPORT2 | ucnv_getPlatform (const UConverter *converter, UErrorCode *err) |
Gets a codepage platform associated with the converter. | |
U_STABLE UConverterType U_EXPORT2 | ucnv_getType (const UConverter *converter) |
Gets the type of the converter, e.g. UCNV_SBCS, UCNV_MBCS, UCNV_UTF8 or UCNV_ISO_2022 (see UConverterType). | |
U_STABLE void U_EXPORT2 | ucnv_getStarters (const UConverter *converter, UBool starters[256], UErrorCode *err) |
Gets the "starter" (lead) bytes for converters of type MBCS. | |
U_STABLE void U_EXPORT2 | ucnv_getUnicodeSet (const UConverter *cnv, USet *setFillIn, UConverterUnicodeSet whichSet, UErrorCode *pErrorCode) |
Returns the set of Unicode code points that can be converted by an ICU converter. | |
U_STABLE void U_EXPORT2 | ucnv_getToUCallBack (const UConverter *converter, UConverterToUCallback *action, const void **context) |
Gets the current callback function used by the converter when an illegal or invalid codepage sequence is found. | |
U_STABLE void U_EXPORT2 | ucnv_getFromUCallBack (const UConverter *converter, UConverterFromUCallback *action, const void **context) |
Gets the current callback function used by the converter when illegal or invalid Unicode sequence is found. | |
U_STABLE void U_EXPORT2 | ucnv_setToUCallBack (UConverter *converter, UConverterToUCallback newAction, const void *newContext, UConverterToUCallback *oldAction, const void **oldContext, UErrorCode *err) |
Changes the callback function used by the converter when an illegal or invalid sequence is found. | |
U_STABLE void U_EXPORT2 | ucnv_setFromUCallBack (UConverter *converter, UConverterFromUCallback newAction, const void *newContext, UConverterFromUCallback *oldAction, const void **oldContext, UErrorCode *err) |
Changes the current callback function used by the converter when an illegal or invalid sequence is found. | |
U_STABLE void U_EXPORT2 | ucnv_fromUnicode (UConverter *converter, char **target, const char *targetLimit, const UChar **source, const UChar *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err) |
Converts an array of unicode characters to an array of codepage characters. | |
U_STABLE void U_EXPORT2 | ucnv_toUnicode (UConverter *converter, UChar **target, const UChar *targetLimit, const char **source, const char *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err) |
Converts a buffer of codepage bytes into an array of unicode UChars characters. | |
U_STABLE int32_t U_EXPORT2 | ucnv_fromUChars (UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) |
Convert the Unicode string into a codepage string using an existing UConverter. | |
U_STABLE int32_t U_EXPORT2 | ucnv_toUChars (UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) |
Convert the codepage string into a Unicode string using an existing UConverter. | |
U_STABLE UChar32 U_EXPORT2 | ucnv_getNextUChar (UConverter *converter, const char **source, const char *sourceLimit, UErrorCode *err) |
Convert a codepage buffer into Unicode one character at a time. | |
U_STABLE void U_EXPORT2 | ucnv_convertEx (UConverter *targetCnv, UConverter *sourceCnv, char **target, const char *targetLimit, const char **source, const char *sourceLimit, UChar *pivotStart, UChar **pivotSource, UChar **pivotTarget, const UChar *pivotLimit, UBool reset, UBool flush, UErrorCode *pErrorCode) |
Convert from one external charset to another using two existing UConverters. | |
U_STABLE int32_t U_EXPORT2 | ucnv_convert (const char *toConverterName, const char *fromConverterName, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *pErrorCode) |
Convert from one external charset to another. | |
U_STABLE int32_t U_EXPORT2 | ucnv_toAlgorithmic (UConverterType algorithmicType, UConverter *cnv, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *pErrorCode) |
Convert from one external charset to another. | |
U_STABLE int32_t U_EXPORT2 | ucnv_fromAlgorithmic (UConverter *cnv, UConverterType algorithmicType, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *pErrorCode) |
Convert from one external charset to another. | |
U_STABLE int32_t U_EXPORT2 | ucnv_flushCache (void) |
Frees up memory occupied by unused, cached converter shared data. | |
U_STABLE int32_t U_EXPORT2 | ucnv_countAvailable (void) |
Returns the number of available converters, as per the alias file. | |
U_STABLE const char *U_EXPORT2 | ucnv_getAvailableName (int32_t n) |
Gets the canonical converter name of the specified converter from a list of all available converters contained in the alias file. | |
U_STABLE UEnumeration *U_EXPORT2 | ucnv_openAllNames (UErrorCode *pErrorCode) |
Returns a UEnumeration to enumerate all of the canonical converter names, as per the alias file, regardless of the ability to open each converter. | |
U_STABLE uint16_t U_EXPORT2 | ucnv_countAliases (const char *alias, UErrorCode *pErrorCode) |
Gives the number of aliases for a given converter or alias name. | |
U_STABLE const char *U_EXPORT2 | ucnv_getAlias (const char *alias, uint16_t n, UErrorCode *pErrorCode) |
Gives the name of the alias at given index of alias list. | |
U_STABLE void U_EXPORT2 | ucnv_getAliases (const char *alias, const char **aliases, UErrorCode *pErrorCode) |
Fill-up the list of alias names for the given alias. | |
U_STABLE UEnumeration *U_EXPORT2 | ucnv_openStandardNames (const char *convName, const char *standard, UErrorCode *pErrorCode) |
Return a new UEnumeration object for enumerating all the alias names for a given converter that are recognized by a standard. | |
U_STABLE uint16_t U_EXPORT2 | ucnv_countStandards (void) |
Gives the number of standards associated to converter names. | |
U_STABLE const char *U_EXPORT2 | ucnv_getStandard (uint16_t n, UErrorCode *pErrorCode) |
Gives the name of the standard at given index of standard list. | |
U_STABLE const char *U_EXPORT2 | ucnv_getStandardName (const char *name, const char *standard, UErrorCode *pErrorCode) |
Returns a standard name for a given converter name. | |
U_STABLE const char *U_EXPORT2 | ucnv_getCanonicalName (const char *alias, const char *standard, UErrorCode *pErrorCode) |
This function will return the internal canonical converter name of the tagged alias. | |
U_STABLE const char *U_EXPORT2 | ucnv_getDefaultName (void) |
returns the current default converter name. | |
U_STABLE void U_EXPORT2 | ucnv_setDefaultName (const char *name) |
sets the current default converter name. | |
U_STABLE void U_EXPORT2 | ucnv_fixFileSeparator (const UConverter *cnv, UChar *source, int32_t sourceLen) |
Fixes the backslash character mismapping. | |
U_STABLE UBool U_EXPORT2 | ucnv_isAmbiguous (const UConverter *cnv) |
Determines if the converter contains ambiguous mappings of the same character or not. | |
U_STABLE void U_EXPORT2 | ucnv_setFallback (UConverter *cnv, UBool usesFallback) |
Sets the converter to use fallback mapping or not. | |
U_STABLE UBool U_EXPORT2 | ucnv_usesFallback (const UConverter *cnv) |
Determines if the converter uses fallback mappings or not. | |
U_STABLE const char *U_EXPORT2 | ucnv_detectUnicodeSignature (const char *source, int32_t sourceLength, int32_t *signatureLength, UErrorCode *pErrorCode) |
Detects Unicode signature byte sequences at the start of the byte stream and returns the charset name of the indicated Unicode charset. | |
U_DRAFT int32_t U_EXPORT2 | ucnv_fromUCountPending (const UConverter *cnv, UErrorCode *status) |
Returns the number of UChars held in the converter's internal state because more input is needed for completing the conversion. | |
U_DRAFT int32_t U_EXPORT2 | ucnv_toUCountPending (const UConverter *cnv, UErrorCode *status) |
Returns the number of chars held in the converter's internal state because more input is needed for completing the conversion. |
This API is used to convert codepage or character encoded data to and from UTF-16. You can open a converter with ucnv_open(). With that converter, you can get its properties, set options, convert your data and close the converter.
Since many software programs recognize different converter names for different types of converters, there are other functions in this API to iterate over the converter aliases. The functions ucnv_getAvailableName(), ucnv_getAlias() and ucnv_getStandardName() are some of the more frequently used alias functions to get this information.
When a converter encounters an illegal, irregular, invalid or unmappable character its default behavior is to use a substitution character to replace the bad byte sequence. This behavior can be changed by using ucnv_setFromUCallBack() or ucnv_setToUCallBack() on the converter. The header ucnv_err.h defines many other callback actions that can be used instead of a character substitution.
More information about this API can be found in our User's Guide.
Definition in file ucnv.h.
#define U_CNV_SAFECLONE_BUFFERSIZE 1024 |
Definition of a buffer size that is designed to be large enough for converters to be cloned with ucnv_safeClone().
#define UCNV_GET_MAX_BYTES_FOR_STRING | ( | length, | |||
maxCharSize | ) | (((int32_t)(length)+10)*(int32_t)(maxCharSize)) |
Calculates the size of a buffer for conversion from Unicode to a charset.
The calculated size is guaranteed to be sufficient for this conversion.
It takes into account initial and final non-character bytes that are output by some converters. It does not take into account callbacks which output more than one charset character sequence per call, like escape callbacks. The default (substitution) callback only outputs one charset character sequence.
length | Number of UChars to be converted. | |
maxCharSize | Return value from ucnv_getMaxCharSize() for the converter that will be used. |
#define UCNV_LOCALE_OPTION_STRING ",locale=" |
#define UCNV_MAX_CONVERTER_NAME_LENGTH 60 |
#define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) |
#define UCNV_OPTION_SEP_CHAR ',' |
#define UCNV_OPTION_SEP_STRING "," |
#define UCNV_SI 0x0F |
#define UCNV_SO 0x0E |
#define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" |
Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages.
Swaps Unicode mappings for EBCDIC LF and NL codes, as used on S/390 (z/OS) Unix System Services (Open Edition). For example, ucnv_open("ibm-1047,swaplfnl", &errorCode); See convrtrs.txt.
#define UCNV_VALUE_SEP_CHAR '=' |
#define UCNV_VALUE_SEP_STRING "=" |
#define UCNV_VERSION_OPTION_STRING ",version=" |
enum UConverterPlatform |
Enum for specifying which platform a converter ID refers to.
The use of platform/CCSID is not recommended. See ucnv_openCCSID().
enum UConverterType |
enum UConverterUnicodeSet |
Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet().
U_STABLE void U_EXPORT2 ucnv_close | ( | UConverter * | converter | ) |
Deletes the unicode converter and releases resources associated with just this instance.
Does not free up shared converter tables.
converter | the converter object to be deleted |
U_STABLE int U_EXPORT2 ucnv_compareNames | ( | const char * | name1, | |
const char * | name2 | |||
) |
Do a fuzzy compare of two converter/alias names.
The comparison is case-insensitive. It also ignores the characters '-', '_', and ' ' (dash, underscore, and space). Thus the strings "UTF-8", "utf_8", and "Utf 8" are exactly equivalent.
name1 | a converter name or alias, zero-terminated | |
name2 | a converter name or alias, zero-terminated |
U_STABLE int32_t U_EXPORT2 ucnv_convert | ( | const char * | toConverterName, | |
const char * | fromConverterName, | |||
char * | target, | |||
int32_t | targetCapacity, | |||
const char * | source, | |||
int32_t | sourceLength, | |||
UErrorCode * | pErrorCode | |||
) |
Convert from one external charset to another.
Internally, two converters are opened according to the name arguments, then the text is converted to and from the 16-bit Unicode "pivot" using ucnv_convertEx(), then the converters are closed again.
This is a convenience function, not an efficient way to convert a lot of text: ucnv_convert()
The function returns when one of the following is true:
toConverterName | The name of the converter that is used to convert from the UTF-16 pivot buffer to the target. | |
fromConverterName | The name of the converter that is used to convert from the source to the UTF-16 pivot buffer. | |
target | Pointer to the output buffer. | |
targetCapacity | Capacity of the target, in bytes. | |
source | Pointer to the input buffer. | |
sourceLength | Length of the input text, in bytes, or -1 for NUL-terminated input. | |
pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE void U_EXPORT2 ucnv_convertEx | ( | UConverter * | targetCnv, | |
UConverter * | sourceCnv, | |||
char ** | target, | |||
const char * | targetLimit, | |||
const char ** | source, | |||
const char * | sourceLimit, | |||
UChar * | pivotStart, | |||
UChar ** | pivotSource, | |||
UChar ** | pivotTarget, | |||
const UChar * | pivotLimit, | |||
UBool | reset, | |||
UBool | flush, | |||
UErrorCode * | pErrorCode | |||
) |
Convert from one external charset to another using two existing UConverters.
Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() - are used, "pivoting" through 16-bit Unicode.
There is a similar function, ucnv_convert(), which has the following limitations:
By contrast, ucnv_convertEx()
ucnv_convertEx() also provides further convenience:
The function returns when one of the following is true:
Limitation compared to the direct use of ucnv_fromUnicode() and ucnv_toUnicode(): ucnv_convertEx() does not provide offset information.
Limitation compared to ucnv_fromUChars() and ucnv_toUChars(): ucnv_convertEx() does not support preflighting directly.
Sample code for converting a single string from one external charset to UTF-8, ignoring the location of errors:
int32_t myToUTF8(UConverter *cnv, const char *s, int32_t length, char *u8, int32_t capacity, UErrorCode *pErrorCode) { UConverter *utf8Cnv; char *target; if(U_FAILURE(*pErrorCode)) { return 0; } utf8Cnv=myGetCachedUTF8Converter(pErrorCode); if(U_FAILURE(*pErrorCode)) { return 0; } target=u8; ucnv_convertEx(cnv, utf8Cnv, &target, u8+capacity, &s, length>=0 ? s+length : NULL, NULL, NULL, NULL, NULL, TRUE, TRUE, pErrorCode); myReleaseCachedUTF8Converter(utf8Cnv); // return the output string length, but without preflighting return (int32_t)(target-u8); }
targetCnv | Output converter, used to convert from the UTF-16 pivot to the target using ucnv_fromUnicode(). | |
sourceCnv | Input converter, used to convert from the source to the UTF-16 pivot using ucnv_toUnicode(). | |
target | I/O parameter, same as for ucnv_fromUChars(). Input: *target points to the beginning of the target buffer. Output: *target points to the first unit after the last char written. | |
targetLimit | Pointer to the first unit after the target buffer. | |
source | I/O parameter, same as for ucnv_toUChars(). Input: *source points to the beginning of the source buffer. Output: *source points to the first unit after the last char read. | |
sourceLimit | Pointer to the first unit after the source buffer. | |
pivotStart | Pointer to the UTF-16 pivot buffer. If pivotStart==NULL, then an internal buffer is used and the other pivot arguments are ignored and can be NULL as well. | |
pivotSource | I/O parameter, same as source in ucnv_fromUChars() for conversion from the pivot buffer to the target buffer. | |
pivotTarget | I/O parameter, same as target in ucnv_toUChars() for conversion from the source buffer to the pivot buffer. It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit and pivotStart<pivotLimit (unless pivotStart==NULL). | |
pivotLimit | Pointer to the first unit after the pivot buffer. | |
reset | If TRUE, then ucnv_resetToUnicode(sourceCnv) and ucnv_resetFromUnicode(targetCnv) are called, and the pivot pointers are reset (*pivotTarget=*pivotSource=pivotStart). | |
flush | If true, indicates the end of the input. Passed directly to ucnv_toUnicode(), and carried over to ucnv_fromUnicode() when the source is empty as well. | |
pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. U_BUFFER_OVERFLOW_ERROR always refers to the target buffer because overflows into the pivot buffer are handled internally. Other conversion errors are from the source-to-pivot conversion if *pivotSource==pivotStart, otherwise from the pivot-to-target conversion. |
U_STABLE uint16_t U_EXPORT2 ucnv_countAliases | ( | const char * | alias, | |
UErrorCode * | pErrorCode | |||
) |
Gives the number of aliases for a given converter or alias name.
If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING. This method only enumerates the listed entries in the alias file.
alias | alias name | |
pErrorCode | error status |
U_STABLE int32_t U_EXPORT2 ucnv_countAvailable | ( | void | ) |
Returns the number of available converters, as per the alias file.
U_STABLE uint16_t U_EXPORT2 ucnv_countStandards | ( | void | ) |
Gives the number of standards associated to converter names.
U_STABLE const char* U_EXPORT2 ucnv_detectUnicodeSignature | ( | const char * | source, | |
int32_t | sourceLength, | |||
int32_t * | signatureLength, | |||
UErrorCode * | pErrorCode | |||
) |
Detects Unicode signature byte sequences at the start of the byte stream and returns the charset name of the indicated Unicode charset.
NULL is returned when no Unicode signature is recognized. The number of bytes in the signature is output as well.
The caller can ucnv_open() a converter using the charset name. The first code unit (UChar) from the start of the stream will be U+FEFF (the Unicode BOM/signature character) and can usually be ignored.
For most Unicode charsets it is also possible to ignore the indicated number of initial stream bytes and start converting after them. However, there are stateful Unicode charsets (UTF-7 and BOCU-1) for which this will not work. Therefore, it is best to ignore the first output UChar instead of the input signature bytes.
Usage:
UErrorCode err = U_ZERO_ERROR; char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; int32_t signatureLength = 0; char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); UConverter *conv = NULL; UChar output[100]; UChar *target = output, *out; char *source = input; if(encoding!=NULL && U_SUCCESS(err)){ // should signature be discarded ? conv = ucnv_open(encoding, &err); // do the conversion ucnv_toUnicode(conv, target, output + sizeof(output)/U_SIZEOF_UCHAR, source, input + sizeof(input), NULL, TRUE, &err); out = output; if (discardSignature){ ++out; // ignore initial U+FEFF } while(out != target) { printf("%04x ", *out++); } puts(""); }
source | The source string in which the signature should be detected. | |
sourceLength | Length of the input string, or -1 if terminated with a NUL byte. | |
signatureLength | A pointer to int32_t to receive the number of bytes that make up the signature of the detected UTF. 0 if not detected. Can be a NULL pointer. | |
pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE void U_EXPORT2 ucnv_fixFileSeparator | ( | const UConverter * | cnv, | |
UChar * | source, | |||
int32_t | sourceLen | |||
) |
Fixes the backslash character mismapping.
For example, in SJIS, the backslash character in the ASCII portion is also used to represent the yen currency sign. When mapping from Unicode character 0x005C, it's unclear whether to map the character back to yen or backslash in SJIS. This function will take the input buffer and replace all the yen sign characters with backslash. This is necessary when the user tries to open a file with the input buffer on Windows. This function will test the converter to see whether such mapping is required. You can sometimes avoid using this function by using the correct version of Shift-JIS.
cnv | The converter representing the target codepage. | |
source | the input buffer to be fixed | |
sourceLen | the length of the input buffer |
U_STABLE int32_t U_EXPORT2 ucnv_flushCache | ( | void | ) |
Frees up memory occupied by unused, cached converter shared data.
U_STABLE int32_t U_EXPORT2 ucnv_fromAlgorithmic | ( | UConverter * | cnv, | |
UConverterType | algorithmicType, | |||
char * | target, | |||
int32_t | targetCapacity, | |||
const char * | source, | |||
int32_t | sourceLength, | |||
UErrorCode * | pErrorCode | |||
) |
Convert from one external charset to another.
Internally, the text is converted to and from the 16-bit Unicode "pivot" using ucnv_convertEx(). ucnv_fromAlgorithmic() works exactly like ucnv_convert() except that the two converters need not be looked up and opened completely.
The source-to-pivot conversion uses a purely algorithmic converter according to the specified type, e.g., UCNV_UTF8 for a UTF-8 converter. The pivot-to-target conversion uses the cnv converter parameter.
Internally, the algorithmic converter is opened and closed for each function call, which is more efficient than using the public ucnv_open() but somewhat less efficient than only resetting an existing converter and using ucnv_convertEx().
This function is more convenient than ucnv_convertEx() for single-string conversions, especially when "preflighting" is desired (returning the length of the complete output even if it does not fit into the target buffer; see the User Guide Strings chapter). See ucnv_convert() for details.
cnv | The converter that is used to convert from the UTF-16 pivot buffer to the target. | |
algorithmicType | UConverterType constant identifying the desired source charset as a purely algorithmic converter. Those are converters for Unicode charsets like UTF-8, BOCU-1, SCSU, UTF-7, IMAP-mailbox-name, etc., as well as US-ASCII and ISO-8859-1. | |
target | Pointer to the output buffer. | |
targetCapacity | Capacity of the target, in bytes. | |
source | Pointer to the input buffer. | |
sourceLength | Length of the input text, in bytes | |
pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE int32_t U_EXPORT2 ucnv_fromUChars | ( | UConverter * | cnv, | |
char * | dest, | |||
int32_t | destCapacity, | |||
const UChar * | src, | |||
int32_t | srcLength, | |||
UErrorCode * | pErrorCode | |||
) |
Convert the Unicode string into a codepage string using an existing UConverter.
The output string is NUL-terminated if possible.
This function is a more convenient but less powerful version of ucnv_fromUnicode(). It is only useful for whole strings, not for streaming conversion.
The maximum output buffer capacity required (barring output from callbacks) will be UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)).
cnv | the converter object to be used (ucnv_resetFromUnicode() will be called) | |
src | the input Unicode string | |
srcLength | the input string length, or -1 if NUL-terminated | |
dest | destination string buffer, can be NULL if destCapacity==0 | |
destCapacity | the number of chars available at dest | |
pErrorCode | normal ICU error code; common error codes that may be set by this function include U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, U_ILLEGAL_ARGUMENT_ERROR, and conversion errors |
U_DRAFT int32_t U_EXPORT2 ucnv_fromUCountPending | ( | const UConverter * | cnv, | |
UErrorCode * | status | |||
) |
Returns the number of UChars held in the converter's internal state because more input is needed for completing the conversion.
This function is useful for mapping semantics of ICU's converter interface to those of iconv, and this information is not needed for normal conversion.
cnv | The converter in which the input is held | |
status | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE void U_EXPORT2 ucnv_fromUnicode | ( | UConverter * | converter, | |
char ** | target, | |||
const char * | targetLimit, | |||
const UChar ** | source, | |||
const UChar * | sourceLimit, | |||
int32_t * | offsets, | |||
UBool | flush, | |||
UErrorCode * | err | |||
) |
Converts an array of unicode characters to an array of codepage characters.
This function is optimized for converting a continuous stream of data in buffer-sized chunks, where the entire source and target does not fit in available buffers.
The source pointer is an in/out parameter. It starts out pointing where the conversion is to begin, and ends up pointing after the last UChar consumed.
Target similarly starts out pointing at the first available byte in the output buffer, and ends up pointing after the last byte written to the output.
The converter always attempts to consume the entire source buffer, unless (1.) the target buffer is full, or (2.) a failing error is returned from the current callback function. When a successful error status has been returned, it means that all of the source buffer has been consumed. At that point, the caller should reset the source and sourceLimit pointers to point to the next chunk.
At the end of the stream (flush==TRUE), the input is completely consumed when *source==sourceLimit and no error code is set. The converter object is then automatically reset by this function. (This means that a converter need not be reset explicitly between data streams if it finishes the previous stream without errors.)
This is a stateful conversion. Additionally, even when all source data has been consumed, some data may be in the converter's internal state. Call this function repeatedly, updating the target pointers with the next empty chunk of target in case of a U_BUFFER_OVERFLOW_ERROR, and updating the source pointers with the next chunk of source when a successful error status is returned, until there are no more chunks of source data.
converter | the Unicode converter | |
target | I/O parameter. Input : Points to the beginning of the buffer to copy codepage characters to. Output : points to after the last codepage character copied to target . | |
targetLimit | the pointer just after last of the target buffer | |
source | I/O parameter, pointer to pointer to the source Unicode character buffer. | |
sourceLimit | the pointer just after the last of the source buffer | |
offsets | if NULL is passed, nothing will happen to it, otherwise it needs to have the same number of allocated cells as target . Will fill in offsets from target to source pointer, e.g., if offsets[3] is equal to 6, it means that target[3] was a result of transcoding source[6]. For output data carried across calls, and other data without a specific source character (such as from escape sequences or callbacks), -1 will be placed for offsets. | |
flush | set to TRUE if the current source buffer is the last available chunk of the source, FALSE otherwise. Note that if a failing status is returned, this function may have to be called multiple times with flush set to TRUE until the source buffer is consumed. | |
err | the error status. U_ILLEGAL_ARGUMENT_ERROR will be set if the converter is NULL . U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is still data to be written to the target. |
U_STABLE const char* U_EXPORT2 ucnv_getAlias | ( | const char * | alias, | |
uint16_t | n, | |||
UErrorCode * | pErrorCode | |||
) |
Gives the name of the alias at given index of alias list.
This method only enumerates the listed entries in the alias file. If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
alias | alias name | |
n | index in alias list | |
pErrorCode | result of operation |
U_STABLE void U_EXPORT2 ucnv_getAliases | ( | const char * | alias, | |
const char ** | aliases, | |||
UErrorCode * | pErrorCode | |||
) |
Fill-up the list of alias names for the given alias.
This method only enumerates the listed entries in the alias file. If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
alias | alias name | |
aliases | fill-in list, aliases is a pointer to an array of ucnv_countAliases() string-pointers (const char * ) that will be filled in. The strings themselves are owned by the library. | |
pErrorCode | result of operation |
U_STABLE const char* U_EXPORT2 ucnv_getAvailableName | ( | int32_t | n | ) |
Gets the canonical converter name of the specified converter from a list of all available converters contained in the alias file.
All converters in this list can be opened.
n | the index to a converter available on the system (in the range [0..ucnv_countAvailable()-1] ) |
Returns NULL if the index is out of bounds.
U_STABLE const char* U_EXPORT2 ucnv_getCanonicalName | ( | const char * | alias, | |
const char * | standard, | |||
UErrorCode * | pErrorCode | |||
) |
This function will return the internal canonical converter name of the tagged alias.
This is the opposite of ucnv_openStandardNames, which returns the tagged alias given the canonical name.
Example alias table:
conv alias1 { STANDARD1 } alias2 { STANDARD1* }
Result of ucnv_getCanonicalName("alias1", "STANDARD1") from example alias table:
"conv"
NULL is returned. The returned string is owned by the library.
U_STABLE int32_t U_EXPORT2 ucnv_getCCSID | ( | const UConverter * | converter, | |
UErrorCode * | err | |||
) |
Gets a codepage number associated with the converter.
This is not guaranteed to be the one used to create the converter. Some converters do not represent platform registered codepages and return zero for the codepage number. The error code fill-in parameter indicates if the codepage number is available. Does not check if the converter is NULL or if converter's data table is NULL.
Important: The use of CCSIDs is not recommended because it is limited to only two platforms in principle and only one (UCNV_IBM) in the current ICU converter API. Also, CCSIDs are insufficient to identify IBM Unicode conversion tables precisely. For more details see ucnv_openCCSID().
converter | the Unicode converter | |
err | the error status code. |
U_STABLE const char* U_EXPORT2 ucnv_getDefaultName | ( | void | ) |
Returns the current default converter name.
NULL is returned. Storage owned by the library.
U_STABLE int32_t U_EXPORT2 ucnv_getDisplayName | ( | const UConverter * | converter, | |
const char * | displayLocale, | |||
UChar * | displayName, | |||
int32_t | displayNameCapacity, | |||
UErrorCode * | err | |||
) |
Returns the display name of the converter passed in based on the Locale passed in.
If the locale contains no display name, the internal ASCII name will be filled in.
converter | the Unicode converter. | |
displayLocale | the specific locale we want to localize for | |
displayName | user provided buffer to be filled in | |
displayNameCapacity | size of displayName Buffer | |
err | error status code |
U_STABLE void U_EXPORT2 ucnv_getFromUCallBack | ( | const UConverter * | converter, | |
UConverterFromUCallback * | action, | |||
const void ** | context | |||
) |
Gets the current callback function used by the converter when illegal or invalid Unicode sequence is found.
Context pointers are always owned by the caller.
converter | the unicode converter | |
action | fillin: returns the callback function pointer | |
context | fillin: returns the callback's private void* context |
U_STABLE void U_EXPORT2 ucnv_getInvalidChars | ( | const UConverter * | converter, | |
char * | errBytes, | |||
int8_t * | len, | |||
UErrorCode * | err | |||
) |
Fills in the output parameter, errBytes, with the error characters from the last failing conversion.
converter | the Unicode converter | |
errBytes | the codepage bytes which were in error | |
len | on input the capacity of errBytes, on output the number of bytes which were copied to it | |
err | the error status code. If the errBytes array is too small, an U_INDEX_OUTOFBOUNDS_ERROR will be returned. |
U_STABLE void U_EXPORT2 ucnv_getInvalidUChars | ( | const UConverter * | converter, | |
UChar * | errUChars, | |||
int8_t * | len, | |||
UErrorCode * | err | |||
) |
Fills in the output parameter, errUChars, with the error characters from the last failing conversion.
converter | the Unicode converter | |
errUChars | the UChars which were in error | |
len | on input the capacity of errUChars, on output the number of UChars which were copied to it | |
err | the error status code. If the errUChars array is too small, an U_INDEX_OUTOFBOUNDS_ERROR will be returned. |
U_STABLE int8_t U_EXPORT2 ucnv_getMaxCharSize | ( | const UConverter * | converter | ) |
Returns the maximum number of bytes that are output per UChar in conversion from Unicode using this converter.
The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING to calculate the size of a target buffer for conversion from Unicode.
Note: Before ICU 2.8, this function did not return reliable numbers for some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS.
This number may not be the same as the maximum number of bytes per "conversion unit". In other words, it may not be the intuitively expected number of bytes per character that would be published for a charset, and may not fulfill any other purpose than the allocation of an output buffer of guaranteed sufficient size for a given input length and converter.
Examples for special cases that are taken into account:
The number returned here does not take into account (see UCNV_GET_MAX_BYTES_FOR_STRING):
Examples for returned values:
converter | The Unicode converter. |
U_STABLE int8_t U_EXPORT2 ucnv_getMinCharSize | ( | const UConverter * | converter | ) |
Returns the minimum byte length for characters in this codepage.
This is usually either 1 or 2.
converter | the Unicode converter |
U_STABLE const char* U_EXPORT2 ucnv_getName | ( | const UConverter * | converter, | |
UErrorCode * | err | |||
) |
Gets the internal, canonical name of the converter (zero-terminated).
The lifetime of the returned string will be that of the converter passed to this function.
converter | the Unicode converter | |
err | UErrorCode status |
U_STABLE UChar32 U_EXPORT2 ucnv_getNextUChar | ( | UConverter * | converter, | |
const char ** | source, | |||
const char * | sourceLimit, | |||
UErrorCode * | err | |||
) |
Convert a codepage buffer into Unicode one character at a time.
The input is completely consumed when the U_INDEX_OUTOFBOUNDS_ERROR is set.
Advantage compared to ucnv_toUnicode() or ucnv_toUChars():
Limitations compared to ucnv_toUnicode():
It is possible to "mix" ucnv_getNextUChar() and ucnv_toUnicode() because ucnv_getNextUChar() uses the current state of the converter (unlike ucnv_toUChars() which always resets first). However, if ucnv_getNextUChar() is called after ucnv_toUnicode() stopped in the middle of a character sequence (with flush=FALSE), then ucnv_getNextUChar() will always use the slower ucnv_toUnicode() internally until the next character boundary. (This is new in ICU 2.6. In earlier releases, ucnv_getNextUChar() had to start at a character boundary.)
Instead of using ucnv_getNextUChar(), it is recommended to convert using ucnv_toUnicode() or ucnv_toUChars() and then iterate over the text using U16_NEXT() or a UCharIterator (uiter.h) or a C++ CharacterIterator or similar. This allows streaming conversion and offset output, for example.
Handling of surrogate pairs and supplementary-plane code points:
There are two different kinds of codepages that provide mappings for surrogate characters:
converter | an open UConverter | |
source | the address of a pointer to the codepage buffer, will be updated to point after the bytes consumed in the conversion call. | |
sourceLimit | points to the end of the input buffer | |
err | fills in error status (see ucnv_toUnicode) U_INDEX_OUTOFBOUNDS_ERROR will be set if the input is empty or does not convert to any output (e.g.: pure state-change codes SI/SO, escape sequences for ISO 2022, or if the callback did not output anything, ...). This function will not set a U_BUFFER_OVERFLOW_ERROR because the "buffer" is the return code. However, there might be subsequent output stored in the converter object that will be returned in following calls to this function. |
U_STABLE UConverterPlatform U_EXPORT2 ucnv_getPlatform | ( | const UConverter * | converter, | |
UErrorCode * | err | |||
) |
Gets a codepage platform associated with the converter.
Currently, only UCNV_IBM will be returned. Does not test if the converter is NULL or if converter's data table is NULL.
converter | the Unicode converter | |
err | the error status code. |
U_STABLE const char* U_EXPORT2 ucnv_getStandard | ( | uint16_t | n, | |
UErrorCode * | pErrorCode | |||
) |
Gives the name of the standard at given index of standard list.
n | index in standard list | |
pErrorCode | result of operation |
U_STABLE const char* U_EXPORT2 ucnv_getStandardName | ( | const char * | name, | |
const char * | standard, | |||
UErrorCode * | pErrorCode | |||
) |
Returns a standard name for a given converter name.
Example alias table:
conv alias1 { STANDARD1 } alias2 { STANDARD1* }
Result of ucnv_getStandardName("conv", "STANDARD1") from example alias table:
"alias2"
name | original converter name | |
standard | name of the standard governing the names; MIME and IANA are such standards | |
pErrorCode | result of operation |
NULL is returned. Owned by the library.
U_STABLE void U_EXPORT2 ucnv_getStarters | ( | const UConverter * | converter, | |
UBool | starters[256], | |||
UErrorCode * | err | |||
) |
Gets the "starter" (lead) bytes for converters of type MBCS.
Will fill in an U_ILLEGAL_ARGUMENT_ERROR if converter passed in is not MBCS. Fills in an array of type UBool, with the value of the byte as offset to the array. For example, if (starters[0x20] == TRUE) at return, it means that the byte 0x20 is a starter byte in this converter. Context pointers are always owned by the caller.
converter | a valid, opened converter of type MBCS | |
starters | an array of size 256 to be filled in | |
err | error status, U_ILLEGAL_ARGUMENT_ERROR if the converter is not a type which can return starters. |
U_STABLE void U_EXPORT2 ucnv_getSubstChars | ( | const UConverter * | converter, | |
char * | subChars, | |||
int8_t * | len, | |||
UErrorCode * | err | |||
) |
Fills in the output parameter, subChars, with the substitution characters as multiple bytes.
converter | the Unicode converter | |
subChars | the substitution characters | |
len | on input the capacity of subChars, on output the number of bytes copied to it | |
err | the outgoing error status code. If the substitution character array is too small, an U_INDEX_OUTOFBOUNDS_ERROR will be returned. |
U_STABLE void U_EXPORT2 ucnv_getToUCallBack | ( | const UConverter * | converter, | |
UConverterToUCallback * | action, | |||
const void ** | context | |||
) |
Gets the current callback function used by the converter when an illegal or invalid codepage sequence is found.
Context pointers are always owned by the caller.
converter | the unicode converter | |
action | fillin: returns the callback function pointer | |
context | fillin: returns the callback's private void* context |
U_STABLE UConverterType U_EXPORT2 ucnv_getType | ( | const UConverter * | converter | ) |
Gets the type of the converter, e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, EBCDIC_STATEFUL, LATIN_1.
converter | a valid, opened converter |
U_STABLE void U_EXPORT2 ucnv_getUnicodeSet | ( | const UConverter * | cnv, | |
USet * | setFillIn, | |||
UConverterUnicodeSet | whichSet, | |||
UErrorCode * | pErrorCode | |||
) |
Returns the set of Unicode code points that can be converted by an ICU converter.
The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be roundtrip-converted (converted without any data loss) with the converter. This set will not include code points that have fallback mappings or are only the result of reverse fallback mappings. See UTR #22 "Character Mapping Markup Language" at http://www.unicode.org/reports/tr22/
This is useful for example for
In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
cnv | The converter for which a set is requested. | |
setFillIn | A valid USet *. It will be cleared by this function before the converter's specific set is filled into the USet. | |
whichSet | A UConverterUnicodeSet selector; currently UCNV_ROUNDTRIP_SET is the only supported value. | |
pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE UBool U_EXPORT2 ucnv_isAmbiguous | ( | const UConverter * | cnv | ) |
Determines if the converter contains ambiguous mappings of the same character or not.
cnv | the converter to be tested |
U_STABLE UConverter* U_EXPORT2 ucnv_open | ( | const char * | converterName, | |
UErrorCode * | err | |||
) |
Creates a UConverter object with the names specified as a C string.
The actual name will be resolved with the alias file using a case-insensitive string comparison that ignores the delimiters '-', '_', and ' ' (dash, underscore, and space). E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent. If NULL is passed for the converter name, it will create one with the ucnv_getDefaultName() return value.
A converter name for ICU 1.5 and above may contain options like a locale specification to control the specific behavior of the newly instantiated converter. The meaning of the options depends on the particular converter. If an option is not defined for or recognized by a given converter, then it is ignored.
Options are appended to the converter name string, with a UCNV_OPTION_SEP_CHAR between the name and the first option and also between adjacent options.
If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
The conversion behavior and names can vary between platforms. ICU may convert some characters differently from other platforms. Details on this topic are in the User's Guide.
converterName | Name of the uconv table, may have options appended | |
err | outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR |
Returns NULL if an error occurred.
U_STABLE UEnumeration* U_EXPORT2 ucnv_openAllNames | ( | UErrorCode * | pErrorCode | ) |
Returns a UEnumeration to enumerate all of the canonical converter names, as per the alias file, regardless of the ability to open each converter.
U_STABLE UConverter* U_EXPORT2 ucnv_openCCSID | ( | int32_t | codepage, | |
UConverterPlatform | platform, | |||
UErrorCode * | err | |||
) |
Creates a UConverter object from a CCSID number and platform pair.
Note that the usefulness of this function is limited to platforms with numeric encoding IDs. Only IBM and Microsoft platforms use numeric (16-bit) identifiers for encodings.
In addition, IBM CCSIDs and Unicode conversion tables are not 1:1 related. For many IBM CCSIDs there are multiple (up to six) Unicode conversion tables, and for some Unicode conversion tables there are multiple CCSIDs. Some "alternate" Unicode conversion tables are provided by the IBM CDRA conversion table registry. The most prominent example of a systematic modification of conversion tables that is not provided in the form of conversion table files in the repository is that S/390 Unix System Services swaps the codes for Line Feed and New Line in all EBCDIC codepages, which requires such a swap in the Unicode conversion tables as well.
Only IBM default conversion tables are accessible with ucnv_openCCSID(). ucnv_getCCSID() will return the same CCSID for all conversion tables that are associated with that CCSID.
Currently, the only "platform" supported in the ICU converter API is UCNV_IBM.
In summary, the use of CCSIDs and the associated API functions is not recommended.
In order to open a converter with the default IBM CDRA Unicode conversion table, you can use this function or use the prefix "ibm-":
char name[20]; sprintf(name, "ibm-%hu", ccsid); cnv=ucnv_open(name, &errorCode);
In order to open a converter with the IBM S/390 Unix System Services variant of a Unicode/EBCDIC conversion table, you can use the prefix "ibm-" together with the option string UCNV_SWAP_LFNL_OPTION_STRING:
char name[20]; sprintf(name, "ibm-%hu" UCNV_SWAP_LFNL_OPTION_STRING, ccsid); cnv=ucnv_open(name, &errorCode);
In order to open a converter from a Microsoft codepage number, use the prefix "cp":
char name[20]; sprintf(name, "cp%hu", codepageID); cnv=ucnv_open(name, &errorCode);
If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
codepage | codepage number to create | |
platform | the platform in which the codepage number exists | |
err | error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR |
Returns NULL if an error occurred.
U_STABLE UConverter* U_EXPORT2 ucnv_openPackage | ( | const char * | packageName, | |
const char * | converterName, | |||
UErrorCode * | err | |||
) |
Creates a UConverter object specified from a packageName and a converterName.
The packageName and converterName must point to an ICU udata object, as defined by udata_open( packageName, "cnv", converterName, err)
or equivalent. Typically, packageName will refer to a (.dat) file, or to a package registered with udata_setAppData(). Using a full file or directory pathname for packageName is deprecated.
The name will NOT be looked up in the alias mechanism, nor will the converter be stored in the converter cache or the alias table. The only way to open further converters is call this function multiple times, or use the ucnv_safeClone() function to clone a 'master' converter.
A future version of ICU may add alias table lookups and/or caching to this function.
Example Use: cnv = ucnv_openPackage("myapp", "myconverter", &err);
packageName | name of the package (equivalent to 'path' in udata_open() call) | |
converterName | name of the data item to be used, without suffix. | |
err | outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR |
Returns NULL if an error occurred.
U_STABLE UEnumeration* U_EXPORT2 ucnv_openStandardNames | ( | const char * | convName, | |
const char * | standard, | |||
UErrorCode * | pErrorCode | |||
) |
Return a new UEnumeration object for enumerating all the alias names for a given converter that are recognized by a standard.
This method only enumerates the listed entries in the alias file. The convrtrs.txt file can be modified to change the results of this function. The first result in this list is the same result given by ucnv_getStandardName, which is the default alias for the specified standard name. The returned object must be closed with uenum_close when you are done with the object.
convName | original converter name | |
standard | name of the standard governing the names; MIME and IANA are such standards | |
pErrorCode | The error code |
U_STABLE UConverter* U_EXPORT2 ucnv_openU | ( | const UChar * | name, | |
UErrorCode * | err | |||
) |
Creates a Unicode converter with the names specified as unicode string.
The name should be limited to the ASCII-7 alphanumerics range. The actual name will be resolved with the alias file using a case-insensitive string comparison that ignores the delimiters '-', '_', and ' ' (dash, underscore, and space). E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent. If NULL is passed for the converter name, it will create one with the ucnv_getDefaultName() return value. If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
name | name of the uconv table in a zero-terminated Unicode string | |
err | outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR |
Returns NULL if an error occurred.
U_STABLE void U_EXPORT2 ucnv_reset | ( | UConverter * | converter | ) |
Resets the state of a converter to the default state.
This is used in the case of an error, to restart a conversion from a known default state. It will also empty the internal output buffers.
converter | the Unicode converter |
U_STABLE void U_EXPORT2 ucnv_resetFromUnicode | ( | UConverter * | converter | ) |
Resets the from-Unicode part of a converter state to the default state.
This is used in the case of an error to restart a conversion from Unicode to a known default state. It will also empty the internal output buffers used for the conversion from Unicode codepoints.
converter | the Unicode converter |
U_STABLE void U_EXPORT2 ucnv_resetToUnicode | ( | UConverter * | converter | ) |
Resets the to-Unicode part of a converter state to the default state.
This is used in the case of an error to restart a conversion to Unicode to a known default state. It will also empty the internal output buffers used for the conversion to Unicode codepoints.
converter | the Unicode converter |
U_STABLE UConverter* U_EXPORT2 ucnv_safeClone | ( | const UConverter * | cnv, | |
void * | stackBuffer, | |||
int32_t * | pBufferSize, | |||
UErrorCode * | status | |||
) |
Thread safe converter cloning operation.
For most efficient operation, pass in a stackBuffer (and a *pBufferSize) with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. If the buffer size is sufficient, then the clone will use the stack buffer; otherwise, it will be allocated, and *pBufferSize will indicate the actual size. (This should not occur with U_CNV_SAFECLONE_BUFFERSIZE.)
You must ucnv_close() the clone in any case.
If *pBufferSize==0, (regardless of whether stackBuffer==NULL or not) then *pBufferSize will be changed to a sufficient size for cloning this converter, without actually cloning the converter ("pure pre-flighting").
If *pBufferSize is greater than zero but not large enough for a stack-based clone, then the converter is cloned using newly allocated memory and *pBufferSize is changed to the necessary size.
If the converter clone fits into the stack buffer but the stack buffer is not sufficiently aligned for the clone, then the clone will use an adjusted pointer and use an accordingly smaller buffer size.
cnv | converter to be cloned | |
stackBuffer | user allocated space for the new clone. If NULL new memory will be allocated. If buffer is not large enough, new memory will be allocated. Clients can use the U_CNV_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. | |
pBufferSize | pointer to size of allocated space. pBufferSize must not be NULL. | |
status | to indicate whether the operation went on smoothly or there were errors. An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were necessary. However, it is better to check if *pBufferSize grew for checking for allocations because warning codes can be overridden by subsequent function calls. |
U_STABLE void U_EXPORT2 ucnv_setDefaultName | ( | const char * | name | ) |
sets the current default converter name.
Caller must own the storage for 'name' and preserve it indefinitely.
name | the converter name to be the default (must exist). |
U_STABLE void U_EXPORT2 ucnv_setFallback | ( | UConverter * | cnv, | |
UBool | usesFallback | |||
) |
Sets the converter to use fallback mapping or not.
cnv | The converter to set the fallback mapping usage on. | |
usesFallback | TRUE if the user wants the converter to take advantage of the fallback mapping, FALSE otherwise. |
U_STABLE void U_EXPORT2 ucnv_setFromUCallBack | ( | UConverter * | converter, | |
UConverterFromUCallback | newAction, | |||
const void * | newContext, | |||
UConverterFromUCallback * | oldAction, | |||
const void ** | oldContext, | |||
UErrorCode * | err | |||
) |
Changes the current callback function used by the converter when an illegal or invalid sequence is found.
Context pointers are always owned by the caller. Predefined actions and contexts can be found in the ucnv_err.h header.
converter | the unicode converter | |
newAction | the new callback function | |
newContext | the new fromUnicode callback context pointer. This can be NULL. | |
oldAction | fillin: returns the old callback function pointer. This can be NULL. | |
oldContext | fillin: returns the old callback's private void* context. This can be NULL. | |
err | The error code status |
U_STABLE void U_EXPORT2 ucnv_setSubstChars | ( | UConverter * | converter, | |
const char * | subChars, | |||
int8_t | len, | |||
UErrorCode * | err | |||
) |
Sets the substitution chars when converting from unicode to a codepage.
The substitution is specified as a string of 1-4 bytes, and may contain NULL bytes.
converter | the Unicode converter | |
subChars | the substitution character byte sequence we want set | |
len | the number of bytes in subChars | |
err | the error status code. U_INDEX_OUTOFBOUNDS_ERROR if len is bigger than the maximum number of bytes allowed in subchars |
U_STABLE void U_EXPORT2 ucnv_setToUCallBack | ( | UConverter * | converter, | |
UConverterToUCallback | newAction, | |||
const void * | newContext, | |||
UConverterToUCallback * | oldAction, | |||
const void ** | oldContext, | |||
UErrorCode * | err | |||
) |
Changes the callback function used by the converter when an illegal or invalid sequence is found.
Context pointers are always owned by the caller. Predefined actions and contexts can be found in the ucnv_err.h header.
converter | the unicode converter | |
newAction | the new callback function | |
newContext | the new toUnicode callback context pointer. This can be NULL. | |
oldAction | fillin: returns the old callback function pointer. This can be NULL. | |
oldContext | fillin: returns the old callback's private void* context. This can be NULL. | |
err | The error code status |
U_STABLE int32_t U_EXPORT2 ucnv_toAlgorithmic | ( | UConverterType | algorithmicType, | |
UConverter * | cnv, | |||
char * | target, | |||
int32_t | targetCapacity, | |||
const char * | source, | |||
int32_t | sourceLength, | |||
UErrorCode * | pErrorCode | |||
) |
Convert from one external charset to another.
Internally, the text is converted to and from the 16-bit Unicode "pivot" using ucnv_convertEx(). ucnv_toAlgorithmic() works exactly like ucnv_convert() except that the two converters need not be looked up and opened completely.
The source-to-pivot conversion uses the cnv converter parameter. The pivot-to-target conversion uses a purely algorithmic converter according to the specified type, e.g., UCNV_UTF8 for a UTF-8 converter.
Internally, the algorithmic converter is opened and closed for each function call, which is more efficient than using the public ucnv_open() but somewhat less efficient than only resetting an existing converter and using ucnv_convertEx().
This function is more convenient than ucnv_convertEx() for single-string conversions, especially when "preflighting" is desired (returning the length of the complete output even if it does not fit into the target buffer; see the User Guide Strings chapter). See ucnv_convert() for details.
algorithmicType | UConverterType constant identifying the desired target charset as a purely algorithmic converter. Those are converters for Unicode charsets like UTF-8, BOCU-1, SCSU, UTF-7, IMAP-mailbox-name, etc., as well as US-ASCII and ISO-8859-1. | |
cnv | The converter that is used to convert from the source to the UTF-16 pivot buffer. | |
target | Pointer to the output buffer. | |
targetCapacity | Capacity of the target, in bytes. | |
source | Pointer to the input buffer. | |
sourceLength | Length of the input text, in bytes | |
pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE int32_t U_EXPORT2 ucnv_toUChars | ( | UConverter * | cnv, | |
UChar * | dest, | |||
int32_t | destCapacity, | |||
const char * | src, | |||
int32_t | srcLength, | |||
UErrorCode * | pErrorCode | |||
) |
Convert the codepage string into a Unicode string using an existing UConverter.
The output string is NUL-terminated if possible.
This function is a more convenient but less powerful version of ucnv_toUnicode(). It is only useful for whole strings, not for streaming conversion.
The maximum output buffer capacity required (barring output from callbacks) will be 2*srcLength (each char may be converted into a surrogate pair).
cnv | the converter object to be used (ucnv_resetToUnicode() will be called) | |
src | the input codepage string | |
srcLength | the input string length, or -1 if NUL-terminated | |
dest | destination string buffer, can be NULL if destCapacity==0 | |
destCapacity | the number of UChars available at dest | |
pErrorCode | normal ICU error code; common error codes that may be set by this function include U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, U_ILLEGAL_ARGUMENT_ERROR, and conversion errors |
U_DRAFT int32_t U_EXPORT2 ucnv_toUCountPending | ( | const UConverter * | cnv, | |
UErrorCode * | status | |||
) |
Returns the number of chars held in the converter's internal state because more input is needed for completing the conversion.
This function is useful for mapping semantics of ICU's converter interface to those of iconv, and this information is not needed for normal conversion.
cnv | The converter in which the input is held as internal state | |
status | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
U_STABLE void U_EXPORT2 ucnv_toUnicode | ( | UConverter * | converter, | |
UChar ** | target, | |||
const UChar * | targetLimit, | |||
const char ** | source, | |||
const char * | sourceLimit, | |||
int32_t * | offsets, | |||
UBool | flush, | |||
UErrorCode * | err | |||
) |
Converts a buffer of codepage bytes into an array of Unicode characters (UChars).
This function is optimized for converting a continuous stream of data in buffer-sized chunks, where the entire source and target do not fit in available buffers.
The source pointer is an in/out parameter. It starts out pointing where the conversion is to begin, and ends up pointing after the last byte of source consumed.
Target similarly starts out pointing at the first available UChar in the output buffer, and ends up pointing after the last UChar written to the output. It does NOT necessarily keep UChar sequences together.
The converter always attempts to consume the entire source buffer, unless (1.) the target buffer is full, or (2.) a failing error is returned from the current callback function. When a successful error status has been returned, it means that all of the source buffer has been consumed. At that point, the caller should reset the source and sourceLimit pointers to point to the next chunk.
At the end of the stream (flush==TRUE), the input is completely consumed when *source==sourceLimit and no error code is set. The converter object is then automatically reset by this function. (This means that a converter need not be reset explicitly between data streams if it finishes the previous stream without errors.)
This is a stateful conversion. Additionally, even when all source data has been consumed, some data may be in the converter's internal state. Call this function repeatedly, updating the target pointers with the next empty chunk of target in case of a U_BUFFER_OVERFLOW_ERROR, and updating the source pointers with the next chunk of source when a successful error status is returned, until there are no more chunks of source data.
converter | the Unicode converter | |
target | I/O parameter. Input : Points to the beginning of the buffer to copy UChars into. Output : points to after the last UChar copied. | |
targetLimit | the pointer just after the end of the target buffer | |
source | I/O parameter, pointer to pointer to the source codepage buffer. | |
sourceLimit | the pointer to the byte after the end of the source buffer | |
offsets | If NULL is passed, nothing will happen to it; otherwise it needs to have the same number of allocated cells as target. Will fill in offsets from target to source pointer, e.g., if offsets[3] is equal to 6, it means that target[3] was a result of transcoding source[6]. For output data carried across calls, and other data without a specific source character (such as from escape sequences or callbacks), -1 will be placed for offsets. | |
flush | set to TRUE if the current source buffer is the last available chunk of the source, FALSE otherwise. Note that if a failing status is returned, this function may have to be called multiple times with flush set to TRUE until the source buffer is consumed. | |
err | the error status. U_ILLEGAL_ARGUMENT_ERROR will be set if the converter is NULL. U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is still data to be written to the target. |
U_STABLE UBool U_EXPORT2 ucnv_usesFallback | ( | const UConverter * | cnv | ) |
Determines if the converter uses fallback mappings or not.
cnv | The converter to be tested |
typedef void | ( | U_EXPORT2 * | UConverterFromUCallback | ) | const |
Function pointer for error callback in the codepage to Unicode direction.
Called when an error has occurred in conversion to Unicode, or on open/close of the callback (see reason).
context | Pointer to the callback's private data | |
args | Information about the conversion in progress | |
codeUnits | Points to 'length' bytes of the concerned codepage sequence | |
length | Size (in bytes) of the concerned codepage sequence | |
reason | Defines the reason the callback was invoked | |
pErrorCode | ICU error code in/out parameter. For converter callback functions, set to a conversion error before the call, and the callback may reset it to U_ZERO_ERROR. |