GNU Classpath (0.98) | |
Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2004, 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package java.lang; 40: 41: import gnu.java.lang.CharData; 42: 43: import java.io.Serializable; 44: import java.text.Collator; 45: import java.util.Locale; 46: 47: /** 48: * Wrapper class for the primitive char data type. In addition, this class 49: * allows one to retrieve property information and perform transformations 50: * on the defined characters in the Unicode Standard, Version 4.0.0. 51: * java.lang.Character is designed to be very dynamic, and as such, it 52: * retrieves information on the Unicode character set from a separate 53: * database, gnu.java.lang.CharData, which can be easily upgraded. 54: * 55: * <p>For predicates, boundaries are used to describe 56: * the set of characters for which the method will return true. 57: * This syntax uses fairly normal regular expression notation. 58: * See 5.13 of the Unicode Standard, Version 4.0, for the 59: * boundary specification. 60: * 61: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 62: * for more information on the Unicode Standard. 63: * 64: * @author Tom Tromey (tromey@cygnus.com) 65: * @author Paul N. Fisher 66: * @author Jochen Hoenicke 67: * @author Eric Blake (ebb9@email.byu.edu) 68: * @author Andrew John Hughes (gnu_andrew@member.fsf.org) 69: * @see CharData 70: * @since 1.0 71: * @status partly updated to 1.5; some things still missing 72: */ 73: public final class Character implements Serializable, Comparable<Character> 74: { 75: /** 76: * A subset of Unicode blocks. 77: * 78: * @author Paul N. Fisher 79: * @author Eric Blake (ebb9@email.byu.edu) 80: * @since 1.2 81: */ 82: public static class Subset 83: { 84: /** The name of the subset. */ 85: private final String name; 86: 87: /** 88: * Construct a new subset of characters. 89: * 90: * @param name the name of the subset 91: * @throws NullPointerException if name is null 92: */ 93: protected Subset(String name) 94: { 95: // Note that name.toString() is name, unless name was null. 
96: this.name = name.toString(); 97: } 98: 99: /** 100: * Compares two Subsets for equality. This is <code>final</code>, and 101: * restricts the comparison on the <code>==</code> operator, so it returns 102: * true only for the same object. 103: * 104: * @param o the object to compare 105: * @return true if o is this 106: */ 107: public final boolean equals(Object o) 108: { 109: return o == this; 110: } 111: 112: /** 113: * Makes the original hashCode of Object final, to be consistent with 114: * equals. 115: * 116: * @return the hash code for this object 117: */ 118: public final int hashCode() 119: { 120: return super.hashCode(); 121: } 122: 123: /** 124: * Returns the name of the subset. 125: * 126: * @return the name 127: */ 128: public final String toString() 129: { 130: return name; 131: } 132: } // class Subset 133: 134: /** 135: * A family of character subsets in the Unicode specification. A character 136: * is in at most one of these blocks. 137: * 138: * This inner class was generated automatically from 139: * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts. 140: * This Unicode definition file can be found on the 141: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 142: * JDK 1.5 uses Unicode version 4.0.0. 143: * 144: * @author scripts/unicode-blocks.pl (written by Eric Blake) 145: * @since 1.2 146: */ 147: public static final class UnicodeBlock extends Subset 148: { 149: /** The start of the subset. */ 150: private final int start; 151: 152: /** The end of the subset. */ 153: private final int end; 154: 155: /** The canonical name of the block according to the Unicode standard. */ 156: private final String canonicalName; 157: 158: /** Enumeration for the <code>forName()</code> method */ 159: private enum NameType { CANONICAL, NO_SPACES, CONSTANT; } 160: 161: /** 162: * Constructor for strictly defined blocks. 
163: * 164: * @param start the start character of the range 165: * @param end the end character of the range 166: * @param name the block name 167: * @param canonicalName the name of the block as defined in the Unicode 168: * standard. 169: */ 170: private UnicodeBlock(int start, int end, String name, 171: String canonicalName) 172: { 173: super(name); 174: this.start = start; 175: this.end = end; 176: this.canonicalName = canonicalName; 177: } 178: 179: /** 180: * Returns the Unicode character block which a character belongs to. 181: * <strong>Note</strong>: This method does not support the use of 182: * supplementary characters. For such support, <code>of(int)</code> 183: * should be used instead. 184: * 185: * @param ch the character to look up 186: * @return the set it belongs to, or null if it is not in one 187: */ 188: public static UnicodeBlock of(char ch) 189: { 190: return of((int) ch); 191: } 192: 193: /** 194: * Returns the Unicode character block which a code point belongs to. 195: * 196: * @param codePoint the character to look up 197: * @return the set it belongs to, or null if it is not in one. 198: * @throws IllegalArgumentException if the specified code point is 199: * invalid. 200: * @since 1.5 201: */ 202: public static UnicodeBlock of(int codePoint) 203: { 204: if (codePoint > MAX_CODE_POINT) 205: throw new IllegalArgumentException("The supplied integer value is " + 206: "too large to be a codepoint."); 207: // Simple binary search for the correct block. 208: int low = 0; 209: int hi = sets.length - 1; 210: while (low <= hi) 211: { 212: int mid = (low + hi) >> 1; 213: UnicodeBlock b = sets[mid]; 214: if (codePoint < b.start) 215: hi = mid - 1; 216: else if (codePoint > b.end) 217: low = mid + 1; 218: else 219: return b; 220: } 221: return null; 222: } 223: 224: /** 225: * <p> 226: * Returns the <code>UnicodeBlock</code> with the given name, as defined 227: * by the Unicode standard. 
The version of Unicode in use is defined by 228: * the <code>Character</code> class, and the names are given in the 229: * <code>Blocks-<version>.txt</code> file corresponding to that version. 230: * The name may be specified in one of three ways: 231: * </p> 232: * <ol> 233: * <li>The canonical, human-readable name used by the Unicode standard. 234: * This is the name with all spaces and hyphens retained. For example, 235: * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 236: * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 237: * <li>The name used for the constants specified by this class, which 238: * is the canonical name with all spaces and hyphens replaced with 239: * underscores e.g. `BASIC_LATIN'</li> 240: * </ol> 241: * <p> 242: * The names are compared case-insensitively using the case comparison 243: * associated with the U.S. English locale. The method recognises the 244: * previous names used for blocks as well as the current ones. At 245: * present, this simply means that the deprecated `SURROGATES_AREA' 246: * will be recognised by this method (the <code>of()</code> methods 247: * only return one of the three new surrogate blocks). 248: * </p> 249: * 250: * @param blockName the name of the block to look up. 251: * @return the specified block. 252: * @throws NullPointerException if the <code>blockName</code> is 253: * <code>null</code>. 254: * @throws IllegalArgumentException if the name does not match any Unicode 255: * block. 
256: * @since 1.5 257: */ 258: public static final UnicodeBlock forName(String blockName) 259: { 260: NameType type; 261: if (blockName.indexOf(' ') != -1) 262: type = NameType.CANONICAL; 263: else if (blockName.indexOf('_') != -1) 264: type = NameType.CONSTANT; 265: else 266: type = NameType.NO_SPACES; 267: Collator usCollator = Collator.getInstance(Locale.US); 268: usCollator.setStrength(Collator.PRIMARY); 269: /* Special case for deprecated blocks not in sets */ 270: switch (type) 271: { 272: case CANONICAL: 273: if (usCollator.compare(blockName, "Surrogates Area") == 0) 274: return SURROGATES_AREA; 275: break; 276: case NO_SPACES: 277: if (usCollator.compare(blockName, "SurrogatesArea") == 0) 278: return SURROGATES_AREA; 279: break; 280: case CONSTANT: 281: if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 282: return SURROGATES_AREA; 283: break; 284: } 285: /* Other cases */ 286: switch (type) 287: { 288: case CANONICAL: 289: for (UnicodeBlock block : sets) 290: if (usCollator.compare(blockName, block.canonicalName) == 0) 291: return block; 292: break; 293: case NO_SPACES: 294: for (UnicodeBlock block : sets) 295: { 296: String nsName = block.canonicalName.replaceAll(" ",""); 297: if (usCollator.compare(blockName, nsName) == 0) 298: return block; 299: } 300: break; 301: case CONSTANT: 302: for (UnicodeBlock block : sets) 303: if (usCollator.compare(blockName, block.toString()) == 0) 304: return block; 305: break; 306: } 307: throw new IllegalArgumentException("No Unicode block found for " + 308: blockName + "."); 309: } 310: 311: /** 312: * Basic Latin. 313: * 0x0000 - 0x007F. 314: */ 315: public static final UnicodeBlock BASIC_LATIN 316: = new UnicodeBlock(0x0000, 0x007F, 317: "BASIC_LATIN", 318: "Basic Latin"); 319: 320: /** 321: * Latin-1 Supplement. 322: * 0x0080 - 0x00FF. 
323: */ 324: public static final UnicodeBlock LATIN_1_SUPPLEMENT 325: = new UnicodeBlock(0x0080, 0x00FF, 326: "LATIN_1_SUPPLEMENT", 327: "Latin-1 Supplement"); 328: 329: /** 330: * Latin Extended-A. 331: * 0x0100 - 0x017F. 332: */ 333: public static final UnicodeBlock LATIN_EXTENDED_A 334: = new UnicodeBlock(0x0100, 0x017F, 335: "LATIN_EXTENDED_A", 336: "Latin Extended-A"); 337: 338: /** 339: * Latin Extended-B. 340: * 0x0180 - 0x024F. 341: */ 342: public static final UnicodeBlock LATIN_EXTENDED_B 343: = new UnicodeBlock(0x0180, 0x024F, 344: "LATIN_EXTENDED_B", 345: "Latin Extended-B"); 346: 347: /** 348: * IPA Extensions. 349: * 0x0250 - 0x02AF. 350: */ 351: public static final UnicodeBlock IPA_EXTENSIONS 352: = new UnicodeBlock(0x0250, 0x02AF, 353: "IPA_EXTENSIONS", 354: "IPA Extensions"); 355: 356: /** 357: * Spacing Modifier Letters. 358: * 0x02B0 - 0x02FF. 359: */ 360: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 361: = new UnicodeBlock(0x02B0, 0x02FF, 362: "SPACING_MODIFIER_LETTERS", 363: "Spacing Modifier Letters"); 364: 365: /** 366: * Combining Diacritical Marks. 367: * 0x0300 - 0x036F. 368: */ 369: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 370: = new UnicodeBlock(0x0300, 0x036F, 371: "COMBINING_DIACRITICAL_MARKS", 372: "Combining Diacritical Marks"); 373: 374: /** 375: * Greek. 376: * 0x0370 - 0x03FF. 377: */ 378: public static final UnicodeBlock GREEK 379: = new UnicodeBlock(0x0370, 0x03FF, 380: "GREEK", 381: "Greek"); 382: 383: /** 384: * Cyrillic. 385: * 0x0400 - 0x04FF. 386: */ 387: public static final UnicodeBlock CYRILLIC 388: = new UnicodeBlock(0x0400, 0x04FF, 389: "CYRILLIC", 390: "Cyrillic"); 391: 392: /** 393: * Cyrillic Supplementary. 394: * 0x0500 - 0x052F. 395: * @since 1.5 396: */ 397: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 398: = new UnicodeBlock(0x0500, 0x052F, 399: "CYRILLIC_SUPPLEMENTARY", 400: "Cyrillic Supplementary"); 401: 402: /** 403: * Armenian. 404: * 0x0530 - 0x058F. 
405: */ 406: public static final UnicodeBlock ARMENIAN 407: = new UnicodeBlock(0x0530, 0x058F, 408: "ARMENIAN", 409: "Armenian"); 410: 411: /** 412: * Hebrew. 413: * 0x0590 - 0x05FF. 414: */ 415: public static final UnicodeBlock HEBREW 416: = new UnicodeBlock(0x0590, 0x05FF, 417: "HEBREW", 418: "Hebrew"); 419: 420: /** 421: * Arabic. 422: * 0x0600 - 0x06FF. 423: */ 424: public static final UnicodeBlock ARABIC 425: = new UnicodeBlock(0x0600, 0x06FF, 426: "ARABIC", 427: "Arabic"); 428: 429: /** 430: * Syriac. 431: * 0x0700 - 0x074F. 432: * @since 1.4 433: */ 434: public static final UnicodeBlock SYRIAC 435: = new UnicodeBlock(0x0700, 0x074F, 436: "SYRIAC", 437: "Syriac"); 438: 439: /** 440: * Thaana. 441: * 0x0780 - 0x07BF. 442: * @since 1.4 443: */ 444: public static final UnicodeBlock THAANA 445: = new UnicodeBlock(0x0780, 0x07BF, 446: "THAANA", 447: "Thaana"); 448: 449: /** 450: * Devanagari. 451: * 0x0900 - 0x097F. 452: */ 453: public static final UnicodeBlock DEVANAGARI 454: = new UnicodeBlock(0x0900, 0x097F, 455: "DEVANAGARI", 456: "Devanagari"); 457: 458: /** 459: * Bengali. 460: * 0x0980 - 0x09FF. 461: */ 462: public static final UnicodeBlock BENGALI 463: = new UnicodeBlock(0x0980, 0x09FF, 464: "BENGALI", 465: "Bengali"); 466: 467: /** 468: * Gurmukhi. 469: * 0x0A00 - 0x0A7F. 470: */ 471: public static final UnicodeBlock GURMUKHI 472: = new UnicodeBlock(0x0A00, 0x0A7F, 473: "GURMUKHI", 474: "Gurmukhi"); 475: 476: /** 477: * Gujarati. 478: * 0x0A80 - 0x0AFF. 479: */ 480: public static final UnicodeBlock GUJARATI 481: = new UnicodeBlock(0x0A80, 0x0AFF, 482: "GUJARATI", 483: "Gujarati"); 484: 485: /** 486: * Oriya. 487: * 0x0B00 - 0x0B7F. 488: */ 489: public static final UnicodeBlock ORIYA 490: = new UnicodeBlock(0x0B00, 0x0B7F, 491: "ORIYA", 492: "Oriya"); 493: 494: /** 495: * Tamil. 496: * 0x0B80 - 0x0BFF. 497: */ 498: public static final UnicodeBlock TAMIL 499: = new UnicodeBlock(0x0B80, 0x0BFF, 500: "TAMIL", 501: "Tamil"); 502: 503: /** 504: * Telugu. 
505: * 0x0C00 - 0x0C7F. 506: */ 507: public static final UnicodeBlock TELUGU 508: = new UnicodeBlock(0x0C00, 0x0C7F, 509: "TELUGU", 510: "Telugu"); 511: 512: /** 513: * Kannada. 514: * 0x0C80 - 0x0CFF. 515: */ 516: public static final UnicodeBlock KANNADA 517: = new UnicodeBlock(0x0C80, 0x0CFF, 518: "KANNADA", 519: "Kannada"); 520: 521: /** 522: * Malayalam. 523: * 0x0D00 - 0x0D7F. 524: */ 525: public static final UnicodeBlock MALAYALAM 526: = new UnicodeBlock(0x0D00, 0x0D7F, 527: "MALAYALAM", 528: "Malayalam"); 529: 530: /** 531: * Sinhala. 532: * 0x0D80 - 0x0DFF. 533: * @since 1.4 534: */ 535: public static final UnicodeBlock SINHALA 536: = new UnicodeBlock(0x0D80, 0x0DFF, 537: "SINHALA", 538: "Sinhala"); 539: 540: /** 541: * Thai. 542: * 0x0E00 - 0x0E7F. 543: */ 544: public static final UnicodeBlock THAI 545: = new UnicodeBlock(0x0E00, 0x0E7F, 546: "THAI", 547: "Thai"); 548: 549: /** 550: * Lao. 551: * 0x0E80 - 0x0EFF. 552: */ 553: public static final UnicodeBlock LAO 554: = new UnicodeBlock(0x0E80, 0x0EFF, 555: "LAO", 556: "Lao"); 557: 558: /** 559: * Tibetan. 560: * 0x0F00 - 0x0FFF. 561: */ 562: public static final UnicodeBlock TIBETAN 563: = new UnicodeBlock(0x0F00, 0x0FFF, 564: "TIBETAN", 565: "Tibetan"); 566: 567: /** 568: * Myanmar. 569: * 0x1000 - 0x109F. 570: * @since 1.4 571: */ 572: public static final UnicodeBlock MYANMAR 573: = new UnicodeBlock(0x1000, 0x109F, 574: "MYANMAR", 575: "Myanmar"); 576: 577: /** 578: * Georgian. 579: * 0x10A0 - 0x10FF. 580: */ 581: public static final UnicodeBlock GEORGIAN 582: = new UnicodeBlock(0x10A0, 0x10FF, 583: "GEORGIAN", 584: "Georgian"); 585: 586: /** 587: * Hangul Jamo. 588: * 0x1100 - 0x11FF. 589: */ 590: public static final UnicodeBlock HANGUL_JAMO 591: = new UnicodeBlock(0x1100, 0x11FF, 592: "HANGUL_JAMO", 593: "Hangul Jamo"); 594: 595: /** 596: * Ethiopic. 597: * 0x1200 - 0x137F. 
598: * @since 1.4 599: */ 600: public static final UnicodeBlock ETHIOPIC 601: = new UnicodeBlock(0x1200, 0x137F, 602: "ETHIOPIC", 603: "Ethiopic"); 604: 605: /** 606: * Cherokee. 607: * 0x13A0 - 0x13FF. 608: * @since 1.4 609: */ 610: public static final UnicodeBlock CHEROKEE 611: = new UnicodeBlock(0x13A0, 0x13FF, 612: "CHEROKEE", 613: "Cherokee"); 614: 615: /** 616: * Unified Canadian Aboriginal Syllabics. 617: * 0x1400 - 0x167F. 618: * @since 1.4 619: */ 620: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 621: = new UnicodeBlock(0x1400, 0x167F, 622: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 623: "Unified Canadian Aboriginal Syllabics"); 624: 625: /** 626: * Ogham. 627: * 0x1680 - 0x169F. 628: * @since 1.4 629: */ 630: public static final UnicodeBlock OGHAM 631: = new UnicodeBlock(0x1680, 0x169F, 632: "OGHAM", 633: "Ogham"); 634: 635: /** 636: * Runic. 637: * 0x16A0 - 0x16FF. 638: * @since 1.4 639: */ 640: public static final UnicodeBlock RUNIC 641: = new UnicodeBlock(0x16A0, 0x16FF, 642: "RUNIC", 643: "Runic"); 644: 645: /** 646: * Tagalog. 647: * 0x1700 - 0x171F. 648: * @since 1.5 649: */ 650: public static final UnicodeBlock TAGALOG 651: = new UnicodeBlock(0x1700, 0x171F, 652: "TAGALOG", 653: "Tagalog"); 654: 655: /** 656: * Hanunoo. 657: * 0x1720 - 0x173F. 658: * @since 1.5 659: */ 660: public static final UnicodeBlock HANUNOO 661: = new UnicodeBlock(0x1720, 0x173F, 662: "HANUNOO", 663: "Hanunoo"); 664: 665: /** 666: * Buhid. 667: * 0x1740 - 0x175F. 668: * @since 1.5 669: */ 670: public static final UnicodeBlock BUHID 671: = new UnicodeBlock(0x1740, 0x175F, 672: "BUHID", 673: "Buhid"); 674: 675: /** 676: * Tagbanwa. 677: * 0x1760 - 0x177F. 678: * @since 1.5 679: */ 680: public static final UnicodeBlock TAGBANWA 681: = new UnicodeBlock(0x1760, 0x177F, 682: "TAGBANWA", 683: "Tagbanwa"); 684: 685: /** 686: * Khmer. 687: * 0x1780 - 0x17FF. 
688: * @since 1.4 689: */ 690: public static final UnicodeBlock KHMER 691: = new UnicodeBlock(0x1780, 0x17FF, 692: "KHMER", 693: "Khmer"); 694: 695: /** 696: * Mongolian. 697: * 0x1800 - 0x18AF. 698: * @since 1.4 699: */ 700: public static final UnicodeBlock MONGOLIAN 701: = new UnicodeBlock(0x1800, 0x18AF, 702: "MONGOLIAN", 703: "Mongolian"); 704: 705: /** 706: * Limbu. 707: * 0x1900 - 0x194F. 708: * @since 1.5 709: */ 710: public static final UnicodeBlock LIMBU 711: = new UnicodeBlock(0x1900, 0x194F, 712: "LIMBU", 713: "Limbu"); 714: 715: /** 716: * Tai Le. 717: * 0x1950 - 0x197F. 718: * @since 1.5 719: */ 720: public static final UnicodeBlock TAI_LE 721: = new UnicodeBlock(0x1950, 0x197F, 722: "TAI_LE", 723: "Tai Le"); 724: 725: /** 726: * Khmer Symbols. 727: * 0x19E0 - 0x19FF. 728: * @since 1.5 729: */ 730: public static final UnicodeBlock KHMER_SYMBOLS 731: = new UnicodeBlock(0x19E0, 0x19FF, 732: "KHMER_SYMBOLS", 733: "Khmer Symbols"); 734: 735: /** 736: * Phonetic Extensions. 737: * 0x1D00 - 0x1D7F. 738: * @since 1.5 739: */ 740: public static final UnicodeBlock PHONETIC_EXTENSIONS 741: = new UnicodeBlock(0x1D00, 0x1D7F, 742: "PHONETIC_EXTENSIONS", 743: "Phonetic Extensions"); 744: 745: /** 746: * Latin Extended Additional. 747: * 0x1E00 - 0x1EFF. 748: */ 749: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 750: = new UnicodeBlock(0x1E00, 0x1EFF, 751: "LATIN_EXTENDED_ADDITIONAL", 752: "Latin Extended Additional"); 753: 754: /** 755: * Greek Extended. 756: * 0x1F00 - 0x1FFF. 757: */ 758: public static final UnicodeBlock GREEK_EXTENDED 759: = new UnicodeBlock(0x1F00, 0x1FFF, 760: "GREEK_EXTENDED", 761: "Greek Extended"); 762: 763: /** 764: * General Punctuation. 765: * 0x2000 - 0x206F. 766: */ 767: public static final UnicodeBlock GENERAL_PUNCTUATION 768: = new UnicodeBlock(0x2000, 0x206F, 769: "GENERAL_PUNCTUATION", 770: "General Punctuation"); 771: 772: /** 773: * Superscripts and Subscripts. 774: * 0x2070 - 0x209F. 
775: */ 776: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 777: = new UnicodeBlock(0x2070, 0x209F, 778: "SUPERSCRIPTS_AND_SUBSCRIPTS", 779: "Superscripts and Subscripts"); 780: 781: /** 782: * Currency Symbols. 783: * 0x20A0 - 0x20CF. 784: */ 785: public static final UnicodeBlock CURRENCY_SYMBOLS 786: = new UnicodeBlock(0x20A0, 0x20CF, 787: "CURRENCY_SYMBOLS", 788: "Currency Symbols"); 789: 790: /** 791: * Combining Marks for Symbols. 792: * 0x20D0 - 0x20FF. 793: */ 794: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 795: = new UnicodeBlock(0x20D0, 0x20FF, 796: "COMBINING_MARKS_FOR_SYMBOLS", 797: "Combining Marks for Symbols"); 798: 799: /** 800: * Letterlike Symbols. 801: * 0x2100 - 0x214F. 802: */ 803: public static final UnicodeBlock LETTERLIKE_SYMBOLS 804: = new UnicodeBlock(0x2100, 0x214F, 805: "LETTERLIKE_SYMBOLS", 806: "Letterlike Symbols"); 807: 808: /** 809: * Number Forms. 810: * 0x2150 - 0x218F. 811: */ 812: public static final UnicodeBlock NUMBER_FORMS 813: = new UnicodeBlock(0x2150, 0x218F, 814: "NUMBER_FORMS", 815: "Number Forms"); 816: 817: /** 818: * Arrows. 819: * 0x2190 - 0x21FF. 820: */ 821: public static final UnicodeBlock ARROWS 822: = new UnicodeBlock(0x2190, 0x21FF, 823: "ARROWS", 824: "Arrows"); 825: 826: /** 827: * Mathematical Operators. 828: * 0x2200 - 0x22FF. 829: */ 830: public static final UnicodeBlock MATHEMATICAL_OPERATORS 831: = new UnicodeBlock(0x2200, 0x22FF, 832: "MATHEMATICAL_OPERATORS", 833: "Mathematical Operators"); 834: 835: /** 836: * Miscellaneous Technical. 837: * 0x2300 - 0x23FF. 838: */ 839: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 840: = new UnicodeBlock(0x2300, 0x23FF, 841: "MISCELLANEOUS_TECHNICAL", 842: "Miscellaneous Technical"); 843: 844: /** 845: * Control Pictures. 846: * 0x2400 - 0x243F. 
847: */ 848: public static final UnicodeBlock CONTROL_PICTURES 849: = new UnicodeBlock(0x2400, 0x243F, 850: "CONTROL_PICTURES", 851: "Control Pictures"); 852: 853: /** 854: * Optical Character Recognition. 855: * 0x2440 - 0x245F. 856: */ 857: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 858: = new UnicodeBlock(0x2440, 0x245F, 859: "OPTICAL_CHARACTER_RECOGNITION", 860: "Optical Character Recognition"); 861: 862: /** 863: * Enclosed Alphanumerics. 864: * 0x2460 - 0x24FF. 865: */ 866: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 867: = new UnicodeBlock(0x2460, 0x24FF, 868: "ENCLOSED_ALPHANUMERICS", 869: "Enclosed Alphanumerics"); 870: 871: /** 872: * Box Drawing. 873: * 0x2500 - 0x257F. 874: */ 875: public static final UnicodeBlock BOX_DRAWING 876: = new UnicodeBlock(0x2500, 0x257F, 877: "BOX_DRAWING", 878: "Box Drawing"); 879: 880: /** 881: * Block Elements. 882: * 0x2580 - 0x259F. 883: */ 884: public static final UnicodeBlock BLOCK_ELEMENTS 885: = new UnicodeBlock(0x2580, 0x259F, 886: "BLOCK_ELEMENTS", 887: "Block Elements"); 888: 889: /** 890: * Geometric Shapes. 891: * 0x25A0 - 0x25FF. 892: */ 893: public static final UnicodeBlock GEOMETRIC_SHAPES 894: = new UnicodeBlock(0x25A0, 0x25FF, 895: "GEOMETRIC_SHAPES", 896: "Geometric Shapes"); 897: 898: /** 899: * Miscellaneous Symbols. 900: * 0x2600 - 0x26FF. 901: */ 902: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 903: = new UnicodeBlock(0x2600, 0x26FF, 904: "MISCELLANEOUS_SYMBOLS", 905: "Miscellaneous Symbols"); 906: 907: /** 908: * Dingbats. 909: * 0x2700 - 0x27BF. 910: */ 911: public static final UnicodeBlock DINGBATS 912: = new UnicodeBlock(0x2700, 0x27BF, 913: "DINGBATS", 914: "Dingbats"); 915: 916: /** 917: * Miscellaneous Mathematical Symbols-A. 918: * 0x27C0 - 0x27EF. 
919: * @since 1.5 920: */ 921: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 922: = new UnicodeBlock(0x27C0, 0x27EF, 923: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 924: "Miscellaneous Mathematical Symbols-A"); 925: 926: /** 927: * Supplemental Arrows-A. 928: * 0x27F0 - 0x27FF. 929: * @since 1.5 930: */ 931: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 932: = new UnicodeBlock(0x27F0, 0x27FF, 933: "SUPPLEMENTAL_ARROWS_A", 934: "Supplemental Arrows-A"); 935: 936: /** 937: * Braille Patterns. 938: * 0x2800 - 0x28FF. 939: * @since 1.4 940: */ 941: public static final UnicodeBlock BRAILLE_PATTERNS 942: = new UnicodeBlock(0x2800, 0x28FF, 943: "BRAILLE_PATTERNS", 944: "Braille Patterns"); 945: 946: /** 947: * Supplemental Arrows-B. 948: * 0x2900 - 0x297F. 949: * @since 1.5 950: */ 951: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 952: = new UnicodeBlock(0x2900, 0x297F, 953: "SUPPLEMENTAL_ARROWS_B", 954: "Supplemental Arrows-B"); 955: 956: /** 957: * Miscellaneous Mathematical Symbols-B. 958: * 0x2980 - 0x29FF. 959: * @since 1.5 960: */ 961: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 962: = new UnicodeBlock(0x2980, 0x29FF, 963: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 964: "Miscellaneous Mathematical Symbols-B"); 965: 966: /** 967: * Supplemental Mathematical Operators. 968: * 0x2A00 - 0x2AFF. 969: * @since 1.5 970: */ 971: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 972: = new UnicodeBlock(0x2A00, 0x2AFF, 973: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 974: "Supplemental Mathematical Operators"); 975: 976: /** 977: * Miscellaneous Symbols and Arrows. 978: * 0x2B00 - 0x2BFF. 979: * @since 1.5 980: */ 981: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 982: = new UnicodeBlock(0x2B00, 0x2BFF, 983: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 984: "Miscellaneous Symbols and Arrows"); 985: 986: /** 987: * CJK Radicals Supplement. 988: * 0x2E80 - 0x2EFF. 
989: * @since 1.4 990: */ 991: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 992: = new UnicodeBlock(0x2E80, 0x2EFF, 993: "CJK_RADICALS_SUPPLEMENT", 994: "CJK Radicals Supplement"); 995: 996: /** 997: * Kangxi Radicals. 998: * 0x2F00 - 0x2FDF. 999: * @since 1.4 1000: */ 1001: public static final UnicodeBlock KANGXI_RADICALS 1002: = new UnicodeBlock(0x2F00, 0x2FDF, 1003: "KANGXI_RADICALS", 1004: "Kangxi Radicals"); 1005: 1006: /** 1007: * Ideographic Description Characters. 1008: * 0x2FF0 - 0x2FFF. 1009: * @since 1.4 1010: */ 1011: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1012: = new UnicodeBlock(0x2FF0, 0x2FFF, 1013: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1014: "Ideographic Description Characters"); 1015: 1016: /** 1017: * CJK Symbols and Punctuation. 1018: * 0x3000 - 0x303F. 1019: */ 1020: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1021: = new UnicodeBlock(0x3000, 0x303F, 1022: "CJK_SYMBOLS_AND_PUNCTUATION", 1023: "CJK Symbols and Punctuation"); 1024: 1025: /** 1026: * Hiragana. 1027: * 0x3040 - 0x309F. 1028: */ 1029: public static final UnicodeBlock HIRAGANA 1030: = new UnicodeBlock(0x3040, 0x309F, 1031: "HIRAGANA", 1032: "Hiragana"); 1033: 1034: /** 1035: * Katakana. 1036: * 0x30A0 - 0x30FF. 1037: */ 1038: public static final UnicodeBlock KATAKANA 1039: = new UnicodeBlock(0x30A0, 0x30FF, 1040: "KATAKANA", 1041: "Katakana"); 1042: 1043: /** 1044: * Bopomofo. 1045: * 0x3100 - 0x312F. 1046: */ 1047: public static final UnicodeBlock BOPOMOFO 1048: = new UnicodeBlock(0x3100, 0x312F, 1049: "BOPOMOFO", 1050: "Bopomofo"); 1051: 1052: /** 1053: * Hangul Compatibility Jamo. 1054: * 0x3130 - 0x318F. 1055: */ 1056: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1057: = new UnicodeBlock(0x3130, 0x318F, 1058: "HANGUL_COMPATIBILITY_JAMO", 1059: "Hangul Compatibility Jamo"); 1060: 1061: /** 1062: * Kanbun. 1063: * 0x3190 - 0x319F. 
1064: */ 1065: public static final UnicodeBlock KANBUN 1066: = new UnicodeBlock(0x3190, 0x319F, 1067: "KANBUN", 1068: "Kanbun"); 1069: 1070: /** 1071: * Bopomofo Extended. 1072: * 0x31A0 - 0x31BF. 1073: * @since 1.4 1074: */ 1075: public static final UnicodeBlock BOPOMOFO_EXTENDED 1076: = new UnicodeBlock(0x31A0, 0x31BF, 1077: "BOPOMOFO_EXTENDED", 1078: "Bopomofo Extended"); 1079: 1080: /** 1081: * Katakana Phonetic Extensions. 1082: * 0x31F0 - 0x31FF. 1083: * @since 1.5 1084: */ 1085: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1086: = new UnicodeBlock(0x31F0, 0x31FF, 1087: "KATAKANA_PHONETIC_EXTENSIONS", 1088: "Katakana Phonetic Extensions"); 1089: 1090: /** 1091: * Enclosed CJK Letters and Months. 1092: * 0x3200 - 0x32FF. 1093: */ 1094: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1095: = new UnicodeBlock(0x3200, 0x32FF, 1096: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1097: "Enclosed CJK Letters and Months"); 1098: 1099: /** 1100: * CJK Compatibility. 1101: * 0x3300 - 0x33FF. 1102: */ 1103: public static final UnicodeBlock CJK_COMPATIBILITY 1104: = new UnicodeBlock(0x3300, 0x33FF, 1105: "CJK_COMPATIBILITY", 1106: "CJK Compatibility"); 1107: 1108: /** 1109: * CJK Unified Ideographs Extension A. 1110: * 0x3400 - 0x4DBF. 1111: * @since 1.4 1112: */ 1113: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1114: = new UnicodeBlock(0x3400, 0x4DBF, 1115: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1116: "CJK Unified Ideographs Extension A"); 1117: 1118: /** 1119: * Yijing Hexagram Symbols. 1120: * 0x4DC0 - 0x4DFF. 1121: * @since 1.5 1122: */ 1123: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1124: = new UnicodeBlock(0x4DC0, 0x4DFF, 1125: "YIJING_HEXAGRAM_SYMBOLS", 1126: "Yijing Hexagram Symbols"); 1127: 1128: /** 1129: * CJK Unified Ideographs. 1130: * 0x4E00 - 0x9FFF. 
1131: */ 1132: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1133: = new UnicodeBlock(0x4E00, 0x9FFF, 1134: "CJK_UNIFIED_IDEOGRAPHS", 1135: "CJK Unified Ideographs"); 1136: 1137: /** 1138: * Yi Syllables. 1139: * 0xA000 - 0xA48F. 1140: * @since 1.4 1141: */ 1142: public static final UnicodeBlock YI_SYLLABLES 1143: = new UnicodeBlock(0xA000, 0xA48F, 1144: "YI_SYLLABLES", 1145: "Yi Syllables"); 1146: 1147: /** 1148: * Yi Radicals. 1149: * 0xA490 - 0xA4CF. 1150: * @since 1.4 1151: */ 1152: public static final UnicodeBlock YI_RADICALS 1153: = new UnicodeBlock(0xA490, 0xA4CF, 1154: "YI_RADICALS", 1155: "Yi Radicals"); 1156: 1157: /** 1158: * Hangul Syllables. 1159: * 0xAC00 - 0xD7AF. 1160: */ 1161: public static final UnicodeBlock HANGUL_SYLLABLES 1162: = new UnicodeBlock(0xAC00, 0xD7AF, 1163: "HANGUL_SYLLABLES", 1164: "Hangul Syllables"); 1165: 1166: /** 1167: * High Surrogates. 1168: * 0xD800 - 0xDB7F. 1169: * @since 1.5 1170: */ 1171: public static final UnicodeBlock HIGH_SURROGATES 1172: = new UnicodeBlock(0xD800, 0xDB7F, 1173: "HIGH_SURROGATES", 1174: "High Surrogates"); 1175: 1176: /** 1177: * High Private Use Surrogates. 1178: * 0xDB80 - 0xDBFF. 1179: * @since 1.5 1180: */ 1181: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1182: = new UnicodeBlock(0xDB80, 0xDBFF, 1183: "HIGH_PRIVATE_USE_SURROGATES", 1184: "High Private Use Surrogates"); 1185: 1186: /** 1187: * Low Surrogates. 1188: * 0xDC00 - 0xDFFF. 1189: * @since 1.5 1190: */ 1191: public static final UnicodeBlock LOW_SURROGATES 1192: = new UnicodeBlock(0xDC00, 0xDFFF, 1193: "LOW_SURROGATES", 1194: "Low Surrogates"); 1195: 1196: /** 1197: * Private Use Area. 1198: * 0xE000 - 0xF8FF. 1199: */ 1200: public static final UnicodeBlock PRIVATE_USE_AREA 1201: = new UnicodeBlock(0xE000, 0xF8FF, 1202: "PRIVATE_USE_AREA", 1203: "Private Use Area"); 1204: 1205: /** 1206: * CJK Compatibility Ideographs. 1207: * 0xF900 - 0xFAFF. 
1208: */ 1209: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1210: = new UnicodeBlock(0xF900, 0xFAFF, 1211: "CJK_COMPATIBILITY_IDEOGRAPHS", 1212: "CJK Compatibility Ideographs"); 1213: 1214: /** 1215: * Alphabetic Presentation Forms. 1216: * 0xFB00 - 0xFB4F. 1217: */ 1218: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1219: = new UnicodeBlock(0xFB00, 0xFB4F, 1220: "ALPHABETIC_PRESENTATION_FORMS", 1221: "Alphabetic Presentation Forms"); 1222: 1223: /** 1224: * Arabic Presentation Forms-A. 1225: * 0xFB50 - 0xFDFF. 1226: */ 1227: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1228: = new UnicodeBlock(0xFB50, 0xFDFF, 1229: "ARABIC_PRESENTATION_FORMS_A", 1230: "Arabic Presentation Forms-A"); 1231: 1232: /** 1233: * Variation Selectors. 1234: * 0xFE00 - 0xFE0F. 1235: * @since 1.5 1236: */ 1237: public static final UnicodeBlock VARIATION_SELECTORS 1238: = new UnicodeBlock(0xFE00, 0xFE0F, 1239: "VARIATION_SELECTORS", 1240: "Variation Selectors"); 1241: 1242: /** 1243: * Combining Half Marks. 1244: * 0xFE20 - 0xFE2F. 1245: */ 1246: public static final UnicodeBlock COMBINING_HALF_MARKS 1247: = new UnicodeBlock(0xFE20, 0xFE2F, 1248: "COMBINING_HALF_MARKS", 1249: "Combining Half Marks"); 1250: 1251: /** 1252: * CJK Compatibility Forms. 1253: * 0xFE30 - 0xFE4F. 1254: */ 1255: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1256: = new UnicodeBlock(0xFE30, 0xFE4F, 1257: "CJK_COMPATIBILITY_FORMS", 1258: "CJK Compatibility Forms"); 1259: 1260: /** 1261: * Small Form Variants. 1262: * 0xFE50 - 0xFE6F. 1263: */ 1264: public static final UnicodeBlock SMALL_FORM_VARIANTS 1265: = new UnicodeBlock(0xFE50, 0xFE6F, 1266: "SMALL_FORM_VARIANTS", 1267: "Small Form Variants"); 1268: 1269: /** 1270: * Arabic Presentation Forms-B. 1271: * 0xFE70 - 0xFEFF. 
1272: */ 1273: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1274: = new UnicodeBlock(0xFE70, 0xFEFF, 1275: "ARABIC_PRESENTATION_FORMS_B", 1276: "Arabic Presentation Forms-B"); 1277: 1278: /** 1279: * Halfwidth and Fullwidth Forms. 1280: * 0xFF00 - 0xFFEF. 1281: */ 1282: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1283: = new UnicodeBlock(0xFF00, 0xFFEF, 1284: "HALFWIDTH_AND_FULLWIDTH_FORMS", 1285: "Halfwidth and Fullwidth Forms"); 1286: 1287: /** 1288: * Specials. 1289: * 0xFFF0 - 0xFFFF. 1290: */ 1291: public static final UnicodeBlock SPECIALS 1292: = new UnicodeBlock(0xFFF0, 0xFFFF, 1293: "SPECIALS", 1294: "Specials"); 1295: 1296: /** 1297: * Linear B Syllabary. 1298: * 0x10000 - 0x1007F. 1299: * @since 1.5 1300: */ 1301: public static final UnicodeBlock LINEAR_B_SYLLABARY 1302: = new UnicodeBlock(0x10000, 0x1007F, 1303: "LINEAR_B_SYLLABARY", 1304: "Linear B Syllabary"); 1305: 1306: /** 1307: * Linear B Ideograms. 1308: * 0x10080 - 0x100FF. 1309: * @since 1.5 1310: */ 1311: public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1312: = new UnicodeBlock(0x10080, 0x100FF, 1313: "LINEAR_B_IDEOGRAMS", 1314: "Linear B Ideograms"); 1315: 1316: /** 1317: * Aegean Numbers. 1318: * 0x10100 - 0x1013F. 1319: * @since 1.5 1320: */ 1321: public static final UnicodeBlock AEGEAN_NUMBERS 1322: = new UnicodeBlock(0x10100, 0x1013F, 1323: "AEGEAN_NUMBERS", 1324: "Aegean Numbers"); 1325: 1326: /** 1327: * Old Italic. 1328: * 0x10300 - 0x1032F. 1329: * @since 1.5 1330: */ 1331: public static final UnicodeBlock OLD_ITALIC 1332: = new UnicodeBlock(0x10300, 0x1032F, 1333: "OLD_ITALIC", 1334: "Old Italic"); 1335: 1336: /** 1337: * Gothic. 1338: * 0x10330 - 0x1034F. 1339: * @since 1.5 1340: */ 1341: public static final UnicodeBlock GOTHIC 1342: = new UnicodeBlock(0x10330, 0x1034F, 1343: "GOTHIC", 1344: "Gothic"); 1345: 1346: /** 1347: * Ugaritic. 1348: * 0x10380 - 0x1039F. 
1349: * @since 1.5 1350: */ 1351: public static final UnicodeBlock UGARITIC 1352: = new UnicodeBlock(0x10380, 0x1039F, 1353: "UGARITIC", 1354: "Ugaritic"); 1355: 1356: /** 1357: * Deseret. 1358: * 0x10400 - 0x1044F. 1359: * @since 1.5 1360: */ 1361: public static final UnicodeBlock DESERET 1362: = new UnicodeBlock(0x10400, 0x1044F, 1363: "DESERET", 1364: "Deseret"); 1365: 1366: /** 1367: * Shavian. 1368: * 0x10450 - 0x1047F. 1369: * @since 1.5 1370: */ 1371: public static final UnicodeBlock SHAVIAN 1372: = new UnicodeBlock(0x10450, 0x1047F, 1373: "SHAVIAN", 1374: "Shavian"); 1375: 1376: /** 1377: * Osmanya. 1378: * 0x10480 - 0x104AF. 1379: * @since 1.5 1380: */ 1381: public static final UnicodeBlock OSMANYA 1382: = new UnicodeBlock(0x10480, 0x104AF, 1383: "OSMANYA", 1384: "Osmanya"); 1385: 1386: /** 1387: * Cypriot Syllabary. 1388: * 0x10800 - 0x1083F. 1389: * @since 1.5 1390: */ 1391: public static final UnicodeBlock CYPRIOT_SYLLABARY 1392: = new UnicodeBlock(0x10800, 0x1083F, 1393: "CYPRIOT_SYLLABARY", 1394: "Cypriot Syllabary"); 1395: 1396: /** 1397: * Byzantine Musical Symbols. 1398: * 0x1D000 - 0x1D0FF. 1399: * @since 1.5 1400: */ 1401: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1402: = new UnicodeBlock(0x1D000, 0x1D0FF, 1403: "BYZANTINE_MUSICAL_SYMBOLS", 1404: "Byzantine Musical Symbols"); 1405: 1406: /** 1407: * Musical Symbols. 1408: * 0x1D100 - 0x1D1FF. 1409: * @since 1.5 1410: */ 1411: public static final UnicodeBlock MUSICAL_SYMBOLS 1412: = new UnicodeBlock(0x1D100, 0x1D1FF, 1413: "MUSICAL_SYMBOLS", 1414: "Musical Symbols"); 1415: 1416: /** 1417: * Tai Xuan Jing Symbols. 1418: * 0x1D300 - 0x1D35F. 1419: * @since 1.5 1420: */ 1421: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1422: = new UnicodeBlock(0x1D300, 0x1D35F, 1423: "TAI_XUAN_JING_SYMBOLS", 1424: "Tai Xuan Jing Symbols"); 1425: 1426: /** 1427: * Mathematical Alphanumeric Symbols. 1428: * 0x1D400 - 0x1D7FF. 
1429: * @since 1.5 1430: */ 1431: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1432: = new UnicodeBlock(0x1D400, 0x1D7FF, 1433: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1434: "Mathematical Alphanumeric Symbols"); 1435: 1436: /** 1437: * CJK Unified Ideographs Extension B. 1438: * 0x20000 - 0x2A6DF. 1439: * @since 1.5 1440: */ 1441: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1442: = new UnicodeBlock(0x20000, 0x2A6DF, 1443: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1444: "CJK Unified Ideographs Extension B"); 1445: 1446: /** 1447: * CJK Compatibility Ideographs Supplement. 1448: * 0x2F800 - 0x2FA1F. 1449: * @since 1.5 1450: */ 1451: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1452: = new UnicodeBlock(0x2F800, 0x2FA1F, 1453: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1454: "CJK Compatibility Ideographs Supplement"); 1455: 1456: /** 1457: * Tags. 1458: * 0xE0000 - 0xE007F. 1459: * @since 1.5 1460: */ 1461: public static final UnicodeBlock TAGS 1462: = new UnicodeBlock(0xE0000, 0xE007F, 1463: "TAGS", 1464: "Tags"); 1465: 1466: /** 1467: * Variation Selectors Supplement. 1468: * 0xE0100 - 0xE01EF. 1469: * @since 1.5 1470: */ 1471: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1472: = new UnicodeBlock(0xE0100, 0xE01EF, 1473: "VARIATION_SELECTORS_SUPPLEMENT", 1474: "Variation Selectors Supplement"); 1475: 1476: /** 1477: * Supplementary Private Use Area-A. 1478: * 0xF0000 - 0xFFFFF. 1479: * @since 1.5 1480: */ 1481: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1482: = new UnicodeBlock(0xF0000, 0xFFFFF, 1483: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1484: "Supplementary Private Use Area-A"); 1485: 1486: /** 1487: * Supplementary Private Use Area-B. 1488: * 0x100000 - 0x10FFFF. 
1489: * @since 1.5 1490: */ 1491: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1492: = new UnicodeBlock(0x100000, 0x10FFFF, 1493: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1494: "Supplementary Private Use Area-B"); 1495: 1496: /** 1497: * Surrogates Area. 1498: * 'D800' - 'DFFF'. 1499: * @deprecated As of 1.5, the three areas, 1500: * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1501: * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1502: * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1503: * by the Unicode standard, should be used in preference to 1504: * this. These are also returned from calls to <code>of(int)</code> 1505: * and <code>of(char)</code>. 1506: */ 1507: @Deprecated 1508: public static final UnicodeBlock SURROGATES_AREA 1509: = new UnicodeBlock(0xD800, 0xDFFF, 1510: "SURROGATES_AREA", 1511: "Surrogates Area"); 1512: 1513: /** 1514: * The defined subsets. 1515: */ 1516: private static final UnicodeBlock sets[] = { 1517: BASIC_LATIN, 1518: LATIN_1_SUPPLEMENT, 1519: LATIN_EXTENDED_A, 1520: LATIN_EXTENDED_B, 1521: IPA_EXTENSIONS, 1522: SPACING_MODIFIER_LETTERS, 1523: COMBINING_DIACRITICAL_MARKS, 1524: GREEK, 1525: CYRILLIC, 1526: CYRILLIC_SUPPLEMENTARY, 1527: ARMENIAN, 1528: HEBREW, 1529: ARABIC, 1530: SYRIAC, 1531: THAANA, 1532: DEVANAGARI, 1533: BENGALI, 1534: GURMUKHI, 1535: GUJARATI, 1536: ORIYA, 1537: TAMIL, 1538: TELUGU, 1539: KANNADA, 1540: MALAYALAM, 1541: SINHALA, 1542: THAI, 1543: LAO, 1544: TIBETAN, 1545: MYANMAR, 1546: GEORGIAN, 1547: HANGUL_JAMO, 1548: ETHIOPIC, 1549: CHEROKEE, 1550: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1551: OGHAM, 1552: RUNIC, 1553: TAGALOG, 1554: HANUNOO, 1555: BUHID, 1556: TAGBANWA, 1557: KHMER, 1558: MONGOLIAN, 1559: LIMBU, 1560: TAI_LE, 1561: KHMER_SYMBOLS, 1562: PHONETIC_EXTENSIONS, 1563: LATIN_EXTENDED_ADDITIONAL, 1564: GREEK_EXTENDED, 1565: GENERAL_PUNCTUATION, 1566: SUPERSCRIPTS_AND_SUBSCRIPTS, 1567: CURRENCY_SYMBOLS, 1568: COMBINING_MARKS_FOR_SYMBOLS, 
1569: LETTERLIKE_SYMBOLS, 1570: NUMBER_FORMS, 1571: ARROWS, 1572: MATHEMATICAL_OPERATORS, 1573: MISCELLANEOUS_TECHNICAL, 1574: CONTROL_PICTURES, 1575: OPTICAL_CHARACTER_RECOGNITION, 1576: ENCLOSED_ALPHANUMERICS, 1577: BOX_DRAWING, 1578: BLOCK_ELEMENTS, 1579: GEOMETRIC_SHAPES, 1580: MISCELLANEOUS_SYMBOLS, 1581: DINGBATS, 1582: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 1583: SUPPLEMENTAL_ARROWS_A, 1584: BRAILLE_PATTERNS, 1585: SUPPLEMENTAL_ARROWS_B, 1586: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 1587: SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 1588: MISCELLANEOUS_SYMBOLS_AND_ARROWS, 1589: CJK_RADICALS_SUPPLEMENT, 1590: KANGXI_RADICALS, 1591: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 1592: CJK_SYMBOLS_AND_PUNCTUATION, 1593: HIRAGANA, 1594: KATAKANA, 1595: BOPOMOFO, 1596: HANGUL_COMPATIBILITY_JAMO, 1597: KANBUN, 1598: BOPOMOFO_EXTENDED, 1599: KATAKANA_PHONETIC_EXTENSIONS, 1600: ENCLOSED_CJK_LETTERS_AND_MONTHS, 1601: CJK_COMPATIBILITY, 1602: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 1603: YIJING_HEXAGRAM_SYMBOLS, 1604: CJK_UNIFIED_IDEOGRAPHS, 1605: YI_SYLLABLES, 1606: YI_RADICALS, 1607: HANGUL_SYLLABLES, 1608: HIGH_SURROGATES, 1609: HIGH_PRIVATE_USE_SURROGATES, 1610: LOW_SURROGATES, 1611: PRIVATE_USE_AREA, 1612: CJK_COMPATIBILITY_IDEOGRAPHS, 1613: ALPHABETIC_PRESENTATION_FORMS, 1614: ARABIC_PRESENTATION_FORMS_A, 1615: VARIATION_SELECTORS, 1616: COMBINING_HALF_MARKS, 1617: CJK_COMPATIBILITY_FORMS, 1618: SMALL_FORM_VARIANTS, 1619: ARABIC_PRESENTATION_FORMS_B, 1620: HALFWIDTH_AND_FULLWIDTH_FORMS, 1621: SPECIALS, 1622: LINEAR_B_SYLLABARY, 1623: LINEAR_B_IDEOGRAMS, 1624: AEGEAN_NUMBERS, 1625: OLD_ITALIC, 1626: GOTHIC, 1627: UGARITIC, 1628: DESERET, 1629: SHAVIAN, 1630: OSMANYA, 1631: CYPRIOT_SYLLABARY, 1632: BYZANTINE_MUSICAL_SYMBOLS, 1633: MUSICAL_SYMBOLS, 1634: TAI_XUAN_JING_SYMBOLS, 1635: MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 1636: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 1637: CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 1638: TAGS, 1639: VARIATION_SELECTORS_SUPPLEMENT, 1640: 
SUPPLEMENTARY_PRIVATE_USE_AREA_A, 1641: SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1642: }; 1643: } // class UnicodeBlock 1644: 1645: /** 1646: * A class to encompass all the properties of characters in the 1647: * private use blocks in the Unicode standard. This class extends 1648: * UnassignedCharacters and redefines getType(), isDefined() and 1649: * getDirectionality(), whose results differ for private use characters. 1650: * @author Anthony Balkissoon abalkiss at redhat dot com 1651: * 1652: */ 1653: private static class PrivateUseCharacters extends UnassignedCharacters 1654: { 1655: /** 1656: * Returns the type of the character cp: PRIVATE_USE, unless cp is one of the last two code points of its plane, in which case the result of UnassignedCharacters.getType(cp) is returned. 1657: */ 1658: static int getType(int cp) 1659: { 1660: // The upper 2 code points in any plane are considered unassigned, 1661: // even in the private-use planes. 1662: if ((cp & 0xffff) >= 0xfffe) 1663: return UnassignedCharacters.getType(cp); 1664: return PRIVATE_USE; 1665: } 1666: 1667: /** 1668: * Returns true if the character cp is defined. Every code point handled by this class is defined, except the last two code points of a plane, for which the result of UnassignedCharacters.isDefined(cp) is returned. 1669: */ 1670: static boolean isDefined(int cp) 1671: { 1672: // The upper 2 code points in any plane are considered unassigned, 1673: // even in the private-use planes. 1674: if ((cp & 0xffff) >= 0xfffe) 1675: return UnassignedCharacters.isDefined(cp); 1676: return true; 1677: } 1678: 1679: /** 1680: * Gets the directionality for the character cp: DIRECTIONALITY_LEFT_TO_RIGHT, unless cp is one of the last two code points of its plane, in which case the result of UnassignedCharacters.getDirectionality(cp) is returned. 1681: */ 1682: static byte getDirectionality(int cp) 1683: { 1684: if ((cp & 0xffff) >= 0xfffe) 1685: return UnassignedCharacters.getDirectionality(cp); 1686: return DIRECTIONALITY_LEFT_TO_RIGHT; 1687: } 1688: } 1689: 1690: /** 1691: * A class to encompass all the properties of code points that are 1692: * currently undefined in the Unicode standard. 1693: * @author Anthony Balkissoon abalkiss at redhat dot com 1694: * 1695: */ 1696: private static class UnassignedCharacters 1697: { 1698: /** 1699: * Returns the numeric value for the unassigned characters. 
1700: * @param cp the character 1701: * @param radix the radix (not used) 1702: * @return the numeric value of this character in this radix 1703: */ 1704: static int digit(int cp, int radix) 1705: { 1706: return -1; 1707: } 1708: 1709: /** 1710: * Returns the Unicode directionality property for unassigned 1711: * characters. 1712: * @param cp the character 1713: * @return DIRECTIONALITY_UNDEFINED 1714: */ 1715: static byte getDirectionality(int cp) 1716: { 1717: return DIRECTIONALITY_UNDEFINED; 1718: } 1719: 1720: /** 1721: * Returns -1, the numeric value for unassigned Unicode characters. 1722: * @param cp the character 1723: * @return -1 1724: */ 1725: static int getNumericValue(int cp) 1726: { 1727: return -1; 1728: } 1729: 1730: /** 1731: * Returns UNASSIGNED, the type of unassigned Unicode characters. 1732: * @param cp the character 1733: * @return UNASSIGNED 1734: */ 1735: static int getType(int cp) 1736: { 1737: return UNASSIGNED; 1738: } 1739: 1740: /** 1741: * Returns false to indicate that the character is not defined in the 1742: * Unicode standard. 1743: * @param cp the character 1744: * @return false 1745: */ 1746: static boolean isDefined(int cp) 1747: { 1748: return false; 1749: } 1750: 1751: /** 1752: * Returns false to indicate that the character is not a digit. 1753: * @param cp the character 1754: * @return false 1755: */ 1756: static boolean isDigit(int cp) 1757: { 1758: return false; 1759: } 1760: 1761: /** 1762: * Returns false to indicate that the character cannot be ignored 1763: * within an identifier. 1764: * @param cp the character 1765: * @return false 1766: */ 1767: static boolean isIdentifierIgnorable(int cp) 1768: { 1769: return false; 1770: } 1771: 1772: /** 1773: * Returns false to indicate that the character cannot be part of a 1774: * Java identifier. 
1775: * @param cp the character 1776: * @return false 1777: */ 1778: static boolean isJavaIdentifierPart(int cp) 1779: { 1780: return false; 1781: } 1782: 1783: /** 1784: * Returns false to indicate that the character cannot start a 1785: * Java identifier. NOTE(review): the method name is misspelled ("Identifer"); it is left unchanged here because any callers are outside this view. 1786: * @param cp the character 1787: * @return false 1788: */ 1789: static boolean isJavaIdentiferStart(int cp) 1790: { 1791: return false; 1792: } 1793: 1794: /** 1795: * Returns false to indicate that the character is not a letter. 1796: * @param cp the character 1797: * @return false 1798: */ 1799: static boolean isLetter(int cp) 1800: { 1801: return false; 1802: } 1803: 1804: /** 1805: * Returns false to indicate that the character is neither a letter 1806: * nor a digit. 1807: * @param cp the character 1808: * @return false 1809: */ 1810: static boolean isLetterOrDigit(int cp) 1811: { 1812: return false; 1813: } 1814: 1815: /** 1816: * Returns false to indicate that the character is not a lowercase letter. 1817: * @param cp the character 1818: * @return false 1819: */ 1820: static boolean isLowerCase(int cp) 1821: { 1822: return false; 1823: } 1824: 1825: /** 1826: * Returns false to indicate that the character is not mirrored. 1827: * @param cp the character 1828: * @return false 1829: */ 1830: static boolean isMirrored(int cp) 1831: { 1832: return false; 1833: } 1834: 1835: /** 1836: * Returns false to indicate that the character is not a space character. 1837: * @param cp the character 1838: * @return false 1839: */ 1840: static boolean isSpaceChar(int cp) 1841: { 1842: return false; 1843: } 1844: 1845: /** 1846: * Returns false to indicate that the character is not a titlecase letter. 1847: * @param cp the character 1848: * @return false 1849: */ 1850: static boolean isTitleCase(int cp) 1851: { 1852: return false; 1853: } 1854: 1855: /** 1856: * Returns false to indicate that the character cannot be part of a 1857: * Unicode identifier. 
1858: * @param cp the character 1859: * @return false 1860: */ 1861: static boolean isUnicodeIdentifierPart(int cp) 1862: { 1863: return false; 1864: } 1865: 1866: /** 1867: * Returns false to indicate that the character cannot start a 1868: * Unicode identifier. 1869: * @param cp the character 1870: * @return false 1871: */ 1872: static boolean isUnicodeIdentifierStart(int cp) 1873: { 1874: return false; 1875: } 1876: 1877: /** 1878: * Returns false to indicate that the character is not an uppercase letter. 1879: * @param cp the character 1880: * @return false 1881: */ 1882: static boolean isUpperCase(int cp) 1883: { 1884: return false; 1885: } 1886: 1887: /** 1888: * Returns false to indicate that the character is not a whitespace 1889: * character. 1890: * @param cp the character 1891: * @return false 1892: */ 1893: static boolean isWhiteSpace(int cp) 1894: { 1895: return false; 1896: } 1897: 1898: /** 1899: * Returns cp to indicate this character has no lowercase conversion. 1900: * @param cp the character 1901: * @return cp 1902: */ 1903: static int toLowerCase(int cp) 1904: { 1905: return cp; 1906: } 1907: 1908: /** 1909: * Returns cp to indicate this character has no titlecase conversion. 1910: * @param cp the character 1911: * @return cp 1912: */ 1913: static int toTitleCase(int cp) 1914: { 1915: return cp; 1916: } 1917: 1918: /** 1919: * Returns cp to indicate this character has no uppercase conversion. 1920: * @param cp the character 1921: * @return cp 1922: */ 1923: static int toUpperCase(int cp) 1924: { 1925: return cp; 1926: } 1927: } 1928: 1929: /** 1930: * The immutable value of this Character. 1931: * 1932: * @serial the value of this Character 1933: */ 1934: private final char value; 1935: 1936: /** 1937: * Compatible with JDK 1.0+. 1938: */ 1939: private static final long serialVersionUID = 3786198910865385080L; 1940: 1941: /** 1942: * Smallest value allowed for radix arguments in Java. This value is 2. 
1943: * 1944: * @see #digit(char, int) 1945: * @see #forDigit(int, int) 1946: * @see Integer#toString(int, int) 1947: * @see Integer#valueOf(String) 1948: */ 1949: public static final int MIN_RADIX = 2; 1950: 1951: /** 1952: * Largest value allowed for radix arguments in Java. This value is 36. 1953: * 1954: * @see #digit(char, int) 1955: * @see #forDigit(int, int) 1956: * @see Integer#toString(int, int) 1957: * @see Integer#valueOf(String) 1958: */ 1959: public static final int MAX_RADIX = 36; 1960: 1961: /** 1962: * The minimum value the char data type can hold. 1963: * This value is <code>'\\u0000'</code>. 1964: */ 1965: public static final char MIN_VALUE = '\u0000'; 1966: 1967: /** 1968: * The maximum value the char data type can hold. 1969: * This value is <code>'\\uFFFF'</code>. 1970: */ 1971: public static final char MAX_VALUE = '\uFFFF'; 1972: 1973: /** 1974: * The minimum Unicode 4.0 code point. This value is <code>0</code>. 1975: * @since 1.5 1976: */ 1977: public static final int MIN_CODE_POINT = 0; 1978: 1979: /** 1980: * The maximum Unicode 4.0 code point, which is greater than the range 1981: * of the char data type. 1982: * This value is <code>0x10FFFF</code>. 1983: * @since 1.5 1984: */ 1985: public static final int MAX_CODE_POINT = 0x10FFFF; 1986: 1987: /** 1988: * The minimum Unicode high surrogate code unit, or 1989: * <em>leading-surrogate</em>, in the UTF-16 character encoding. 1990: * This value is <code>'\uD800'</code>. 1991: * @since 1.5 1992: */ 1993: public static final char MIN_HIGH_SURROGATE = '\uD800'; 1994: 1995: /** 1996: * The maximum Unicode high surrogate code unit, or 1997: * <em>leading-surrogate</em>, in the UTF-16 character encoding. 1998: * This value is <code>'\uDBFF'</code>. 1999: * @since 1.5 2000: */ 2001: public static final char MAX_HIGH_SURROGATE = '\uDBFF'; 2002: 2003: /** 2004: * The minimum Unicode low surrogate code unit, or 2005: * <em>trailing-surrogate</em>, in the UTF-16 character encoding. 
2006: * This value is <code>'\uDC00'</code>. 2007: * @since 1.5 2008: */ 2009: public static final char MIN_LOW_SURROGATE = '\uDC00'; 2010: 2011: /** 2012: * The maximum Unicode low surrogate code unit, or 2013: * <em>trailing-surrogate</em>, in the UTF-16 character encoding. 2014: * This value is <code>'\uDFFF'</code>. 2015: * @since 1.5 2016: */ 2017: public static final char MAX_LOW_SURROGATE = '\uDFFF'; 2018: 2019: /** 2020: * The minimum Unicode surrogate code unit in the UTF-16 character encoding. 2021: * This value is <code>'\uD800'</code>. 2022: * @since 1.5 2023: */ 2024: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 2025: 2026: /** 2027: * The maximum Unicode surrogate code unit in the UTF-16 character encoding. 2028: * This value is <code>'\uDFFF'</code>. 2029: * @since 1.5 2030: */ 2031: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 2032: 2033: /** 2034: * The lowest possible supplementary Unicode code point (the first code 2035: * point outside the basic multilingual plane (BMP)). 2036: * This value is <code>0x10000</code>. 2037: */ 2038: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 2039: 2040: /** 2041: * Class object representing the primitive char data type. 2042: * 2043: * @since 1.1 2044: */ 2045: public static final Class<Character> TYPE = (Class<Character>) VMClassLoader.getPrimitiveClass('C'); 2046: 2047: /** 2048: * The number of bits needed to represent a <code>char</code>. 2049: * @since 1.5 2050: */ 2051: public static final int SIZE = 16; 2052: 2053: // This caches some Character values, and is used by boxing 2054: // conversions via valueOf(). We must cache at least 0..127; 2055: // this constant controls how much we actually cache. 
2056: private static final int MAX_CACHE = 127; 2057: private static Character[] charCache = new Character[MAX_CACHE + 1]; 2058: static 2059: { 2060: for (char i=0; i <= MAX_CACHE; i++) 2061: charCache[i] = new Character(i); 2062: } 2063: 2064: /** 2065: * Lu = Letter, Uppercase (Informative). 2066: * 2067: * @since 1.1 2068: */ 2069: public static final byte UPPERCASE_LETTER = 1; 2070: 2071: /** 2072: * Ll = Letter, Lowercase (Informative). 2073: * 2074: * @since 1.1 2075: */ 2076: public static final byte LOWERCASE_LETTER = 2; 2077: 2078: /** 2079: * Lt = Letter, Titlecase (Informative). 2080: * 2081: * @since 1.1 2082: */ 2083: public static final byte TITLECASE_LETTER = 3; 2084: 2085: /** 2086: * Mn = Mark, Non-Spacing (Normative). 2087: * 2088: * @since 1.1 2089: */ 2090: public static final byte NON_SPACING_MARK = 6; 2091: 2092: /** 2093: * Mc = Mark, Spacing Combining (Normative). 2094: * 2095: * @since 1.1 2096: */ 2097: public static final byte COMBINING_SPACING_MARK = 8; 2098: 2099: /** 2100: * Me = Mark, Enclosing (Normative). 2101: * 2102: * @since 1.1 2103: */ 2104: public static final byte ENCLOSING_MARK = 7; 2105: 2106: /** 2107: * Nd = Number, Decimal Digit (Normative). 2108: * 2109: * @since 1.1 2110: */ 2111: public static final byte DECIMAL_DIGIT_NUMBER = 9; 2112: 2113: /** 2114: * Nl = Number, Letter (Normative). 2115: * 2116: * @since 1.1 2117: */ 2118: public static final byte LETTER_NUMBER = 10; 2119: 2120: /** 2121: * No = Number, Other (Normative). 2122: * 2123: * @since 1.1 2124: */ 2125: public static final byte OTHER_NUMBER = 11; 2126: 2127: /** 2128: * Zs = Separator, Space (Normative). 2129: * 2130: * @since 1.1 2131: */ 2132: public static final byte SPACE_SEPARATOR = 12; 2133: 2134: /** 2135: * Zl = Separator, Line (Normative). 2136: * 2137: * @since 1.1 2138: */ 2139: public static final byte LINE_SEPARATOR = 13; 2140: 2141: /** 2142: * Zp = Separator, Paragraph (Normative). 
2143: * 2144: * @since 1.1 2145: */ 2146: public static final byte PARAGRAPH_SEPARATOR = 14; 2147: 2148: /** 2149: * Cc = Other, Control (Normative). 2150: * 2151: * @since 1.1 2152: */ 2153: public static final byte CONTROL = 15; 2154: 2155: /** 2156: * Cf = Other, Format (Normative). 2157: * 2158: * @since 1.1 2159: */ 2160: public static final byte FORMAT = 16; 2161: 2162: /** 2163: * Cs = Other, Surrogate (Normative). 2164: * 2165: * @since 1.1 2166: */ 2167: public static final byte SURROGATE = 19; 2168: 2169: /** 2170: * Co = Other, Private Use (Normative). 2171: * 2172: * @since 1.1 2173: */ 2174: public static final byte PRIVATE_USE = 18; 2175: 2176: /** 2177: * Cn = Other, Not Assigned (Normative). 2178: * 2179: * @since 1.1 2180: */ 2181: public static final byte UNASSIGNED = 0; 2182: 2183: /** 2184: * Lm = Letter, Modifier (Informative). 2185: * 2186: * @since 1.1 2187: */ 2188: public static final byte MODIFIER_LETTER = 4; 2189: 2190: /** 2191: * Lo = Letter, Other (Informative). 2192: * 2193: * @since 1.1 2194: */ 2195: public static final byte OTHER_LETTER = 5; 2196: 2197: /** 2198: * Pc = Punctuation, Connector (Informative). 2199: * 2200: * @since 1.1 2201: */ 2202: public static final byte CONNECTOR_PUNCTUATION = 23; 2203: 2204: /** 2205: * Pd = Punctuation, Dash (Informative). 2206: * 2207: * @since 1.1 2208: */ 2209: public static final byte DASH_PUNCTUATION = 20; 2210: 2211: /** 2212: * Ps = Punctuation, Open (Informative). 2213: * 2214: * @since 1.1 2215: */ 2216: public static final byte START_PUNCTUATION = 21; 2217: 2218: /** 2219: * Pe = Punctuation, Close (Informative). 2220: * 2221: * @since 1.1 2222: */ 2223: public static final byte END_PUNCTUATION = 22; 2224: 2225: /** 2226: * Pi = Punctuation, Initial Quote (Informative). 2227: * 2228: * @since 1.4 2229: */ 2230: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 2231: 2232: /** 2233: * Pf = Punctuation, Final Quote (Informative). 
2234: * 2235: * @since 1.4 2236: */ 2237: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 2238: 2239: /** 2240: * Po = Punctuation, Other (Informative). 2241: * 2242: * @since 1.1 2243: */ 2244: public static final byte OTHER_PUNCTUATION = 24; 2245: 2246: /** 2247: * Sm = Symbol, Math (Informative). 2248: * 2249: * @since 1.1 2250: */ 2251: public static final byte MATH_SYMBOL = 25; 2252: 2253: /** 2254: * Sc = Symbol, Currency (Informative). 2255: * 2256: * @since 1.1 2257: */ 2258: public static final byte CURRENCY_SYMBOL = 26; 2259: 2260: /** 2261: * Sk = Symbol, Modifier (Informative). 2262: * 2263: * @since 1.1 2264: */ 2265: public static final byte MODIFIER_SYMBOL = 27; 2266: 2267: /** 2268: * So = Symbol, Other (Informative). 2269: * 2270: * @since 1.1 2271: */ 2272: public static final byte OTHER_SYMBOL = 28; 2273: 2274: /** 2275: * Undefined bidirectional character type. Undefined char values have 2276: * undefined directionality in the Unicode specification. 2277: * 2278: * @since 1.4 2279: */ 2280: public static final byte DIRECTIONALITY_UNDEFINED = -1; 2281: 2282: /** 2283: * Strong bidirectional character type "L". 2284: * 2285: * @since 1.4 2286: */ 2287: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 2288: 2289: /** 2290: * Strong bidirectional character type "R". 2291: * 2292: * @since 1.4 2293: */ 2294: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 2295: 2296: /** 2297: * Strong bidirectional character type "AL". 2298: * 2299: * @since 1.4 2300: */ 2301: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 2302: 2303: /** 2304: * Weak bidirectional character type "EN". 2305: * 2306: * @since 1.4 2307: */ 2308: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 2309: 2310: /** 2311: * Weak bidirectional character type "ES". 
2312: * 2313: * @since 1.4 2314: */ 2315: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 2316: 2317: /** 2318: * Weak bidirectional character type "ET". 2319: * 2320: * @since 1.4 2321: */ 2322: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 2323: 2324: /** 2325: * Weak bidirectional character type "AN". 2326: * 2327: * @since 1.4 2328: */ 2329: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 2330: 2331: /** 2332: * Weak bidirectional character type "CS". 2333: * 2334: * @since 1.4 2335: */ 2336: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 2337: 2338: /** 2339: * Weak bidirectional character type "NSM". 2340: * 2341: * @since 1.4 2342: */ 2343: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 2344: 2345: /** 2346: * Weak bidirectional character type "BN". 2347: * 2348: * @since 1.4 2349: */ 2350: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 2351: 2352: /** 2353: * Neutral bidirectional character type "B". 2354: * 2355: * @since 1.4 2356: */ 2357: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 2358: 2359: /** 2360: * Neutral bidirectional character type "S". 2361: * 2362: * @since 1.4 2363: */ 2364: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 2365: 2366: /** 2367: * Neutral bidirectional character type "WS". 2368: * 2369: * @since 1.4 2370: */ 2371: public static final byte DIRECTIONALITY_WHITESPACE = 12; 2372: 2373: /** 2374: * Neutral bidirectional character type "ON". 2375: * 2376: * @since 1.4 2377: */ 2378: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 2379: 2380: /** 2381: * Strong bidirectional character type "LRE". 2382: * 2383: * @since 1.4 2384: */ 2385: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 2386: 2387: /** 2388: * Strong bidirectional character type "LRO". 
2389: * 2390: * @since 1.4 2391: */ 2392: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 2393: 2394: /** 2395: * Strong bidirectional character type "RLE". 2396: * 2397: * @since 1.4 2398: */ 2399: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 2400: 2401: /** 2402: * Strong bidirectional character type "RLO". 2403: * 2404: * @since 1.4 2405: */ 2406: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 2407: 2408: /** 2409: * Weak bidirectional character type "PDF". 2410: * 2411: * @since 1.4 2412: */ 2413: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2414: 2415: /** 2416: * Stores unicode block offset lookup table. Exploit package visibility of 2417: * String.value to avoid copying the array. 2418: * @see #readCodePoint(int) 2419: * @see CharData#BLOCKS 2420: */ 2421: private static final char[][] blocks = 2422: new char[][]{ 2423: String.zeroBasedStringValue(CharData.BLOCKS[0]), 2424: String.zeroBasedStringValue(CharData.BLOCKS[1]), 2425: String.zeroBasedStringValue(CharData.BLOCKS[2]), 2426: String.zeroBasedStringValue(CharData.BLOCKS[3]), 2427: String.zeroBasedStringValue(CharData.BLOCKS[4]), 2428: String.zeroBasedStringValue(CharData.BLOCKS[5]), 2429: String.zeroBasedStringValue(CharData.BLOCKS[6]), 2430: String.zeroBasedStringValue(CharData.BLOCKS[7]), 2431: String.zeroBasedStringValue(CharData.BLOCKS[8]), 2432: String.zeroBasedStringValue(CharData.BLOCKS[9]), 2433: String.zeroBasedStringValue(CharData.BLOCKS[10]), 2434: String.zeroBasedStringValue(CharData.BLOCKS[11]), 2435: String.zeroBasedStringValue(CharData.BLOCKS[12]), 2436: String.zeroBasedStringValue(CharData.BLOCKS[13]), 2437: String.zeroBasedStringValue(CharData.BLOCKS[14]), 2438: String.zeroBasedStringValue(CharData.BLOCKS[15]), 2439: String.zeroBasedStringValue(CharData.BLOCKS[16])}; 2440: 2441: /** 2442: * Stores unicode attribute offset lookup table. 
Exploit package visibility 2443: * of String.value to avoid copying the array. 2444: * @see CharData#DATA 2445: */ 2446: private static final char[][] data = 2447: new char[][]{ 2448: String.zeroBasedStringValue(CharData.DATA[0]), 2449: String.zeroBasedStringValue(CharData.DATA[1]), 2450: String.zeroBasedStringValue(CharData.DATA[2]), 2451: String.zeroBasedStringValue(CharData.DATA[3]), 2452: String.zeroBasedStringValue(CharData.DATA[4]), 2453: String.zeroBasedStringValue(CharData.DATA[5]), 2454: String.zeroBasedStringValue(CharData.DATA[6]), 2455: String.zeroBasedStringValue(CharData.DATA[7]), 2456: String.zeroBasedStringValue(CharData.DATA[8]), 2457: String.zeroBasedStringValue(CharData.DATA[9]), 2458: String.zeroBasedStringValue(CharData.DATA[10]), 2459: String.zeroBasedStringValue(CharData.DATA[11]), 2460: String.zeroBasedStringValue(CharData.DATA[12]), 2461: String.zeroBasedStringValue(CharData.DATA[13]), 2462: String.zeroBasedStringValue(CharData.DATA[14]), 2463: String.zeroBasedStringValue(CharData.DATA[15]), 2464: String.zeroBasedStringValue(CharData.DATA[16])}; 2465: 2466: /** 2467: * Stores unicode numeric value attribute table. Exploit package visibility 2468: * of String.value to avoid copying the array. 
* @see CharData#NUM_VALUE
 */
private static final char[][] numValue = new char[17][];
static {
  // One table per index 0 through 16, filled in ascending order.
  for (int plane = 0; plane < numValue.length; plane++) {
    numValue[plane] = String.zeroBasedStringValue(CharData.NUM_VALUE[plane]);
  }
}

/**
 * Stores unicode uppercase attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
* @see CharData#UPPER
 */
private static final char[][] upper = new char[17][];
static {
  // One table per index 0 through 16, filled in ascending order.
  for (int plane = 0; plane < upper.length; plane++) {
    upper[plane] = String.zeroBasedStringValue(CharData.UPPER[plane]);
  }
}

/**
 * Stores unicode lowercase attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
* @see CharData#LOWER
 */
private static final char[][] lower = new char[17][];
static {
  // One table per index 0 through 16, filled in ascending order.
  for (int plane = 0; plane < lower.length; plane++) {
    lower[plane] = String.zeroBasedStringValue(CharData.LOWER[plane]);
  }
}

/**
 * Stores unicode direction attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
 * @see CharData#DIRECTION
 */
// Package visible for use by String.
2547: static final char[][] direction = 2548: new char[][]{ 2549: String.zeroBasedStringValue(CharData.DIRECTION[0]), 2550: String.zeroBasedStringValue(CharData.DIRECTION[1]), 2551: String.zeroBasedStringValue(CharData.DIRECTION[2]), 2552: String.zeroBasedStringValue(CharData.DIRECTION[3]), 2553: String.zeroBasedStringValue(CharData.DIRECTION[4]), 2554: String.zeroBasedStringValue(CharData.DIRECTION[5]), 2555: String.zeroBasedStringValue(CharData.DIRECTION[6]), 2556: String.zeroBasedStringValue(CharData.DIRECTION[7]), 2557: String.zeroBasedStringValue(CharData.DIRECTION[8]), 2558: String.zeroBasedStringValue(CharData.DIRECTION[9]), 2559: String.zeroBasedStringValue(CharData.DIRECTION[10]), 2560: String.zeroBasedStringValue(CharData.DIRECTION[11]), 2561: String.zeroBasedStringValue(CharData.DIRECTION[12]), 2562: String.zeroBasedStringValue(CharData.DIRECTION[13]), 2563: String.zeroBasedStringValue(CharData.DIRECTION[14]), 2564: String.zeroBasedStringValue(CharData.DIRECTION[15]), 2565: String.zeroBasedStringValue(CharData.DIRECTION[16])}; 2566: 2567: /** 2568: * Stores unicode titlecase table. Exploit package visibility of 2569: * String.value to avoid copying the array. 2570: * @see CharData#TITLE 2571: */ 2572: private static final char[] title = String.zeroBasedStringValue(CharData.TITLE); 2573: 2574: /** 2575: * Mask for grabbing the type out of the contents of data. 2576: * @see CharData#DATA 2577: */ 2578: private static final int TYPE_MASK = 0x1F; 2579: 2580: /** 2581: * Mask for grabbing the non-breaking space flag out of the contents of 2582: * data. 2583: * @see CharData#DATA 2584: */ 2585: private static final int NO_BREAK_MASK = 0x20; 2586: 2587: /** 2588: * Mask for grabbing the mirrored directionality flag out of the contents 2589: * of data. 2590: * @see CharData#DATA 2591: */ 2592: private static final int MIRROR_MASK = 0x40; 2593: 2594: /** 2595: * Grabs an attribute offset from the Unicode attribute database. 
The lower 2596: * 5 bits are the character type, the next 2 bits are flags, and the top 2597: * 9 bits are the offset into the attribute tables. 2598: * 2599: * @param codePoint the character to look up 2600: * @return the character's attribute offset and type 2601: * @see #TYPE_MASK 2602: * @see #NO_BREAK_MASK 2603: * @see #MIRROR_MASK 2604: * @see CharData#DATA 2605: * @see CharData#SHIFT 2606: */ 2607: // Package visible for use in String. 2608: static char readCodePoint(int codePoint) 2609: { 2610: int plane = codePoint >>> 16; 2611: char offset = (char) (codePoint & 0xffff); 2612: return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)]; 2613: } 2614: 2615: /** 2616: * Wraps up a character. 2617: * 2618: * @param value the character to wrap 2619: */ 2620: public Character(char value) 2621: { 2622: this.value = value; 2623: } 2624: 2625: /** 2626: * Returns the character which has been wrapped by this class. 2627: * 2628: * @return the character wrapped 2629: */ 2630: public char charValue() 2631: { 2632: return value; 2633: } 2634: 2635: /** 2636: * Returns the numerical value (unsigned) of the wrapped character. 2637: * Range of returned values: 0x0000-0xFFFF. 2638: * 2639: * @return the value of the wrapped character 2640: */ 2641: public int hashCode() 2642: { 2643: return value; 2644: } 2645: 2646: /** 2647: * Determines if an object is equal to this object. This is only true for 2648: * another Character object wrapping the same value. 2649: * 2650: * @param o object to compare 2651: * @return true if o is a Character with the same value 2652: */ 2653: public boolean equals(Object o) 2654: { 2655: return o instanceof Character && value == ((Character) o).value; 2656: } 2657: 2658: /** 2659: * Converts the wrapped character into a String. 
2660: * 2661: * @return a String containing one character -- the wrapped character 2662: * of this instance 2663: */ 2664: public String toString() 2665: { 2666: // Package constructor avoids an array copy. 2667: return new String(new char[] { value }, 0, 1, true); 2668: } 2669: 2670: /** 2671: * Returns a String of length 1 representing the specified character. 2672: * 2673: * @param ch the character to convert 2674: * @return a String containing the character 2675: * @since 1.4 2676: */ 2677: public static String toString(char ch) 2678: { 2679: // Package constructor avoids an array copy. 2680: return new String(new char[] { ch }, 0, 1, true); 2681: } 2682: 2683: /** 2684: * Determines if a character is a Unicode lowercase letter. For example, 2685: * <code>'a'</code> is lowercase. Returns true if getType() returns 2686: * LOWERCASE_LETTER. 2687: * <br> 2688: * lowercase = [Ll] 2689: * 2690: * @param ch character to test 2691: * @return true if ch is a Unicode lowercase letter, else false 2692: * @see #isUpperCase(char) 2693: * @see #isTitleCase(char) 2694: * @see #toLowerCase(char) 2695: * @see #getType(char) 2696: */ 2697: public static boolean isLowerCase(char ch) 2698: { 2699: return isLowerCase((int)ch); 2700: } 2701: 2702: /** 2703: * Determines if a character is a Unicode lowercase letter. For example, 2704: * <code>'a'</code> is lowercase. Returns true if getType() returns 2705: * LOWERCASE_LETTER. 2706: * <br> 2707: * lowercase = [Ll] 2708: * 2709: * @param codePoint character to test 2710: * @return true if ch is a Unicode lowercase letter, else false 2711: * @see #isUpperCase(char) 2712: * @see #isTitleCase(char) 2713: * @see #toLowerCase(char) 2714: * @see #getType(char) 2715: * 2716: * @since 1.5 2717: */ 2718: public static boolean isLowerCase(int codePoint) 2719: { 2720: return getType(codePoint) == LOWERCASE_LETTER; 2721: } 2722: 2723: /** 2724: * Determines if a character is a Unicode uppercase letter. 
For example, 2725: * <code>'A'</code> is uppercase. Returns true if getType() returns 2726: * UPPERCASE_LETTER. 2727: * <br> 2728: * uppercase = [Lu] 2729: * 2730: * @param ch character to test 2731: * @return true if ch is a Unicode uppercase letter, else false 2732: * @see #isLowerCase(char) 2733: * @see #isTitleCase(char) 2734: * @see #toUpperCase(char) 2735: * @see #getType(char) 2736: */ 2737: public static boolean isUpperCase(char ch) 2738: { 2739: return isUpperCase((int)ch); 2740: } 2741: 2742: /** 2743: * Determines if a character is a Unicode uppercase letter. For example, 2744: * <code>'A'</code> is uppercase. Returns true if getType() returns 2745: * UPPERCASE_LETTER. 2746: * <br> 2747: * uppercase = [Lu] 2748: * 2749: * @param codePoint character to test 2750: * @return true if ch is a Unicode uppercase letter, else false 2751: * @see #isLowerCase(char) 2752: * @see #isTitleCase(char) 2753: * @see #toUpperCase(char) 2754: * @see #getType(char) 2755: * 2756: * @since 1.5 2757: */ 2758: public static boolean isUpperCase(int codePoint) 2759: { 2760: return getType(codePoint) == UPPERCASE_LETTER; 2761: } 2762: 2763: /** 2764: * Determines if a character is a Unicode titlecase letter. For example, 2765: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2766: * True if getType() returns TITLECASE_LETTER. 2767: * <br> 2768: * titlecase = [Lt] 2769: * 2770: * @param ch character to test 2771: * @return true if ch is a Unicode titlecase letter, else false 2772: * @see #isLowerCase(char) 2773: * @see #isUpperCase(char) 2774: * @see #toTitleCase(char) 2775: * @see #getType(char) 2776: */ 2777: public static boolean isTitleCase(char ch) 2778: { 2779: return isTitleCase((int)ch); 2780: } 2781: 2782: /** 2783: * Determines if a character is a Unicode titlecase letter. For example, 2784: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2785: * True if getType() returns TITLECASE_LETTER. 
2786: * <br> 2787: * titlecase = [Lt] 2788: * 2789: * @param codePoint character to test 2790: * @return true if ch is a Unicode titlecase letter, else false 2791: * @see #isLowerCase(char) 2792: * @see #isUpperCase(char) 2793: * @see #toTitleCase(char) 2794: * @see #getType(char) 2795: * 2796: * @since 1.5 2797: */ 2798: public static boolean isTitleCase(int codePoint) 2799: { 2800: return getType(codePoint) == TITLECASE_LETTER; 2801: } 2802: 2803: 2804: /** 2805: * Determines if a character is a Unicode decimal digit. For example, 2806: * <code>'0'</code> is a digit. A character is a Unicode digit if 2807: * getType() returns DECIMAL_DIGIT_NUMBER. 2808: * <br> 2809: * Unicode decimal digit = [Nd] 2810: * 2811: * @param ch character to test 2812: * @return true if ch is a Unicode decimal digit, else false 2813: * @see #digit(char, int) 2814: * @see #forDigit(int, int) 2815: * @see #getType(char) 2816: */ 2817: public static boolean isDigit(char ch) 2818: { 2819: return isDigit((int)ch); 2820: } 2821: 2822: /** 2823: * Determines if a character is a Unicode decimal digit. For example, 2824: * <code>'0'</code> is a digit. A character is a Unicode digit if 2825: * getType() returns DECIMAL_DIGIT_NUMBER. 2826: * <br> 2827: * Unicode decimal digit = [Nd] 2828: * 2829: * @param codePoint character to test 2830: * @return true if ch is a Unicode decimal digit, else false 2831: * @see #digit(char, int) 2832: * @see #forDigit(int, int) 2833: * @see #getType(char) 2834: * 2835: * @since 1.5 2836: */ 2837: 2838: public static boolean isDigit(int codePoint) 2839: { 2840: return getType(codePoint) == DECIMAL_DIGIT_NUMBER; 2841: } 2842: 2843: /** 2844: * Determines if a character is part of the Unicode Standard. This is an 2845: * evolving standard, but covers every character in the data file. 
2846: * <br> 2847: * defined = not [Cn] 2848: * 2849: * @param ch character to test 2850: * @return true if ch is a Unicode character, else false 2851: * @see #isDigit(char) 2852: * @see #isLetter(char) 2853: * @see #isLetterOrDigit(char) 2854: * @see #isLowerCase(char) 2855: * @see #isTitleCase(char) 2856: * @see #isUpperCase(char) 2857: */ 2858: public static boolean isDefined(char ch) 2859: { 2860: return isDefined((int)ch); 2861: } 2862: 2863: /** 2864: * Determines if a character is part of the Unicode Standard. This is an 2865: * evolving standard, but covers every character in the data file. 2866: * <br> 2867: * defined = not [Cn] 2868: * 2869: * @param codePoint character to test 2870: * @return true if ch is a Unicode character, else false 2871: * @see #isDigit(char) 2872: * @see #isLetter(char) 2873: * @see #isLetterOrDigit(char) 2874: * @see #isLowerCase(char) 2875: * @see #isTitleCase(char) 2876: * @see #isUpperCase(char) 2877: * 2878: * @since 1.5 2879: */ 2880: public static boolean isDefined(int codePoint) 2881: { 2882: return getType(codePoint) != UNASSIGNED; 2883: } 2884: 2885: /** 2886: * Determines if a character is a Unicode letter. Not all letters have case, 2887: * so this may return true when isLowerCase and isUpperCase return false. 2888: * A character is a Unicode letter if getType() returns one of 2889: * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2890: * or OTHER_LETTER. 
2891: * <br> 2892: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2893: * 2894: * @param ch character to test 2895: * @return true if ch is a Unicode letter, else false 2896: * @see #isDigit(char) 2897: * @see #isJavaIdentifierStart(char) 2898: * @see #isJavaLetter(char) 2899: * @see #isJavaLetterOrDigit(char) 2900: * @see #isLetterOrDigit(char) 2901: * @see #isLowerCase(char) 2902: * @see #isTitleCase(char) 2903: * @see #isUnicodeIdentifierStart(char) 2904: * @see #isUpperCase(char) 2905: */ 2906: public static boolean isLetter(char ch) 2907: { 2908: return isLetter((int)ch); 2909: } 2910: 2911: /** 2912: * Determines if a character is a Unicode letter. Not all letters have case, 2913: * so this may return true when isLowerCase and isUpperCase return false. 2914: * A character is a Unicode letter if getType() returns one of 2915: * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2916: * or OTHER_LETTER. 2917: * <br> 2918: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2919: * 2920: * @param codePoint character to test 2921: * @return true if ch is a Unicode letter, else false 2922: * @see #isDigit(char) 2923: * @see #isJavaIdentifierStart(char) 2924: * @see #isJavaLetter(char) 2925: * @see #isJavaLetterOrDigit(char) 2926: * @see #isLetterOrDigit(char) 2927: * @see #isLowerCase(char) 2928: * @see #isTitleCase(char) 2929: * @see #isUnicodeIdentifierStart(char) 2930: * @see #isUpperCase(char) 2931: * 2932: * @since 1.5 2933: */ 2934: public static boolean isLetter(int codePoint) 2935: { 2936: return ((1 << getType(codePoint)) 2937: & ((1 << UPPERCASE_LETTER) 2938: | (1 << LOWERCASE_LETTER) 2939: | (1 << TITLECASE_LETTER) 2940: | (1 << MODIFIER_LETTER) 2941: | (1 << OTHER_LETTER))) != 0; 2942: } 2943: /** 2944: * Returns the index into the given CharSequence that is offset 2945: * <code>codePointOffset</code> code points from <code>index</code>. 
2946: * @param seq the CharSequence 2947: * @param index the start position in the CharSequence 2948: * @param codePointOffset the number of code points offset from the start 2949: * position 2950: * @return the index into the CharSequence that is codePointOffset code 2951: * points offset from index 2952: * 2953: * @throws NullPointerException if seq is null 2954: * @throws IndexOutOfBoundsException if index is negative or greater than the 2955: * length of the sequence. 2956: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 2957: * subsequence from index to the end of seq has fewer than codePointOffset 2958: * code points 2959: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 2960: * subsequence from the start of seq to index has fewer than 2961: * (-codePointOffset) code points 2962: * @since 1.5 2963: */ 2964: public static int offsetByCodePoints(CharSequence seq, 2965: int index, 2966: int codePointOffset) 2967: { 2968: int len = seq.length(); 2969: if (index < 0 || index > len) 2970: throw new IndexOutOfBoundsException(); 2971: 2972: int numToGo = codePointOffset; 2973: int offset = index; 2974: int adjust = 1; 2975: if (numToGo >= 0) 2976: { 2977: for (; numToGo > 0; offset++) 2978: { 2979: numToGo--; 2980: if (Character.isHighSurrogate(seq.charAt(offset)) 2981: && (offset + 1) < len 2982: && Character.isLowSurrogate(seq.charAt(offset + 1))) 2983: offset++; 2984: } 2985: return offset; 2986: } 2987: else 2988: { 2989: numToGo *= -1; 2990: for (; numToGo > 0;) 2991: { 2992: numToGo--; 2993: offset--; 2994: if (Character.isLowSurrogate(seq.charAt(offset)) 2995: && (offset - 1) >= 0 2996: && Character.isHighSurrogate(seq.charAt(offset - 1))) 2997: offset--; 2998: } 2999: return offset; 3000: } 3001: } 3002: 3003: /** 3004: * Returns the index into the given char subarray that is offset 3005: * <code>codePointOffset</code> code points from <code>index</code>. 
3006: * @param a the char array 3007: * @param start the start index of the subarray 3008: * @param count the length of the subarray 3009: * @param index the index to be offset 3010: * @param codePointOffset the number of code points offset from <code>index 3011: * </code> 3012: * @return the index into the char array 3013: * 3014: * @throws NullPointerException if a is null 3015: * @throws IndexOutOfBoundsException if start or count is negative or if 3016: * start + count is greater than the length of the array 3017: * @throws IndexOutOfBoundsException if index is less than start or larger 3018: * than start + count 3019: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 3020: * subarray from index to start + count - 1 has fewer than codePointOffset 3021: * code points. 3022: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 3023: * subarray from start to index - 1 has fewer than (-codePointOffset) code 3024: * points 3025: * 3026: * @since 1.5 3027: */ 3028: public static int offsetByCodePoints(char[] a, 3029: int start, 3030: int count, 3031: int index, 3032: int codePointOffset) 3033: { 3034: int len = a.length; 3035: int end = start + count; 3036: if (start < 0 || count < 0 || end > len || index < start || index > end) 3037: throw new IndexOutOfBoundsException(); 3038: 3039: int numToGo = codePointOffset; 3040: int offset = index; 3041: int adjust = 1; 3042: if (numToGo >= 0) 3043: { 3044: for (; numToGo > 0; offset++) 3045: { 3046: numToGo--; 3047: if (Character.isHighSurrogate(a[offset]) 3048: && (offset + 1) < len 3049: && Character.isLowSurrogate(a[offset + 1])) 3050: offset++; 3051: } 3052: return offset; 3053: } 3054: else 3055: { 3056: numToGo *= -1; 3057: for (; numToGo > 0;) 3058: { 3059: numToGo--; 3060: offset--; 3061: if (Character.isLowSurrogate(a[offset]) 3062: && (offset - 1) >= 0 3063: && Character.isHighSurrogate(a[offset - 1])) 3064: offset--; 3065: if (offset < start) 3066: throw new 
IndexOutOfBoundsException(); 3067: } 3068: return offset; 3069: } 3070: 3071: } 3072: 3073: /** 3074: * Returns the number of Unicode code points in the specified range of the 3075: * given CharSequence. The first char in the range is at position 3076: * beginIndex and the last one is at position endIndex - 1. Paired 3077: * surrogates (supplementary characters are represented by a pair of chars - 3078: * one from the high surrogates and one from the low surrogates) 3079: * count as just one code point. 3080: * @param seq the CharSequence to inspect 3081: * @param beginIndex the beginning of the range 3082: * @param endIndex the end of the range 3083: * @return the number of Unicode code points in the given range of the 3084: * sequence 3085: * @throws NullPointerException if seq is null 3086: * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is 3087: * larger than the length of seq, or if beginIndex is greater than endIndex. 3088: * @since 1.5 3089: */ 3090: public static int codePointCount(CharSequence seq, int beginIndex, 3091: int endIndex) 3092: { 3093: int len = seq.length(); 3094: if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) 3095: throw new IndexOutOfBoundsException(); 3096: 3097: int count = 0; 3098: for (int i = beginIndex; i < endIndex; i++) 3099: { 3100: count++; 3101: // If there is a pairing, count it only once. 3102: if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex 3103: && isLowSurrogate(seq.charAt(i + 1))) 3104: i ++; 3105: } 3106: return count; 3107: } 3108: 3109: /** 3110: * Returns the number of Unicode code points in the specified range of the 3111: * given char array. The first char in the range is at position 3112: * offset and the length of the range is count. Paired surrogates 3113: * (supplementary characters are represented by a pair of chars - 3114: * one from the high surrogates and one from the low surrogates) 3115: * count as just one code point. 
3116: * @param a the char array to inspect 3117: * @param offset the beginning of the range 3118: * @param count the length of the range 3119: * @return the number of Unicode code points in the given range of the 3120: * array 3121: * @throws NullPointerException if a is null 3122: * @throws IndexOutOfBoundsException if offset or count is negative or if 3123: * offset + countendIndex is larger than the length of a. 3124: * @since 1.5 3125: */ 3126: public static int codePointCount(char[] a, int offset, 3127: int count) 3128: { 3129: int len = a.length; 3130: int end = offset + count; 3131: if (offset < 0 || count < 0 || end > len) 3132: throw new IndexOutOfBoundsException(); 3133: 3134: int counter = 0; 3135: for (int i = offset; i < end; i++) 3136: { 3137: counter++; 3138: // If there is a pairing, count it only once. 3139: if (isHighSurrogate(a[i]) && (i + 1) < end 3140: && isLowSurrogate(a[i + 1])) 3141: i ++; 3142: } 3143: return counter; 3144: } 3145: 3146: /** 3147: * Determines if a character is a Unicode letter or a Unicode digit. This 3148: * is the combination of isLetter and isDigit. 3149: * <br> 3150: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3151: * 3152: * @param ch character to test 3153: * @return true if ch is a Unicode letter or a Unicode digit, else false 3154: * @see #isDigit(char) 3155: * @see #isJavaIdentifierPart(char) 3156: * @see #isJavaLetter(char) 3157: * @see #isJavaLetterOrDigit(char) 3158: * @see #isLetter(char) 3159: * @see #isUnicodeIdentifierPart(char) 3160: */ 3161: public static boolean isLetterOrDigit(char ch) 3162: { 3163: return isLetterOrDigit((int)ch); 3164: } 3165: 3166: /** 3167: * Determines if a character is a Unicode letter or a Unicode digit. This 3168: * is the combination of isLetter and isDigit. 
3169: * <br> 3170: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3171: * 3172: * @param codePoint character to test 3173: * @return true if ch is a Unicode letter or a Unicode digit, else false 3174: * @see #isDigit(char) 3175: * @see #isJavaIdentifierPart(char) 3176: * @see #isJavaLetter(char) 3177: * @see #isJavaLetterOrDigit(char) 3178: * @see #isLetter(char) 3179: * @see #isUnicodeIdentifierPart(char) 3180: * 3181: * @since 1.5 3182: */ 3183: public static boolean isLetterOrDigit(int codePoint) 3184: { 3185: return ((1 << getType(codePoint)) 3186: & ((1 << UPPERCASE_LETTER) 3187: | (1 << LOWERCASE_LETTER) 3188: | (1 << TITLECASE_LETTER) 3189: | (1 << MODIFIER_LETTER) 3190: | (1 << OTHER_LETTER) 3191: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 3192: } 3193: 3194: /** 3195: * Determines if a character can start a Java identifier. This is the 3196: * combination of isLetter, any character where getType returns 3197: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3198: * (like '_'). 3199: * 3200: * @param ch character to test 3201: * @return true if ch can start a Java identifier, else false 3202: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 3203: * @see #isJavaLetterOrDigit(char) 3204: * @see #isJavaIdentifierStart(char) 3205: * @see #isJavaIdentifierPart(char) 3206: * @see #isLetter(char) 3207: * @see #isLetterOrDigit(char) 3208: * @see #isUnicodeIdentifierStart(char) 3209: */ 3210: public static boolean isJavaLetter(char ch) 3211: { 3212: return isJavaIdentifierStart(ch); 3213: } 3214: 3215: /** 3216: * Determines if a character can follow the first letter in 3217: * a Java identifier. This is the combination of isJavaLetter (isLetter, 3218: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3219: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3220: * or isIdentifierIgnorable. 
3221: * 3222: * @param ch character to test 3223: * @return true if ch can follow the first letter in a Java identifier 3224: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 3225: * @see #isJavaLetter(char) 3226: * @see #isJavaIdentifierStart(char) 3227: * @see #isJavaIdentifierPart(char) 3228: * @see #isLetter(char) 3229: * @see #isLetterOrDigit(char) 3230: * @see #isUnicodeIdentifierPart(char) 3231: * @see #isIdentifierIgnorable(char) 3232: */ 3233: public static boolean isJavaLetterOrDigit(char ch) 3234: { 3235: return isJavaIdentifierPart(ch); 3236: } 3237: 3238: /** 3239: * Determines if a character can start a Java identifier. This is the 3240: * combination of isLetter, any character where getType returns 3241: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3242: * (like '_'). 3243: * <br> 3244: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3245: * 3246: * @param ch character to test 3247: * @return true if ch can start a Java identifier, else false 3248: * @see #isJavaIdentifierPart(char) 3249: * @see #isLetter(char) 3250: * @see #isUnicodeIdentifierStart(char) 3251: * @since 1.1 3252: */ 3253: public static boolean isJavaIdentifierStart(char ch) 3254: { 3255: return isJavaIdentifierStart((int)ch); 3256: } 3257: 3258: /** 3259: * Determines if a character can start a Java identifier. This is the 3260: * combination of isLetter, any character where getType returns 3261: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3262: * (like '_'). 
3263: * <br> 3264: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3265: * 3266: * @param codePoint character to test 3267: * @return true if ch can start a Java identifier, else false 3268: * @see #isJavaIdentifierPart(char) 3269: * @see #isLetter(char) 3270: * @see #isUnicodeIdentifierStart(char) 3271: * @since 1.5 3272: */ 3273: public static boolean isJavaIdentifierStart(int codePoint) 3274: { 3275: return ((1 << getType(codePoint)) 3276: & ((1 << UPPERCASE_LETTER) 3277: | (1 << LOWERCASE_LETTER) 3278: | (1 << TITLECASE_LETTER) 3279: | (1 << MODIFIER_LETTER) 3280: | (1 << OTHER_LETTER) 3281: | (1 << LETTER_NUMBER) 3282: | (1 << CURRENCY_SYMBOL) 3283: | (1 << CONNECTOR_PUNCTUATION))) != 0; 3284: } 3285: 3286: /** 3287: * Determines if a character can follow the first letter in 3288: * a Java identifier. This is the combination of isJavaLetter (isLetter, 3289: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3290: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3291: * or isIdentifierIgnorable. 3292: * <br> 3293: * Java identifier extender = 3294: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3295: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3296: * 3297: * @param ch character to test 3298: * @return true if ch can follow the first letter in a Java identifier 3299: * @see #isIdentifierIgnorable(char) 3300: * @see #isJavaIdentifierStart(char) 3301: * @see #isLetterOrDigit(char) 3302: * @see #isUnicodeIdentifierPart(char) 3303: * @since 1.1 3304: */ 3305: public static boolean isJavaIdentifierPart(char ch) 3306: { 3307: return isJavaIdentifierPart((int)ch); 3308: } 3309: 3310: /** 3311: * Determines if a character can follow the first letter in 3312: * a Java identifier. 
This is the combination of isJavaLetter (isLetter, 3313: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3314: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3315: * or isIdentifierIgnorable. 3316: * <br> 3317: * Java identifier extender = 3318: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3319: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3320: * 3321: * @param codePoint character to test 3322: * @return true if ch can follow the first letter in a Java identifier 3323: * @see #isIdentifierIgnorable(char) 3324: * @see #isJavaIdentifierStart(char) 3325: * @see #isLetterOrDigit(char) 3326: * @see #isUnicodeIdentifierPart(char) 3327: * @since 1.5 3328: */ 3329: public static boolean isJavaIdentifierPart(int codePoint) 3330: { 3331: int category = getType(codePoint); 3332: return ((1 << category) 3333: & ((1 << UPPERCASE_LETTER) 3334: | (1 << LOWERCASE_LETTER) 3335: | (1 << TITLECASE_LETTER) 3336: | (1 << MODIFIER_LETTER) 3337: | (1 << OTHER_LETTER) 3338: | (1 << NON_SPACING_MARK) 3339: | (1 << COMBINING_SPACING_MARK) 3340: | (1 << DECIMAL_DIGIT_NUMBER) 3341: | (1 << LETTER_NUMBER) 3342: | (1 << CURRENCY_SYMBOL) 3343: | (1 << CONNECTOR_PUNCTUATION) 3344: | (1 << FORMAT))) != 0 3345: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3346: } 3347: 3348: /** 3349: * Determines if a character can start a Unicode identifier. Only 3350: * letters can start a Unicode identifier, but this includes characters 3351: * in LETTER_NUMBER. 
3352: * <br> 3353: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3354: * 3355: * @param ch character to test 3356: * @return true if ch can start a Unicode identifier, else false 3357: * @see #isJavaIdentifierStart(char) 3358: * @see #isLetter(char) 3359: * @see #isUnicodeIdentifierPart(char) 3360: * @since 1.1 3361: */ 3362: public static boolean isUnicodeIdentifierStart(char ch) 3363: { 3364: return isUnicodeIdentifierStart((int)ch); 3365: } 3366: 3367: /** 3368: * Determines if a character can start a Unicode identifier. Only 3369: * letters can start a Unicode identifier, but this includes characters 3370: * in LETTER_NUMBER. 3371: * <br> 3372: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3373: * 3374: * @param codePoint character to test 3375: * @return true if ch can start a Unicode identifier, else false 3376: * @see #isJavaIdentifierStart(char) 3377: * @see #isLetter(char) 3378: * @see #isUnicodeIdentifierPart(char) 3379: * @since 1.5 3380: */ 3381: public static boolean isUnicodeIdentifierStart(int codePoint) 3382: { 3383: return ((1 << getType(codePoint)) 3384: & ((1 << UPPERCASE_LETTER) 3385: | (1 << LOWERCASE_LETTER) 3386: | (1 << TITLECASE_LETTER) 3387: | (1 << MODIFIER_LETTER) 3388: | (1 << OTHER_LETTER) 3389: | (1 << LETTER_NUMBER))) != 0; 3390: } 3391: 3392: /** 3393: * Determines if a character can follow the first letter in 3394: * a Unicode identifier. This includes letters, connecting punctuation, 3395: * digits, numeric letters, combining marks, non-spacing marks, and 3396: * isIdentifierIgnorable. 
3397: * <br> 3398: * Unicode identifier extender = 3399: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3400: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3401: * 3402: * @param ch character to test 3403: * @return true if ch can follow the first letter in a Unicode identifier 3404: * @see #isIdentifierIgnorable(char) 3405: * @see #isJavaIdentifierPart(char) 3406: * @see #isLetterOrDigit(char) 3407: * @see #isUnicodeIdentifierStart(char) 3408: * @since 1.1 3409: */ 3410: public static boolean isUnicodeIdentifierPart(char ch) 3411: { 3412: return isUnicodeIdentifierPart((int)ch); 3413: } 3414: 3415: /** 3416: * Determines if a character can follow the first letter in 3417: * a Unicode identifier. This includes letters, connecting punctuation, 3418: * digits, numeric letters, combining marks, non-spacing marks, and 3419: * isIdentifierIgnorable. 3420: * <br> 3421: * Unicode identifier extender = 3422: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3423: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3424: * 3425: * @param codePoint character to test 3426: * @return true if ch can follow the first letter in a Unicode identifier 3427: * @see #isIdentifierIgnorable(char) 3428: * @see #isJavaIdentifierPart(char) 3429: * @see #isLetterOrDigit(char) 3430: * @see #isUnicodeIdentifierStart(char) 3431: * @since 1.5 3432: */ 3433: public static boolean isUnicodeIdentifierPart(int codePoint) 3434: { 3435: int category = getType(codePoint); 3436: return ((1 << category) 3437: & ((1 << UPPERCASE_LETTER) 3438: | (1 << LOWERCASE_LETTER) 3439: | (1 << TITLECASE_LETTER) 3440: | (1 << MODIFIER_LETTER) 3441: | (1 << OTHER_LETTER) 3442: | (1 << NON_SPACING_MARK) 3443: | (1 << COMBINING_SPACING_MARK) 3444: | (1 << DECIMAL_DIGIT_NUMBER) 3445: | (1 << LETTER_NUMBER) 3446: | (1 << CONNECTOR_PUNCTUATION) 3447: | (1 << FORMAT))) != 0 3448: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3449: } 3450: 3451: /** 3452: * Determines if a character is ignorable in 
a Unicode identifier. This 3453: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3454: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3455: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3456: * <code>'\u009F'</code>), and FORMAT characters. 3457: * <br> 3458: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3459: * |U+007F-U+009F 3460: * 3461: * @param ch character to test 3462: * @return true if ch is ignorable in a Unicode or Java identifier 3463: * @see #isJavaIdentifierPart(char) 3464: * @see #isUnicodeIdentifierPart(char) 3465: * @since 1.1 3466: */ 3467: public static boolean isIdentifierIgnorable(char ch) 3468: { 3469: return isIdentifierIgnorable((int)ch); 3470: } 3471: 3472: /** 3473: * Determines if a character is ignorable in a Unicode identifier. This 3474: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3475: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3476: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3477: * <code>'\u009F'</code>), and FORMAT characters. 3478: * <br> 3479: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3480: * |U+007F-U+009F 3481: * 3482: * @param codePoint character to test 3483: * @return true if ch is ignorable in a Unicode or Java identifier 3484: * @see #isJavaIdentifierPart(char) 3485: * @see #isUnicodeIdentifierPart(char) 3486: * @since 1.5 3487: */ 3488: public static boolean isIdentifierIgnorable(int codePoint) 3489: { 3490: if ((codePoint >= 0 && codePoint <= 0x0008) 3491: || (codePoint >= 0x000E && codePoint <= 0x001B) 3492: || (codePoint >= 0x007F && codePoint <= 0x009F) 3493: || getType(codePoint) == FORMAT) 3494: return true; 3495: return false; 3496: } 3497: 3498: /** 3499: * Converts a Unicode character into its lowercase equivalent mapping. 3500: * If a mapping does not exist, then the character passed is returned. 
3501: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3502: * 3503: * @param ch character to convert to lowercase 3504: * @return lowercase mapping of ch, or ch if lowercase mapping does 3505: * not exist 3506: * @see #isLowerCase(char) 3507: * @see #isUpperCase(char) 3508: * @see #toTitleCase(char) 3509: * @see #toUpperCase(char) 3510: */ 3511: public static char toLowerCase(char ch) 3512: { 3513: return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch); 3514: } 3515: 3516: /** 3517: * Converts a Unicode character into its lowercase equivalent mapping. 3518: * If a mapping does not exist, then the character passed is returned. 3519: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3520: * 3521: * @param codePoint character to convert to lowercase 3522: * @return lowercase mapping of ch, or ch if lowercase mapping does 3523: * not exist 3524: * @see #isLowerCase(char) 3525: * @see #isUpperCase(char) 3526: * @see #toTitleCase(char) 3527: * @see #toUpperCase(char) 3528: * 3529: * @since 1.5 3530: */ 3531: public static int toLowerCase(int codePoint) 3532: { 3533: // If the code point is unassigned or in one of the private use areas 3534: // then we delegate the call to the appropriate private static inner class. 3535: int plane = codePoint >>> 16; 3536: if (plane > 2 && plane < 14) 3537: return UnassignedCharacters.toLowerCase(codePoint); 3538: if (plane > 14) 3539: return PrivateUseCharacters.toLowerCase(codePoint); 3540: 3541: // The short value stored in lower[plane] is the signed difference between 3542: // codePoint and its lowercase conversion. 3543: return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3544: } 3545: 3546: /** 3547: * Converts a Unicode character into its uppercase equivalent mapping. 3548: * If a mapping does not exist, then the character passed is returned. 3549: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
3550: * 3551: * @param ch character to convert to uppercase 3552: * @return uppercase mapping of ch, or ch if uppercase mapping does 3553: * not exist 3554: * @see #isLowerCase(char) 3555: * @see #isUpperCase(char) 3556: * @see #toLowerCase(char) 3557: * @see #toTitleCase(char) 3558: */ 3559: public static char toUpperCase(char ch) 3560: { 3561: return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch); 3562: } 3563: 3564: /** 3565: * Converts a Unicode character into its uppercase equivalent mapping. 3566: * If a mapping does not exist, then the character passed is returned. 3567: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 3568: * 3569: * @param codePoint character to convert to uppercase 3570: * @return uppercase mapping of ch, or ch if uppercase mapping does 3571: * not exist 3572: * @see #isLowerCase(char) 3573: * @see #isUpperCase(char) 3574: * @see #toLowerCase(char) 3575: * @see #toTitleCase(char) 3576: * 3577: * @since 1.5 3578: */ 3579: public static int toUpperCase(int codePoint) 3580: { 3581: // If the code point is unassigned or in one of the private use areas 3582: // then we delegate the call to the appropriate private static inner class. 3583: int plane = codePoint >>> 16; 3584: if (plane > 2 && plane < 14) 3585: return UnassignedCharacters.toUpperCase(codePoint); 3586: if (plane > 14) 3587: return PrivateUseCharacters.toUpperCase(codePoint); 3588: 3589: // The short value stored in upper[plane] is the signed difference between 3590: // codePoint and its uppercase conversion. 3591: return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3592: } 3593: 3594: /** 3595: * Converts a Unicode character into its titlecase equivalent mapping. 3596: * If a mapping does not exist, then the character passed is returned. 3597: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 
3598: * 3599: * @param ch character to convert to titlecase 3600: * @return titlecase mapping of ch, or ch if titlecase mapping does 3601: * not exist 3602: * @see #isTitleCase(char) 3603: * @see #toLowerCase(char) 3604: * @see #toUpperCase(char) 3605: */ 3606: public static char toTitleCase(char ch) 3607: { 3608: // As title is short, it doesn't hurt to exhaustively iterate over it. 3609: for (int i = title.length - 2; i >= 0; i -= 2) 3610: if (title[i] == ch) 3611: return title[i + 1]; 3612: return toUpperCase(ch); 3613: } 3614: 3615: /** 3616: * Converts a Unicode character into its titlecase equivalent mapping. 3617: * If a mapping does not exist, then the character passed is returned. 3618: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 3619: * 3620: * @param codePoint character to convert to titlecase 3621: * @return titlecase mapping of ch, or ch if titlecase mapping does 3622: * not exist 3623: * @see #isTitleCase(char) 3624: * @see #toLowerCase(char) 3625: * @see #toUpperCase(char) 3626: * 3627: * @since 1.5 3628: */ 3629: public static int toTitleCase(int codePoint) 3630: { 3631: // As of Unicode 4.0.0 no characters outside of plane 0 have 3632: // titlecase mappings that are different from their uppercase 3633: // mapping. 3634: if (codePoint < 0x10000) 3635: return (int) toTitleCase((char)codePoint); 3636: return toUpperCase(codePoint); 3637: } 3638: 3639: /** 3640: * Converts a character into a digit of the specified radix. If the radix 3641: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3642: * exceeds the radix, or if ch is not a decimal digit or in the case 3643: * insensitive set of 'a'-'z', the result is -1. 
3644: * <br> 3645: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3646: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3647: * 3648: * @param ch character to convert into a digit 3649: * @param radix radix in which ch is a digit 3650: * @return digit which ch represents in radix, or -1 not a valid digit 3651: * @see #MIN_RADIX 3652: * @see #MAX_RADIX 3653: * @see #forDigit(int, int) 3654: * @see #isDigit(char) 3655: * @see #getNumericValue(char) 3656: */ 3657: public static int digit(char ch, int radix) 3658: { 3659: if (radix < MIN_RADIX || radix > MAX_RADIX) 3660: return -1; 3661: char attr = readCodePoint((int)ch); 3662: if (((1 << (attr & TYPE_MASK)) 3663: & ((1 << UPPERCASE_LETTER) 3664: | (1 << LOWERCASE_LETTER) 3665: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3666: { 3667: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3668: int digit = numValue[0][attr >> 7]; 3669: return (digit < radix) ? digit : -1; 3670: } 3671: return -1; 3672: } 3673: 3674: /** 3675: * Converts a character into a digit of the specified radix. If the radix 3676: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3677: * exceeds the radix, or if ch is not a decimal digit or in the case 3678: * insensitive set of 'a'-'z', the result is -1. 
3679: * <br> 3680: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3681: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3682: * 3683: * @param codePoint character to convert into a digit 3684: * @param radix radix in which ch is a digit 3685: * @return digit which ch represents in radix, or -1 not a valid digit 3686: * @see #MIN_RADIX 3687: * @see #MAX_RADIX 3688: * @see #forDigit(int, int) 3689: * @see #isDigit(char) 3690: * @see #getNumericValue(char) 3691: */ 3692: public static int digit(int codePoint, int radix) 3693: { 3694: if (radix < MIN_RADIX || radix > MAX_RADIX) 3695: return -1; 3696: 3697: // If the code point is unassigned or in one of the private use areas 3698: // then we delegate the call to the appropriate private static inner class. 3699: int plane = codePoint >>> 16; 3700: if (plane > 2 && plane < 14) 3701: return UnassignedCharacters.digit(codePoint, radix); 3702: if (plane > 14) 3703: return PrivateUseCharacters.digit(codePoint, radix); 3704: char attr = readCodePoint(codePoint); 3705: if (((1 << (attr & TYPE_MASK)) 3706: & ((1 << UPPERCASE_LETTER) 3707: | (1 << LOWERCASE_LETTER) 3708: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3709: { 3710: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3711: int digit = numValue[plane][attr >> 7]; 3712: 3713: // If digit is less than or equal to -3 then the numerical value was 3714: // too large to fit into numValue and is stored in CharData.LARGENUMS. 3715: if (digit <= -3) 3716: digit = CharData.LARGENUMS[-digit - 3]; 3717: return (digit < radix) ? digit : -1; 3718: } 3719: return -1; 3720: } 3721: 3722: /** 3723: * Returns the Unicode numeric value property of a character. For example, 3724: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 
3725: * 3726: * <p>This method also returns values for the letters A through Z, (not 3727: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3728: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3729: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3730: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3731: * <code>'\uFF5A'</code> (full width variants). 3732: * 3733: * <p>If the character lacks a numeric value property, -1 is returned. 3734: * If the character has a numeric value property which is not representable 3735: * as a nonnegative integer, such as a fraction, -2 is returned. 3736: * 3737: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3738: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3739: * 3740: * @param ch character from which the numeric value property will 3741: * be retrieved 3742: * @return the numeric value property of ch, or -1 if it does not exist, or 3743: * -2 if it is not representable as a nonnegative integer 3744: * @see #forDigit(int, int) 3745: * @see #digit(char, int) 3746: * @see #isDigit(char) 3747: * @since 1.1 3748: */ 3749: public static int getNumericValue(char ch) 3750: { 3751: // Treat numValue as signed. 3752: return (short) numValue[0][readCodePoint((int)ch) >> 7]; 3753: } 3754: 3755: /** 3756: * Returns the Unicode numeric value property of a character. For example, 3757: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 3758: * 3759: * <p>This method also returns values for the letters A through Z, (not 3760: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3761: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3762: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3763: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3764: * <code>'\uFF5A'</code> (full width variants). 3765: * 3766: * <p>If the character lacks a numeric value property, -1 is returned. 
3767: * If the character has a numeric value property which is not representable 3768: * as a nonnegative integer, such as a fraction, -2 is returned. 3769: * 3770: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3771: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3772: * 3773: * @param codePoint character from which the numeric value property will 3774: * be retrieved 3775: * @return the numeric value property of ch, or -1 if it does not exist, or 3776: * -2 if it is not representable as a nonnegative integer 3777: * @see #forDigit(int, int) 3778: * @see #digit(char, int) 3779: * @see #isDigit(char) 3780: * @since 1.5 3781: */ 3782: public static int getNumericValue(int codePoint) 3783: { 3784: // If the code point is unassigned or in one of the private use areas 3785: // then we delegate the call to the appropriate private static inner class. 3786: int plane = codePoint >>> 16; 3787: if (plane > 2 && plane < 14) 3788: return UnassignedCharacters.getNumericValue(codePoint); 3789: if (plane > 14) 3790: return PrivateUseCharacters.getNumericValue(codePoint); 3791: 3792: // If the value N found in numValue[plane] is less than or equal to -3 3793: // then the numeric value was too big to fit into 16 bits and is 3794: // stored in CharData.LARGENUMS at offset (-N - 3). 3795: short num = (short)numValue[plane][readCodePoint(codePoint) >> 7]; 3796: if (num <= -3) 3797: return CharData.LARGENUMS[-num - 3]; 3798: return num; 3799: } 3800: 3801: /** 3802: * Determines if a character is a ISO-LATIN-1 space. This is only the five 3803: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 3804: * <code>'\r'</code>, and <code>' '</code>. 
3805: * <br> 3806: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 3807: * 3808: * @param ch character to test 3809: * @return true if ch is a space, else false 3810: * @deprecated Replaced by {@link #isWhitespace(char)} 3811: * @see #isSpaceChar(char) 3812: * @see #isWhitespace(char) 3813: */ 3814: public static boolean isSpace(char ch) 3815: { 3816: // Performing the subtraction up front alleviates need to compare longs. 3817: return ch-- <= ' ' && ((1 << ch) 3818: & ((1 << (' ' - 1)) 3819: | (1 << ('\t' - 1)) 3820: | (1 << ('\n' - 1)) 3821: | (1 << ('\r' - 1)) 3822: | (1 << ('\f' - 1)))) != 0; 3823: } 3824: 3825: /** 3826: * Determines if a character is a Unicode space character. This includes 3827: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3828: * <br> 3829: * Unicode space = [Zs]|[Zp]|[Zl] 3830: * 3831: * @param ch character to test 3832: * @return true if ch is a Unicode space, else false 3833: * @see #isWhitespace(char) 3834: * @since 1.1 3835: */ 3836: public static boolean isSpaceChar(char ch) 3837: { 3838: return isSpaceChar((int)ch); 3839: } 3840: 3841: /** 3842: * Determines if a character is a Unicode space character. This includes 3843: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3844: * <br> 3845: * Unicode space = [Zs]|[Zp]|[Zl] 3846: * 3847: * @param codePoint character to test 3848: * @return true if ch is a Unicode space, else false 3849: * @see #isWhitespace(char) 3850: * @since 1.5 3851: */ 3852: public static boolean isSpaceChar(int codePoint) 3853: { 3854: return ((1 << getType(codePoint)) 3855: & ((1 << SPACE_SEPARATOR) 3856: | (1 << LINE_SEPARATOR) 3857: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3858: } 3859: 3860: /** 3861: * Determines if a character is Java whitespace. 
This includes Unicode 3862: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3863: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3864: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3865: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3866: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3867: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3868: * and <code>'\u001F'</code>. 3869: * <br> 3870: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3871: * 3872: * @param ch character to test 3873: * @return true if ch is Java whitespace, else false 3874: * @see #isSpaceChar(char) 3875: * @since 1.1 3876: */ 3877: public static boolean isWhitespace(char ch) 3878: { 3879: return isWhitespace((int) ch); 3880: } 3881: 3882: /** 3883: * Determines if a character is Java whitespace. This includes Unicode 3884: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3885: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3886: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3887: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3888: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3889: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3890: * and <code>'\u001F'</code>. 
3891: * <br> 3892: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3893: * 3894: * @param codePoint character to test 3895: * @return true if ch is Java whitespace, else false 3896: * @see #isSpaceChar(char) 3897: * @since 1.5 3898: */ 3899: public static boolean isWhitespace(int codePoint) 3900: { 3901: int plane = codePoint >>> 16; 3902: if (plane > 2 && plane < 14) 3903: return UnassignedCharacters.isWhiteSpace(codePoint); 3904: if (plane > 14) 3905: return PrivateUseCharacters.isWhiteSpace(codePoint); 3906: 3907: int attr = readCodePoint(codePoint); 3908: return ((((1 << (attr & TYPE_MASK)) 3909: & ((1 << SPACE_SEPARATOR) 3910: | (1 << LINE_SEPARATOR) 3911: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3912: && (attr & NO_BREAK_MASK) == 0) 3913: || (codePoint <= '\u001F' && ((1 << codePoint) 3914: & ((1 << '\t') 3915: | (1 << '\n') 3916: | (1 << '\u000B') 3917: | (1 << '\u000C') 3918: | (1 << '\r') 3919: | (1 << '\u001C') 3920: | (1 << '\u001D') 3921: | (1 << '\u001E') 3922: | (1 << '\u001F'))) != 0); 3923: } 3924: 3925: /** 3926: * Determines if a character has the ISO Control property. 3927: * <br> 3928: * ISO Control = [Cc] 3929: * 3930: * @param ch character to test 3931: * @return true if ch is an ISO Control character, else false 3932: * @see #isSpaceChar(char) 3933: * @see #isWhitespace(char) 3934: * @since 1.1 3935: */ 3936: public static boolean isISOControl(char ch) 3937: { 3938: return isISOControl((int)ch); 3939: } 3940: 3941: /** 3942: * Determines if the character is an ISO Control character. This is true 3943: * if the code point is in the range [0, 0x001F] or if it is in the range 3944: * [0x007F, 0x009F]. 
3945: * @param codePoint the character to check 3946: * @return true if the character is in one of the above ranges 3947: * 3948: * @since 1.5 3949: */ 3950: public static boolean isISOControl(int codePoint) 3951: { 3952: if ((codePoint >= 0 && codePoint <= 0x001F) 3953: || (codePoint >= 0x007F && codePoint <= 0x009F)) 3954: return true; 3955: return false; 3956: } 3957: 3958: /** 3959: * Returns the Unicode general category property of a character. 3960: * 3961: * @param ch character from which the general category property will 3962: * be retrieved 3963: * @return the character category property of ch as an integer 3964: * @see #UNASSIGNED 3965: * @see #UPPERCASE_LETTER 3966: * @see #LOWERCASE_LETTER 3967: * @see #TITLECASE_LETTER 3968: * @see #MODIFIER_LETTER 3969: * @see #OTHER_LETTER 3970: * @see #NON_SPACING_MARK 3971: * @see #ENCLOSING_MARK 3972: * @see #COMBINING_SPACING_MARK 3973: * @see #DECIMAL_DIGIT_NUMBER 3974: * @see #LETTER_NUMBER 3975: * @see #OTHER_NUMBER 3976: * @see #SPACE_SEPARATOR 3977: * @see #LINE_SEPARATOR 3978: * @see #PARAGRAPH_SEPARATOR 3979: * @see #CONTROL 3980: * @see #FORMAT 3981: * @see #PRIVATE_USE 3982: * @see #SURROGATE 3983: * @see #DASH_PUNCTUATION 3984: * @see #START_PUNCTUATION 3985: * @see #END_PUNCTUATION 3986: * @see #CONNECTOR_PUNCTUATION 3987: * @see #OTHER_PUNCTUATION 3988: * @see #MATH_SYMBOL 3989: * @see #CURRENCY_SYMBOL 3990: * @see #MODIFIER_SYMBOL 3991: * @see #INITIAL_QUOTE_PUNCTUATION 3992: * @see #FINAL_QUOTE_PUNCTUATION 3993: * @since 1.1 3994: */ 3995: public static int getType(char ch) 3996: { 3997: return getType((int)ch); 3998: } 3999: 4000: /** 4001: * Returns the Unicode general category property of a character. 
4002: * 4003: * @param codePoint character from which the general category property will 4004: * be retrieved 4005: * @return the character category property of ch as an integer 4006: * @see #UNASSIGNED 4007: * @see #UPPERCASE_LETTER 4008: * @see #LOWERCASE_LETTER 4009: * @see #TITLECASE_LETTER 4010: * @see #MODIFIER_LETTER 4011: * @see #OTHER_LETTER 4012: * @see #NON_SPACING_MARK 4013: * @see #ENCLOSING_MARK 4014: * @see #COMBINING_SPACING_MARK 4015: * @see #DECIMAL_DIGIT_NUMBER 4016: * @see #LETTER_NUMBER 4017: * @see #OTHER_NUMBER 4018: * @see #SPACE_SEPARATOR 4019: * @see #LINE_SEPARATOR 4020: * @see #PARAGRAPH_SEPARATOR 4021: * @see #CONTROL 4022: * @see #FORMAT 4023: * @see #PRIVATE_USE 4024: * @see #SURROGATE 4025: * @see #DASH_PUNCTUATION 4026: * @see #START_PUNCTUATION 4027: * @see #END_PUNCTUATION 4028: * @see #CONNECTOR_PUNCTUATION 4029: * @see #OTHER_PUNCTUATION 4030: * @see #MATH_SYMBOL 4031: * @see #CURRENCY_SYMBOL 4032: * @see #MODIFIER_SYMBOL 4033: * @see #INITIAL_QUOTE_PUNCTUATION 4034: * @see #FINAL_QUOTE_PUNCTUATION 4035: * 4036: * @since 1.5 4037: */ 4038: public static int getType(int codePoint) 4039: { 4040: // If the codePoint is unassigned or in one of the private use areas 4041: // then we delegate the call to the appropriate private static inner class. 4042: int plane = codePoint >>> 16; 4043: if (plane > 2 && plane < 14) 4044: return UnassignedCharacters.getType(codePoint); 4045: if (plane > 14) 4046: return PrivateUseCharacters.getType(codePoint); 4047: 4048: return readCodePoint(codePoint) & TYPE_MASK; 4049: } 4050: 4051: /** 4052: * Converts a digit into a character which represents that digit 4053: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 4054: * or the digit exceeds the radix, then the null character <code>'\0'</code> 4055: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 
4056: * <br> 4057: * return value boundary = U+0030-U+0039|U+0061-U+007A 4058: * 4059: * @param digit digit to be converted into a character 4060: * @param radix radix of digit 4061: * @return character representing digit in radix, or '\0' 4062: * @see #MIN_RADIX 4063: * @see #MAX_RADIX 4064: * @see #digit(char, int) 4065: */ 4066: public static char forDigit(int digit, int radix) 4067: { 4068: if (radix < MIN_RADIX || radix > MAX_RADIX 4069: || digit < 0 || digit >= radix) 4070: return '\0'; 4071: return Number.digits[digit]; 4072: } 4073: 4074: /** 4075: * Returns the Unicode directionality property of the character. This 4076: * is used in the visual ordering of text. 4077: * 4078: * @param ch the character to look up 4079: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4080: * @see #DIRECTIONALITY_UNDEFINED 4081: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4082: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4083: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4084: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4085: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4086: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4087: * @see #DIRECTIONALITY_ARABIC_NUMBER 4088: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4089: * @see #DIRECTIONALITY_NONSPACING_MARK 4090: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4091: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4092: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4093: * @see #DIRECTIONALITY_WHITESPACE 4094: * @see #DIRECTIONALITY_OTHER_NEUTRALS 4095: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4096: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4097: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4098: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4099: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4100: * @since 1.4 4101: */ 4102: public static byte getDirectionality(char ch) 4103: { 4104: // The result will correctly be signed. 
4105: return getDirectionality((int)ch); 4106: } 4107: 4108: 4109: /** 4110: * Returns the Unicode directionality property of the character. This 4111: * is used in the visual ordering of text. 4112: * 4113: * @param codePoint the character to look up 4114: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4115: * @see #DIRECTIONALITY_UNDEFINED 4116: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4117: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4118: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4119: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4120: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4121: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4122: * @see #DIRECTIONALITY_ARABIC_NUMBER 4123: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4124: * @see #DIRECTIONALITY_NONSPACING_MARK 4125: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4126: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4127: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4128: * @see #DIRECTIONALITY_WHITESPACE 4129: * @see #DIRECTIONALITY_OTHER_NEUTRALS 4130: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4131: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4132: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4133: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4134: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4135: * @since 1.5 4136: */ 4137: public static byte getDirectionality(int codePoint) 4138: { 4139: // If the code point is unassigned or in one of the private use areas 4140: // then we delegate the call to the appropriate private static inner class. 4141: int plane = codePoint >>> 16; 4142: if (plane > 2 && plane < 14) 4143: return UnassignedCharacters.getDirectionality(codePoint); 4144: if (plane > 14) 4145: return PrivateUseCharacters.getDirectionality(codePoint); 4146: 4147: // The result will correctly be signed. 4148: return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2); 4149: } 4150: 4151: /** 4152: * Determines whether the character is mirrored according to Unicode. 
For 4153: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4154: * left-to-right text, but ')' in right-to-left text. 4155: * 4156: * @param ch the character to look up 4157: * @return true if the character is mirrored 4158: * @since 1.4 4159: */ 4160: public static boolean isMirrored(char ch) 4161: { 4162: return (readCodePoint((int)ch) & MIRROR_MASK) != 0; 4163: } 4164: 4165: /** 4166: * Determines whether the character is mirrored according to Unicode. For 4167: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4168: * left-to-right text, but ')' in right-to-left text. 4169: * 4170: * @param codePoint the character to look up 4171: * @return true if the character is mirrored 4172: * @since 1.5 4173: */ 4174: public static boolean isMirrored(int codePoint) 4175: { 4176: // If the code point is unassigned or part of one of the private use areas 4177: // then we delegate the call to the appropriate private static inner class. 4178: int plane = codePoint >>> 16; 4179: if (plane > 2 && plane < 14) 4180: return UnassignedCharacters.isMirrored(codePoint); 4181: if (plane > 14) 4182: return PrivateUseCharacters.isMirrored(codePoint); 4183: 4184: return (readCodePoint(codePoint) & MIRROR_MASK) != 0; 4185: } 4186: 4187: /** 4188: * Compares another Character to this Character, numerically. 4189: * 4190: * @param anotherCharacter Character to compare with this Character 4191: * @return a negative integer if this Character is less than 4192: * anotherCharacter, zero if this Character is equal, and 4193: * a positive integer if this Character is greater 4194: * @throws NullPointerException if anotherCharacter is null 4195: * @since 1.2 4196: */ 4197: public int compareTo(Character anotherCharacter) 4198: { 4199: return value - anotherCharacter.value; 4200: } 4201: 4202: /** 4203: * Returns an <code>Character</code> object wrapping the value. 
4204: * In contrast to the <code>Character</code> constructor, this method 4205: * will cache some values. It is used by boxing conversion. 4206: * 4207: * @param val the value to wrap 4208: * @return the <code>Character</code> 4209: * 4210: * @since 1.5 4211: */ 4212: public static Character valueOf(char val) 4213: { 4214: if (val > MAX_CACHE) 4215: return new Character(val); 4216: else 4217: return charCache[val - MIN_VALUE]; 4218: } 4219: 4220: /** 4221: * Reverse the bytes in val. 4222: * @since 1.5 4223: */ 4224: public static char reverseBytes(char val) 4225: { 4226: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 4227: } 4228: 4229: /** 4230: * Converts a unicode code point to a UTF-16 representation of that 4231: * code point. 4232: * 4233: * @param codePoint the unicode code point 4234: * 4235: * @return the UTF-16 representation of that code point 4236: * 4237: * @throws IllegalArgumentException if the code point is not a valid 4238: * unicode code point 4239: * 4240: * @since 1.5 4241: */ 4242: public static char[] toChars(int codePoint) 4243: { 4244: if (!isValidCodePoint(codePoint)) 4245: throw new IllegalArgumentException("Illegal Unicode code point : " 4246: + codePoint); 4247: char[] result = new char[charCount(codePoint)]; 4248: int ignore = toChars(codePoint, result, 0); 4249: return result; 4250: } 4251: 4252: /** 4253: * Converts a unicode code point to its UTF-16 representation. 
4254: * 4255: * @param codePoint the unicode code point 4256: * @param dst the target char array 4257: * @param dstIndex the start index for the target 4258: * 4259: * @return number of characters written to <code>dst</code> 4260: * 4261: * @throws IllegalArgumentException if <code>codePoint</code> is not a 4262: * valid unicode code point 4263: * @throws NullPointerException if <code>dst</code> is <code>null</code> 4264: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 4265: * in <code>dst</code> or if the UTF-16 representation does not 4266: * fit into <code>dst</code> 4267: * 4268: * @since 1.5 4269: */ 4270: public static int toChars(int codePoint, char[] dst, int dstIndex) 4271: { 4272: if (!isValidCodePoint(codePoint)) 4273: { 4274: throw new IllegalArgumentException("not a valid code point: " 4275: + codePoint); 4276: } 4277: 4278: int result; 4279: if (isSupplementaryCodePoint(codePoint)) 4280: { 4281: // Write second char first to cause IndexOutOfBoundsException 4282: // immediately. 4283: final int cp2 = codePoint - 0x10000; 4284: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 4285: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 4286: result = 2; 4287: } 4288: else 4289: { 4290: dst[dstIndex] = (char) codePoint; 4291: result = 1; 4292: } 4293: return result; 4294: } 4295: 4296: /** 4297: * Return number of 16-bit characters required to represent the given 4298: * code point. 4299: * 4300: * @param codePoint a unicode code point 4301: * 4302: * @return 2 if codePoint >= 0x10000, 1 otherwise. 4303: * 4304: * @since 1.5 4305: */ 4306: public static int charCount(int codePoint) 4307: { 4308: return 4309: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 4310: ? 2 4311: : 1; 4312: } 4313: 4314: /** 4315: * Determines whether the specified code point is 4316: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 4317: * supplementary character range. 
4318: * 4319: * @param codePoint a Unicode code point 4320: * 4321: * @return <code>true</code> if code point is in supplementary range 4322: * 4323: * @since 1.5 4324: */ 4325: public static boolean isSupplementaryCodePoint(int codePoint) 4326: { 4327: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 4328: && codePoint <= MAX_CODE_POINT; 4329: } 4330: 4331: /** 4332: * Determines whether the specified code point is 4333: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 4334: * 4335: * @param codePoint a Unicode code point 4336: * 4337: * @return <code>true</code> if code point is valid 4338: * 4339: * @since 1.5 4340: */ 4341: public static boolean isValidCodePoint(int codePoint) 4342: { 4343: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 4344: } 4345: 4346: /** 4347: * Return true if the given character is a high surrogate. 4348: * @param ch the character 4349: * @return true if the character is a high surrogate character 4350: * 4351: * @since 1.5 4352: */ 4353: public static boolean isHighSurrogate(char ch) 4354: { 4355: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 4356: } 4357: 4358: /** 4359: * Return true if the given character is a low surrogate. 4360: * @param ch the character 4361: * @return true if the character is a low surrogate character 4362: * 4363: * @since 1.5 4364: */ 4365: public static boolean isLowSurrogate(char ch) 4366: { 4367: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 4368: } 4369: 4370: /** 4371: * Return true if the given characters compose a surrogate pair. 4372: * This is true if the first character is a high surrogate and the 4373: * second character is a low surrogate. 
4374: * @param ch1 the first character 4375: * @param ch2 the first character 4376: * @return true if the characters compose a surrogate pair 4377: * 4378: * @since 1.5 4379: */ 4380: public static boolean isSurrogatePair(char ch1, char ch2) 4381: { 4382: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 4383: } 4384: 4385: /** 4386: * Given a valid surrogate pair, this returns the corresponding 4387: * code point. 4388: * @param high the high character of the pair 4389: * @param low the low character of the pair 4390: * @return the corresponding code point 4391: * 4392: * @since 1.5 4393: */ 4394: public static int toCodePoint(char high, char low) 4395: { 4396: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 4397: (low - MIN_LOW_SURROGATE) + 0x10000; 4398: } 4399: 4400: /** 4401: * Get the code point at the specified index in the CharSequence. 4402: * This is like CharSequence#charAt(int), but if the character is 4403: * the start of a surrogate pair, and there is a following 4404: * character, and this character completes the pair, then the 4405: * corresponding supplementary code point is returned. Otherwise, 4406: * the character at the index is returned. 4407: * 4408: * @param sequence the CharSequence 4409: * @param index the index of the codepoint to get, starting at 0 4410: * @return the codepoint at the specified index 4411: * @throws IndexOutOfBoundsException if index is negative or >= length() 4412: * @since 1.5 4413: */ 4414: public static int codePointAt(CharSequence sequence, int index) 4415: { 4416: int len = sequence.length(); 4417: if (index < 0 || index >= len) 4418: throw new IndexOutOfBoundsException(); 4419: char high = sequence.charAt(index); 4420: if (! isHighSurrogate(high) || ++index >= len) 4421: return high; 4422: char low = sequence.charAt(index); 4423: if (! isLowSurrogate(low)) 4424: return high; 4425: return toCodePoint(high, low); 4426: } 4427: 4428: /** 4429: * Get the code point at the specified index in the CharSequence. 
4430: * If the character is the start of a surrogate pair, and there is a 4431: * following character, and this character completes the pair, then 4432: * the corresponding supplementary code point is returned. 4433: * Otherwise, the character at the index is returned. 4434: * 4435: * @param chars the character array in which to look 4436: * @param index the index of the codepoint to get, starting at 0 4437: * @return the codepoint at the specified index 4438: * @throws IndexOutOfBoundsException if index is negative or >= length() 4439: * @since 1.5 4440: */ 4441: public static int codePointAt(char[] chars, int index) 4442: { 4443: return codePointAt(chars, index, chars.length); 4444: } 4445: 4446: /** 4447: * Get the code point at the specified index in the CharSequence. 4448: * If the character is the start of a surrogate pair, and there is a 4449: * following character within the specified range, and this 4450: * character completes the pair, then the corresponding 4451: * supplementary code point is returned. Otherwise, the character 4452: * at the index is returned. 4453: * 4454: * @param chars the character array in which to look 4455: * @param index the index of the codepoint to get, starting at 0 4456: * @param limit the limit past which characters should not be examined 4457: * @return the codepoint at the specified index 4458: * @throws IndexOutOfBoundsException if index is negative or >= 4459: * limit, or if limit is negative or >= the length of the array 4460: * @since 1.5 4461: */ 4462: public static int codePointAt(char[] chars, int index, int limit) 4463: { 4464: if (index < 0 || index >= limit || limit < 0 || limit > chars.length) 4465: throw new IndexOutOfBoundsException(); 4466: char high = chars[index]; 4467: if (! isHighSurrogate(high) || ++index >= limit) 4468: return high; 4469: char low = chars[index]; 4470: if (! 
isLowSurrogate(low)) 4471: return high; 4472: return toCodePoint(high, low); 4473: } 4474: 4475: /** 4476: * Get the code point before the specified index. This is like 4477: * #codePointAt(char[], int), but checks the characters at 4478: * <code>index-1</code> and <code>index-2</code> to see if they form 4479: * a supplementary code point. If they do not, the character at 4480: * <code>index-1</code> is returned. 4481: * 4482: * @param chars the character array 4483: * @param index the index just past the codepoint to get, starting at 0 4484: * @return the codepoint at the specified index 4485: * @throws IndexOutOfBoundsException if index is negative or >= length() 4486: * @since 1.5 4487: */ 4488: public static int codePointBefore(char[] chars, int index) 4489: { 4490: return codePointBefore(chars, index, 1); 4491: } 4492: 4493: /** 4494: * Get the code point before the specified index. This is like 4495: * #codePointAt(char[], int), but checks the characters at 4496: * <code>index-1</code> and <code>index-2</code> to see if they form 4497: * a supplementary code point. If they do not, the character at 4498: * <code>index-1</code> is returned. The start parameter is used to 4499: * limit the range of the array which may be examined. 4500: * 4501: * @param chars the character array 4502: * @param index the index just past the codepoint to get, starting at 0 4503: * @param start the index before which characters should not be examined 4504: * @return the codepoint at the specified index 4505: * @throws IndexOutOfBoundsException if index is > start or > 4506: * the length of the array, or if limit is negative or >= the 4507: * length of the array 4508: * @since 1.5 4509: */ 4510: public static int codePointBefore(char[] chars, int index, int start) 4511: { 4512: if (index < start || index > chars.length 4513: || start < 0 || start >= chars.length) 4514: throw new IndexOutOfBoundsException(); 4515: --index; 4516: char low = chars[index]; 4517: if (! 
isLowSurrogate(low) || --index < start) 4518: return low; 4519: char high = chars[index]; 4520: if (! isHighSurrogate(high)) 4521: return low; 4522: return toCodePoint(high, low); 4523: } 4524: 4525: /** 4526: * Get the code point before the specified index. This is like 4527: * #codePointAt(CharSequence, int), but checks the characters at 4528: * <code>index-1</code> and <code>index-2</code> to see if they form 4529: * a supplementary code point. If they do not, the character at 4530: * <code>index-1</code> is returned. 4531: * 4532: * @param sequence the CharSequence 4533: * @param index the index just past the codepoint to get, starting at 0 4534: * @return the codepoint at the specified index 4535: * @throws IndexOutOfBoundsException if index is negative or >= length() 4536: * @since 1.5 4537: */ 4538: public static int codePointBefore(CharSequence sequence, int index) 4539: { 4540: int len = sequence.length(); 4541: if (index < 1 || index > len) 4542: throw new IndexOutOfBoundsException(); 4543: --index; 4544: char low = sequence.charAt(index); 4545: if (! isLowSurrogate(low) || --index < 0) 4546: return low; 4547: char high = sequence.charAt(index); 4548: if (! isHighSurrogate(high)) 4549: return low; 4550: return toCodePoint(high, low); 4551: } 4552: } // class Character
GNU Classpath (0.98) |