GNU Classpath (0.92) | |
Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package java.lang; 40: 41: import gnu.java.lang.CharData; 42: 43: import java.io.Serializable; 44: import java.text.Collator; 45: import java.util.Locale; 46: 47: /** 48: * Wrapper class for the primitive char data type. In addition, this class 49: * allows one to retrieve property information and perform transformations 50: * on the defined characters in the Unicode Standard, Version 4.0.0. 51: * java.lang.Character is designed to be very dynamic, and as such, it 52: * retrieves information on the Unicode character set from a separate 53: * database, gnu.java.lang.CharData, which can be easily upgraded. 54: * 55: * <p>For predicates, boundaries are used to describe 56: * the set of characters for which the method will return true. 57: * This syntax uses fairly normal regular expression notation. 58: * See 5.13 of the Unicode Standard, Version 4.0, for the 59: * boundary specification. 60: * 61: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 62: * for more information on the Unicode Standard. 63: * 64: * @author Tom Tromey (tromey@cygnus.com) 65: * @author Paul N. Fisher 66: * @author Jochen Hoenicke 67: * @author Eric Blake (ebb9@email.byu.edu) 68: * @see CharData 69: * @since 1.0 70: * @status updated to 1.4 71: */ 72: public final class Character implements Serializable, Comparable 73: { 74: /** 75: * A subset of Unicode blocks. 76: * 77: * @author Paul N. Fisher 78: * @author Eric Blake (ebb9@email.byu.edu) 79: * @since 1.2 80: */ 81: public static class Subset 82: { 83: /** The name of the subset. */ 84: private final String name; 85: 86: /** 87: * Construct a new subset of characters. 88: * 89: * @param name the name of the subset 90: * @throws NullPointerException if name is null 91: */ 92: protected Subset(String name) 93: { 94: // Note that name.toString() is name, unless name was null. 95: this.name = name.toString(); 96: } 97: 98: /** 99: * Compares two Subsets for equality. 
This is <code>final</code>, and 100: * restricts the comparison on the <code>==</code> operator, so it returns 101: * true only for the same object. 102: * 103: * @param o the object to compare 104: * @return true if o is this 105: */ 106: public final boolean equals(Object o) 107: { 108: return o == this; 109: } 110: 111: /** 112: * Makes the original hashCode of Object final, to be consistent with 113: * equals. 114: * 115: * @return the hash code for this object 116: */ 117: public final int hashCode() 118: { 119: return super.hashCode(); 120: } 121: 122: /** 123: * Returns the name of the subset. 124: * 125: * @return the name 126: */ 127: public final String toString() 128: { 129: return name; 130: } 131: } // class Subset 132: 133: /** 134: * A family of character subsets in the Unicode specification. A character 135: * is in at most one of these blocks. 136: * 137: * This inner class was generated automatically from 138: * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts. 139: * This Unicode definition file can be found on the 140: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 141: * JDK 1.5 uses Unicode version 4.0.0. 142: * 143: * @author scripts/unicode-blocks.pl (written by Eric Blake) 144: * @since 1.2 145: */ 146: public static final class UnicodeBlock extends Subset 147: { 148: /** The start of the subset. */ 149: private final int start; 150: 151: /** The end of the subset. */ 152: private final int end; 153: 154: /** The canonical name of the block according to the Unicode standard. */ 155: private final String canonicalName; 156: 157: /** Constants for the <code>forName()</code> method */ 158: private static final int CANONICAL_NAME = 0; 159: private static final int NO_SPACES_NAME = 1; 160: private static final int CONSTANT_NAME = 2; 161: 162: /** 163: * Constructor for strictly defined blocks. 
164: * 165: * @param start the start character of the range 166: * @param end the end character of the range 167: * @param name the block name 168: * @param canonicalName the name of the block as defined in the Unicode 169: * standard. 170: */ 171: private UnicodeBlock(int start, int end, String name, 172: String canonicalName) 173: { 174: super(name); 175: this.start = start; 176: this.end = end; 177: this.canonicalName = canonicalName; 178: } 179: 180: /** 181: * Returns the Unicode character block which a character belongs to. 182: * <strong>Note</strong>: This method does not support the use of 183: * supplementary characters. For such support, <code>of(int)</code> 184: * should be used instead. 185: * 186: * @param ch the character to look up 187: * @return the set it belongs to, or null if it is not in one 188: */ 189: public static UnicodeBlock of(char ch) 190: { 191: return of((int) ch); 192: } 193: 194: /** 195: * Returns the Unicode character block which a code point belongs to. 196: * 197: * @param codePoint the character to look up 198: * @return the set it belongs to, or null if it is not in one. 199: * @throws IllegalArgumentException if the specified code point is 200: * invalid. 201: * @since 1.5 202: */ 203: public static UnicodeBlock of(int codePoint) 204: { 205: if (codePoint > MAX_CODE_POINT) 206: throw new IllegalArgumentException("The supplied integer value is " + 207: "too large to be a codepoint."); 208: // Simple binary search for the correct block. 209: int low = 0; 210: int hi = sets.length - 1; 211: while (low <= hi) 212: { 213: int mid = (low + hi) >> 1; 214: UnicodeBlock b = sets[mid]; 215: if (codePoint < b.start) 216: hi = mid - 1; 217: else if (codePoint > b.end) 218: low = mid + 1; 219: else 220: return b; 221: } 222: return null; 223: } 224: 225: /** 226: * <p> 227: * Returns the <code>UnicodeBlock</code> with the given name, as defined 228: * by the Unicode standard. 
The version of Unicode in use is defined by 229: * the <code>Character</code> class, and the names are given in the 230: * <code>Blocks-&lt;version&gt;.txt</code> file corresponding to that version. 231: * The name may be specified in one of three ways: 232: * </p> 233: * <ol> 234: * <li>The canonical, human-readable name used by the Unicode standard. 235: * This is the name with all spaces and hyphens retained. For example, 236: * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 237: * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 238: * <li>The name used for the constants specified by this class, which 239: * is the canonical name with all spaces and hyphens replaced with 240: * underscores e.g. `BASIC_LATIN'</li> 241: * </ol> 242: * <p> 243: * The names are compared case-insensitively using the case comparison 244: * associated with the U.S. English locale. The method recognises the 245: * previous names used for blocks as well as the current ones. At 246: * present, this simply means that the deprecated `SURROGATES_AREA' 247: * will be recognised by this method (the <code>of()</code> methods 248: * only return one of the three new surrogate blocks). 249: * </p> 250: * 251: * @param blockName the name of the block to look up. 252: * @return the specified block. 253: * @throws NullPointerException if the <code>blockName</code> is 254: * <code>null</code>. 255: * @throws IllegalArgumentException if the name does not match any Unicode 256: * block. 
257: * @since 1.5 258: */ 259: public static final UnicodeBlock forName(String blockName) 260: { 261: int type; 262: if (blockName.indexOf(' ') != -1) 263: type = CANONICAL_NAME; 264: else if (blockName.indexOf('_') != -1) 265: type = CONSTANT_NAME; 266: else 267: type = NO_SPACES_NAME; 268: Collator usCollator = Collator.getInstance(Locale.US); 269: usCollator.setStrength(Collator.PRIMARY); 270: /* Special case for deprecated blocks not in sets */ 271: switch (type) 272: { 273: case CANONICAL_NAME: 274: if (usCollator.compare(blockName, "Surrogates Area") == 0) 275: return SURROGATES_AREA; 276: break; 277: case NO_SPACES_NAME: 278: if (usCollator.compare(blockName, "SurrogatesArea") == 0) 279: return SURROGATES_AREA; 280: break; 281: case CONSTANT_NAME: 282: if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 283: return SURROGATES_AREA; 284: break; 285: } 286: /* Other cases */ 287: int setLength = sets.length; 288: switch (type) 289: { 290: case CANONICAL_NAME: 291: for (int i = 0; i < setLength; i++) 292: { 293: UnicodeBlock block = sets[i]; 294: if (usCollator.compare(blockName, block.canonicalName) == 0) 295: return block; 296: } 297: break; 298: case NO_SPACES_NAME: 299: for (int i = 0; i < setLength; i++) 300: { 301: UnicodeBlock block = sets[i]; 302: String nsName = block.canonicalName.replaceAll(" ",""); 303: if (usCollator.compare(blockName, nsName) == 0) 304: return block; 305: } 306: break; 307: case CONSTANT_NAME: 308: for (int i = 0; i < setLength; i++) 309: { 310: UnicodeBlock block = sets[i]; 311: if (usCollator.compare(blockName, block.toString()) == 0) 312: return block; 313: } 314: break; 315: } 316: throw new IllegalArgumentException("No Unicode block found for " + 317: blockName + "."); 318: } 319: 320: /** 321: * Basic Latin. 322: * 0x0000 - 0x007F. 323: */ 324: public static final UnicodeBlock BASIC_LATIN 325: = new UnicodeBlock(0x0000, 0x007F, 326: "BASIC_LATIN", 327: "Basic Latin"); 328: 329: /** 330: * Latin-1 Supplement. 
331: * 0x0080 - 0x00FF. 332: */ 333: public static final UnicodeBlock LATIN_1_SUPPLEMENT 334: = new UnicodeBlock(0x0080, 0x00FF, 335: "LATIN_1_SUPPLEMENT", 336: "Latin-1 Supplement"); 337: 338: /** 339: * Latin Extended-A. 340: * 0x0100 - 0x017F. 341: */ 342: public static final UnicodeBlock LATIN_EXTENDED_A 343: = new UnicodeBlock(0x0100, 0x017F, 344: "LATIN_EXTENDED_A", 345: "Latin Extended-A"); 346: 347: /** 348: * Latin Extended-B. 349: * 0x0180 - 0x024F. 350: */ 351: public static final UnicodeBlock LATIN_EXTENDED_B 352: = new UnicodeBlock(0x0180, 0x024F, 353: "LATIN_EXTENDED_B", 354: "Latin Extended-B"); 355: 356: /** 357: * IPA Extensions. 358: * 0x0250 - 0x02AF. 359: */ 360: public static final UnicodeBlock IPA_EXTENSIONS 361: = new UnicodeBlock(0x0250, 0x02AF, 362: "IPA_EXTENSIONS", 363: "IPA Extensions"); 364: 365: /** 366: * Spacing Modifier Letters. 367: * 0x02B0 - 0x02FF. 368: */ 369: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 370: = new UnicodeBlock(0x02B0, 0x02FF, 371: "SPACING_MODIFIER_LETTERS", 372: "Spacing Modifier Letters"); 373: 374: /** 375: * Combining Diacritical Marks. 376: * 0x0300 - 0x036F. 377: */ 378: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 379: = new UnicodeBlock(0x0300, 0x036F, 380: "COMBINING_DIACRITICAL_MARKS", 381: "Combining Diacritical Marks"); 382: 383: /** 384: * Greek. 385: * 0x0370 - 0x03FF. 386: */ 387: public static final UnicodeBlock GREEK 388: = new UnicodeBlock(0x0370, 0x03FF, 389: "GREEK", 390: "Greek"); 391: 392: /** 393: * Cyrillic. 394: * 0x0400 - 0x04FF. 395: */ 396: public static final UnicodeBlock CYRILLIC 397: = new UnicodeBlock(0x0400, 0x04FF, 398: "CYRILLIC", 399: "Cyrillic"); 400: 401: /** 402: * Cyrillic Supplementary. 403: * 0x0500 - 0x052F. 404: * @since 1.5 405: */ 406: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 407: = new UnicodeBlock(0x0500, 0x052F, 408: "CYRILLIC_SUPPLEMENTARY", 409: "Cyrillic Supplementary"); 410: 411: /** 412: * Armenian. 
413: * 0x0530 - 0x058F. 414: */ 415: public static final UnicodeBlock ARMENIAN 416: = new UnicodeBlock(0x0530, 0x058F, 417: "ARMENIAN", 418: "Armenian"); 419: 420: /** 421: * Hebrew. 422: * 0x0590 - 0x05FF. 423: */ 424: public static final UnicodeBlock HEBREW 425: = new UnicodeBlock(0x0590, 0x05FF, 426: "HEBREW", 427: "Hebrew"); 428: 429: /** 430: * Arabic. 431: * 0x0600 - 0x06FF. 432: */ 433: public static final UnicodeBlock ARABIC 434: = new UnicodeBlock(0x0600, 0x06FF, 435: "ARABIC", 436: "Arabic"); 437: 438: /** 439: * Syriac. 440: * 0x0700 - 0x074F. 441: * @since 1.4 442: */ 443: public static final UnicodeBlock SYRIAC 444: = new UnicodeBlock(0x0700, 0x074F, 445: "SYRIAC", 446: "Syriac"); 447: 448: /** 449: * Thaana. 450: * 0x0780 - 0x07BF. 451: * @since 1.4 452: */ 453: public static final UnicodeBlock THAANA 454: = new UnicodeBlock(0x0780, 0x07BF, 455: "THAANA", 456: "Thaana"); 457: 458: /** 459: * Devanagari. 460: * 0x0900 - 0x097F. 461: */ 462: public static final UnicodeBlock DEVANAGARI 463: = new UnicodeBlock(0x0900, 0x097F, 464: "DEVANAGARI", 465: "Devanagari"); 466: 467: /** 468: * Bengali. 469: * 0x0980 - 0x09FF. 470: */ 471: public static final UnicodeBlock BENGALI 472: = new UnicodeBlock(0x0980, 0x09FF, 473: "BENGALI", 474: "Bengali"); 475: 476: /** 477: * Gurmukhi. 478: * 0x0A00 - 0x0A7F. 479: */ 480: public static final UnicodeBlock GURMUKHI 481: = new UnicodeBlock(0x0A00, 0x0A7F, 482: "GURMUKHI", 483: "Gurmukhi"); 484: 485: /** 486: * Gujarati. 487: * 0x0A80 - 0x0AFF. 488: */ 489: public static final UnicodeBlock GUJARATI 490: = new UnicodeBlock(0x0A80, 0x0AFF, 491: "GUJARATI", 492: "Gujarati"); 493: 494: /** 495: * Oriya. 496: * 0x0B00 - 0x0B7F. 497: */ 498: public static final UnicodeBlock ORIYA 499: = new UnicodeBlock(0x0B00, 0x0B7F, 500: "ORIYA", 501: "Oriya"); 502: 503: /** 504: * Tamil. 505: * 0x0B80 - 0x0BFF. 
506: */ 507: public static final UnicodeBlock TAMIL 508: = new UnicodeBlock(0x0B80, 0x0BFF, 509: "TAMIL", 510: "Tamil"); 511: 512: /** 513: * Telugu. 514: * 0x0C00 - 0x0C7F. 515: */ 516: public static final UnicodeBlock TELUGU 517: = new UnicodeBlock(0x0C00, 0x0C7F, 518: "TELUGU", 519: "Telugu"); 520: 521: /** 522: * Kannada. 523: * 0x0C80 - 0x0CFF. 524: */ 525: public static final UnicodeBlock KANNADA 526: = new UnicodeBlock(0x0C80, 0x0CFF, 527: "KANNADA", 528: "Kannada"); 529: 530: /** 531: * Malayalam. 532: * 0x0D00 - 0x0D7F. 533: */ 534: public static final UnicodeBlock MALAYALAM 535: = new UnicodeBlock(0x0D00, 0x0D7F, 536: "MALAYALAM", 537: "Malayalam"); 538: 539: /** 540: * Sinhala. 541: * 0x0D80 - 0x0DFF. 542: * @since 1.4 543: */ 544: public static final UnicodeBlock SINHALA 545: = new UnicodeBlock(0x0D80, 0x0DFF, 546: "SINHALA", 547: "Sinhala"); 548: 549: /** 550: * Thai. 551: * 0x0E00 - 0x0E7F. 552: */ 553: public static final UnicodeBlock THAI 554: = new UnicodeBlock(0x0E00, 0x0E7F, 555: "THAI", 556: "Thai"); 557: 558: /** 559: * Lao. 560: * 0x0E80 - 0x0EFF. 561: */ 562: public static final UnicodeBlock LAO 563: = new UnicodeBlock(0x0E80, 0x0EFF, 564: "LAO", 565: "Lao"); 566: 567: /** 568: * Tibetan. 569: * 0x0F00 - 0x0FFF. 570: */ 571: public static final UnicodeBlock TIBETAN 572: = new UnicodeBlock(0x0F00, 0x0FFF, 573: "TIBETAN", 574: "Tibetan"); 575: 576: /** 577: * Myanmar. 578: * 0x1000 - 0x109F. 579: * @since 1.4 580: */ 581: public static final UnicodeBlock MYANMAR 582: = new UnicodeBlock(0x1000, 0x109F, 583: "MYANMAR", 584: "Myanmar"); 585: 586: /** 587: * Georgian. 588: * 0x10A0 - 0x10FF. 589: */ 590: public static final UnicodeBlock GEORGIAN 591: = new UnicodeBlock(0x10A0, 0x10FF, 592: "GEORGIAN", 593: "Georgian"); 594: 595: /** 596: * Hangul Jamo. 597: * 0x1100 - 0x11FF. 598: */ 599: public static final UnicodeBlock HANGUL_JAMO 600: = new UnicodeBlock(0x1100, 0x11FF, 601: "HANGUL_JAMO", 602: "Hangul Jamo"); 603: 604: /** 605: * Ethiopic. 
606: * 0x1200 - 0x137F. 607: * @since 1.4 608: */ 609: public static final UnicodeBlock ETHIOPIC 610: = new UnicodeBlock(0x1200, 0x137F, 611: "ETHIOPIC", 612: "Ethiopic"); 613: 614: /** 615: * Cherokee. 616: * 0x13A0 - 0x13FF. 617: * @since 1.4 618: */ 619: public static final UnicodeBlock CHEROKEE 620: = new UnicodeBlock(0x13A0, 0x13FF, 621: "CHEROKEE", 622: "Cherokee"); 623: 624: /** 625: * Unified Canadian Aboriginal Syllabics. 626: * 0x1400 - 0x167F. 627: * @since 1.4 628: */ 629: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 630: = new UnicodeBlock(0x1400, 0x167F, 631: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 632: "Unified Canadian Aboriginal Syllabics"); 633: 634: /** 635: * Ogham. 636: * 0x1680 - 0x169F. 637: * @since 1.4 638: */ 639: public static final UnicodeBlock OGHAM 640: = new UnicodeBlock(0x1680, 0x169F, 641: "OGHAM", 642: "Ogham"); 643: 644: /** 645: * Runic. 646: * 0x16A0 - 0x16FF. 647: * @since 1.4 648: */ 649: public static final UnicodeBlock RUNIC 650: = new UnicodeBlock(0x16A0, 0x16FF, 651: "RUNIC", 652: "Runic"); 653: 654: /** 655: * Tagalog. 656: * 0x1700 - 0x171F. 657: * @since 1.5 658: */ 659: public static final UnicodeBlock TAGALOG 660: = new UnicodeBlock(0x1700, 0x171F, 661: "TAGALOG", 662: "Tagalog"); 663: 664: /** 665: * Hanunoo. 666: * 0x1720 - 0x173F. 667: * @since 1.5 668: */ 669: public static final UnicodeBlock HANUNOO 670: = new UnicodeBlock(0x1720, 0x173F, 671: "HANUNOO", 672: "Hanunoo"); 673: 674: /** 675: * Buhid. 676: * 0x1740 - 0x175F. 677: * @since 1.5 678: */ 679: public static final UnicodeBlock BUHID 680: = new UnicodeBlock(0x1740, 0x175F, 681: "BUHID", 682: "Buhid"); 683: 684: /** 685: * Tagbanwa. 686: * 0x1760 - 0x177F. 687: * @since 1.5 688: */ 689: public static final UnicodeBlock TAGBANWA 690: = new UnicodeBlock(0x1760, 0x177F, 691: "TAGBANWA", 692: "Tagbanwa"); 693: 694: /** 695: * Khmer. 696: * 0x1780 - 0x17FF. 
697: * @since 1.4 698: */ 699: public static final UnicodeBlock KHMER 700: = new UnicodeBlock(0x1780, 0x17FF, 701: "KHMER", 702: "Khmer"); 703: 704: /** 705: * Mongolian. 706: * 0x1800 - 0x18AF. 707: * @since 1.4 708: */ 709: public static final UnicodeBlock MONGOLIAN 710: = new UnicodeBlock(0x1800, 0x18AF, 711: "MONGOLIAN", 712: "Mongolian"); 713: 714: /** 715: * Limbu. 716: * 0x1900 - 0x194F. 717: * @since 1.5 718: */ 719: public static final UnicodeBlock LIMBU 720: = new UnicodeBlock(0x1900, 0x194F, 721: "LIMBU", 722: "Limbu"); 723: 724: /** 725: * Tai Le. 726: * 0x1950 - 0x197F. 727: * @since 1.5 728: */ 729: public static final UnicodeBlock TAI_LE 730: = new UnicodeBlock(0x1950, 0x197F, 731: "TAI_LE", 732: "Tai Le"); 733: 734: /** 735: * Khmer Symbols. 736: * 0x19E0 - 0x19FF. 737: * @since 1.5 738: */ 739: public static final UnicodeBlock KHMER_SYMBOLS 740: = new UnicodeBlock(0x19E0, 0x19FF, 741: "KHMER_SYMBOLS", 742: "Khmer Symbols"); 743: 744: /** 745: * Phonetic Extensions. 746: * 0x1D00 - 0x1D7F. 747: * @since 1.5 748: */ 749: public static final UnicodeBlock PHONETIC_EXTENSIONS 750: = new UnicodeBlock(0x1D00, 0x1D7F, 751: "PHONETIC_EXTENSIONS", 752: "Phonetic Extensions"); 753: 754: /** 755: * Latin Extended Additional. 756: * 0x1E00 - 0x1EFF. 757: */ 758: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 759: = new UnicodeBlock(0x1E00, 0x1EFF, 760: "LATIN_EXTENDED_ADDITIONAL", 761: "Latin Extended Additional"); 762: 763: /** 764: * Greek Extended. 765: * 0x1F00 - 0x1FFF. 766: */ 767: public static final UnicodeBlock GREEK_EXTENDED 768: = new UnicodeBlock(0x1F00, 0x1FFF, 769: "GREEK_EXTENDED", 770: "Greek Extended"); 771: 772: /** 773: * General Punctuation. 774: * 0x2000 - 0x206F. 775: */ 776: public static final UnicodeBlock GENERAL_PUNCTUATION 777: = new UnicodeBlock(0x2000, 0x206F, 778: "GENERAL_PUNCTUATION", 779: "General Punctuation"); 780: 781: /** 782: * Superscripts and Subscripts. 783: * 0x2070 - 0x209F. 
784: */ 785: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 786: = new UnicodeBlock(0x2070, 0x209F, 787: "SUPERSCRIPTS_AND_SUBSCRIPTS", 788: "Superscripts and Subscripts"); 789: 790: /** 791: * Currency Symbols. 792: * 0x20A0 - 0x20CF. 793: */ 794: public static final UnicodeBlock CURRENCY_SYMBOLS 795: = new UnicodeBlock(0x20A0, 0x20CF, 796: "CURRENCY_SYMBOLS", 797: "Currency Symbols"); 798: 799: /** 800: * Combining Marks for Symbols. 801: * 0x20D0 - 0x20FF. 802: */ 803: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 804: = new UnicodeBlock(0x20D0, 0x20FF, 805: "COMBINING_MARKS_FOR_SYMBOLS", 806: "Combining Marks for Symbols"); 807: 808: /** 809: * Letterlike Symbols. 810: * 0x2100 - 0x214F. 811: */ 812: public static final UnicodeBlock LETTERLIKE_SYMBOLS 813: = new UnicodeBlock(0x2100, 0x214F, 814: "LETTERLIKE_SYMBOLS", 815: "Letterlike Symbols"); 816: 817: /** 818: * Number Forms. 819: * 0x2150 - 0x218F. 820: */ 821: public static final UnicodeBlock NUMBER_FORMS 822: = new UnicodeBlock(0x2150, 0x218F, 823: "NUMBER_FORMS", 824: "Number Forms"); 825: 826: /** 827: * Arrows. 828: * 0x2190 - 0x21FF. 829: */ 830: public static final UnicodeBlock ARROWS 831: = new UnicodeBlock(0x2190, 0x21FF, 832: "ARROWS", 833: "Arrows"); 834: 835: /** 836: * Mathematical Operators. 837: * 0x2200 - 0x22FF. 838: */ 839: public static final UnicodeBlock MATHEMATICAL_OPERATORS 840: = new UnicodeBlock(0x2200, 0x22FF, 841: "MATHEMATICAL_OPERATORS", 842: "Mathematical Operators"); 843: 844: /** 845: * Miscellaneous Technical. 846: * 0x2300 - 0x23FF. 847: */ 848: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 849: = new UnicodeBlock(0x2300, 0x23FF, 850: "MISCELLANEOUS_TECHNICAL", 851: "Miscellaneous Technical"); 852: 853: /** 854: * Control Pictures. 855: * 0x2400 - 0x243F. 
856: */ 857: public static final UnicodeBlock CONTROL_PICTURES 858: = new UnicodeBlock(0x2400, 0x243F, 859: "CONTROL_PICTURES", 860: "Control Pictures"); 861: 862: /** 863: * Optical Character Recognition. 864: * 0x2440 - 0x245F. 865: */ 866: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 867: = new UnicodeBlock(0x2440, 0x245F, 868: "OPTICAL_CHARACTER_RECOGNITION", 869: "Optical Character Recognition"); 870: 871: /** 872: * Enclosed Alphanumerics. 873: * 0x2460 - 0x24FF. 874: */ 875: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 876: = new UnicodeBlock(0x2460, 0x24FF, 877: "ENCLOSED_ALPHANUMERICS", 878: "Enclosed Alphanumerics"); 879: 880: /** 881: * Box Drawing. 882: * 0x2500 - 0x257F. 883: */ 884: public static final UnicodeBlock BOX_DRAWING 885: = new UnicodeBlock(0x2500, 0x257F, 886: "BOX_DRAWING", 887: "Box Drawing"); 888: 889: /** 890: * Block Elements. 891: * 0x2580 - 0x259F. 892: */ 893: public static final UnicodeBlock BLOCK_ELEMENTS 894: = new UnicodeBlock(0x2580, 0x259F, 895: "BLOCK_ELEMENTS", 896: "Block Elements"); 897: 898: /** 899: * Geometric Shapes. 900: * 0x25A0 - 0x25FF. 901: */ 902: public static final UnicodeBlock GEOMETRIC_SHAPES 903: = new UnicodeBlock(0x25A0, 0x25FF, 904: "GEOMETRIC_SHAPES", 905: "Geometric Shapes"); 906: 907: /** 908: * Miscellaneous Symbols. 909: * 0x2600 - 0x26FF. 910: */ 911: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 912: = new UnicodeBlock(0x2600, 0x26FF, 913: "MISCELLANEOUS_SYMBOLS", 914: "Miscellaneous Symbols"); 915: 916: /** 917: * Dingbats. 918: * 0x2700 - 0x27BF. 919: */ 920: public static final UnicodeBlock DINGBATS 921: = new UnicodeBlock(0x2700, 0x27BF, 922: "DINGBATS", 923: "Dingbats"); 924: 925: /** 926: * Miscellaneous Mathematical Symbols-A. 927: * 0x27C0 - 0x27EF. 
928: * @since 1.5 929: */ 930: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 931: = new UnicodeBlock(0x27C0, 0x27EF, 932: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 933: "Miscellaneous Mathematical Symbols-A"); 934: 935: /** 936: * Supplemental Arrows-A. 937: * 0x27F0 - 0x27FF. 938: * @since 1.5 939: */ 940: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 941: = new UnicodeBlock(0x27F0, 0x27FF, 942: "SUPPLEMENTAL_ARROWS_A", 943: "Supplemental Arrows-A"); 944: 945: /** 946: * Braille Patterns. 947: * 0x2800 - 0x28FF. 948: * @since 1.4 949: */ 950: public static final UnicodeBlock BRAILLE_PATTERNS 951: = new UnicodeBlock(0x2800, 0x28FF, 952: "BRAILLE_PATTERNS", 953: "Braille Patterns"); 954: 955: /** 956: * Supplemental Arrows-B. 957: * 0x2900 - 0x297F. 958: * @since 1.5 959: */ 960: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 961: = new UnicodeBlock(0x2900, 0x297F, 962: "SUPPLEMENTAL_ARROWS_B", 963: "Supplemental Arrows-B"); 964: 965: /** 966: * Miscellaneous Mathematical Symbols-B. 967: * 0x2980 - 0x29FF. 968: * @since 1.5 969: */ 970: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 971: = new UnicodeBlock(0x2980, 0x29FF, 972: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 973: "Miscellaneous Mathematical Symbols-B"); 974: 975: /** 976: * Supplemental Mathematical Operators. 977: * 0x2A00 - 0x2AFF. 978: * @since 1.5 979: */ 980: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 981: = new UnicodeBlock(0x2A00, 0x2AFF, 982: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 983: "Supplemental Mathematical Operators"); 984: 985: /** 986: * Miscellaneous Symbols and Arrows. 987: * 0x2B00 - 0x2BFF. 988: * @since 1.5 989: */ 990: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 991: = new UnicodeBlock(0x2B00, 0x2BFF, 992: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 993: "Miscellaneous Symbols and Arrows"); 994: 995: /** 996: * CJK Radicals Supplement. 997: * 0x2E80 - 0x2EFF. 
998: * @since 1.4 999: */ 1000: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 1001: = new UnicodeBlock(0x2E80, 0x2EFF, 1002: "CJK_RADICALS_SUPPLEMENT", 1003: "CJK Radicals Supplement"); 1004: 1005: /** 1006: * Kangxi Radicals. 1007: * 0x2F00 - 0x2FDF. 1008: * @since 1.4 1009: */ 1010: public static final UnicodeBlock KANGXI_RADICALS 1011: = new UnicodeBlock(0x2F00, 0x2FDF, 1012: "KANGXI_RADICALS", 1013: "Kangxi Radicals"); 1014: 1015: /** 1016: * Ideographic Description Characters. 1017: * 0x2FF0 - 0x2FFF. 1018: * @since 1.4 1019: */ 1020: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1021: = new UnicodeBlock(0x2FF0, 0x2FFF, 1022: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1023: "Ideographic Description Characters"); 1024: 1025: /** 1026: * CJK Symbols and Punctuation. 1027: * 0x3000 - 0x303F. 1028: */ 1029: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1030: = new UnicodeBlock(0x3000, 0x303F, 1031: "CJK_SYMBOLS_AND_PUNCTUATION", 1032: "CJK Symbols and Punctuation"); 1033: 1034: /** 1035: * Hiragana. 1036: * 0x3040 - 0x309F. 1037: */ 1038: public static final UnicodeBlock HIRAGANA 1039: = new UnicodeBlock(0x3040, 0x309F, 1040: "HIRAGANA", 1041: "Hiragana"); 1042: 1043: /** 1044: * Katakana. 1045: * 0x30A0 - 0x30FF. 1046: */ 1047: public static final UnicodeBlock KATAKANA 1048: = new UnicodeBlock(0x30A0, 0x30FF, 1049: "KATAKANA", 1050: "Katakana"); 1051: 1052: /** 1053: * Bopomofo. 1054: * 0x3100 - 0x312F. 1055: */ 1056: public static final UnicodeBlock BOPOMOFO 1057: = new UnicodeBlock(0x3100, 0x312F, 1058: "BOPOMOFO", 1059: "Bopomofo"); 1060: 1061: /** 1062: * Hangul Compatibility Jamo. 1063: * 0x3130 - 0x318F. 1064: */ 1065: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1066: = new UnicodeBlock(0x3130, 0x318F, 1067: "HANGUL_COMPATIBILITY_JAMO", 1068: "Hangul Compatibility Jamo"); 1069: 1070: /** 1071: * Kanbun. 1072: * 0x3190 - 0x319F. 
1073: */ 1074: public static final UnicodeBlock KANBUN 1075: = new UnicodeBlock(0x3190, 0x319F, 1076: "KANBUN", 1077: "Kanbun"); 1078: 1079: /** 1080: * Bopomofo Extended. 1081: * 0x31A0 - 0x31BF. 1082: * @since 1.4 1083: */ 1084: public static final UnicodeBlock BOPOMOFO_EXTENDED 1085: = new UnicodeBlock(0x31A0, 0x31BF, 1086: "BOPOMOFO_EXTENDED", 1087: "Bopomofo Extended"); 1088: 1089: /** 1090: * Katakana Phonetic Extensions. 1091: * 0x31F0 - 0x31FF. 1092: * @since 1.5 1093: */ 1094: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1095: = new UnicodeBlock(0x31F0, 0x31FF, 1096: "KATAKANA_PHONETIC_EXTENSIONS", 1097: "Katakana Phonetic Extensions"); 1098: 1099: /** 1100: * Enclosed CJK Letters and Months. 1101: * 0x3200 - 0x32FF. 1102: */ 1103: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1104: = new UnicodeBlock(0x3200, 0x32FF, 1105: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1106: "Enclosed CJK Letters and Months"); 1107: 1108: /** 1109: * CJK Compatibility. 1110: * 0x3300 - 0x33FF. 1111: */ 1112: public static final UnicodeBlock CJK_COMPATIBILITY 1113: = new UnicodeBlock(0x3300, 0x33FF, 1114: "CJK_COMPATIBILITY", 1115: "CJK Compatibility"); 1116: 1117: /** 1118: * CJK Unified Ideographs Extension A. 1119: * 0x3400 - 0x4DBF. 1120: * @since 1.4 1121: */ 1122: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1123: = new UnicodeBlock(0x3400, 0x4DBF, 1124: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1125: "CJK Unified Ideographs Extension A"); 1126: 1127: /** 1128: * Yijing Hexagram Symbols. 1129: * 0x4DC0 - 0x4DFF. 1130: * @since 1.5 1131: */ 1132: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1133: = new UnicodeBlock(0x4DC0, 0x4DFF, 1134: "YIJING_HEXAGRAM_SYMBOLS", 1135: "Yijing Hexagram Symbols"); 1136: 1137: /** 1138: * CJK Unified Ideographs. 1139: * 0x4E00 - 0x9FFF. 
1140: */ 1141: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1142: = new UnicodeBlock(0x4E00, 0x9FFF, 1143: "CJK_UNIFIED_IDEOGRAPHS", 1144: "CJK Unified Ideographs"); 1145: 1146: /** 1147: * Yi Syllables. 1148: * 0xA000 - 0xA48F. 1149: * @since 1.4 1150: */ 1151: public static final UnicodeBlock YI_SYLLABLES 1152: = new UnicodeBlock(0xA000, 0xA48F, 1153: "YI_SYLLABLES", 1154: "Yi Syllables"); 1155: 1156: /** 1157: * Yi Radicals. 1158: * 0xA490 - 0xA4CF. 1159: * @since 1.4 1160: */ 1161: public static final UnicodeBlock YI_RADICALS 1162: = new UnicodeBlock(0xA490, 0xA4CF, 1163: "YI_RADICALS", 1164: "Yi Radicals"); 1165: 1166: /** 1167: * Hangul Syllables. 1168: * 0xAC00 - 0xD7AF. 1169: */ 1170: public static final UnicodeBlock HANGUL_SYLLABLES 1171: = new UnicodeBlock(0xAC00, 0xD7AF, 1172: "HANGUL_SYLLABLES", 1173: "Hangul Syllables"); 1174: 1175: /** 1176: * High Surrogates. 1177: * 0xD800 - 0xDB7F. 1178: * @since 1.5 1179: */ 1180: public static final UnicodeBlock HIGH_SURROGATES 1181: = new UnicodeBlock(0xD800, 0xDB7F, 1182: "HIGH_SURROGATES", 1183: "High Surrogates"); 1184: 1185: /** 1186: * High Private Use Surrogates. 1187: * 0xDB80 - 0xDBFF. 1188: * @since 1.5 1189: */ 1190: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1191: = new UnicodeBlock(0xDB80, 0xDBFF, 1192: "HIGH_PRIVATE_USE_SURROGATES", 1193: "High Private Use Surrogates"); 1194: 1195: /** 1196: * Low Surrogates. 1197: * 0xDC00 - 0xDFFF. 1198: * @since 1.5 1199: */ 1200: public static final UnicodeBlock LOW_SURROGATES 1201: = new UnicodeBlock(0xDC00, 0xDFFF, 1202: "LOW_SURROGATES", 1203: "Low Surrogates"); 1204: 1205: /** 1206: * Private Use Area. 1207: * 0xE000 - 0xF8FF. 1208: */ 1209: public static final UnicodeBlock PRIVATE_USE_AREA 1210: = new UnicodeBlock(0xE000, 0xF8FF, 1211: "PRIVATE_USE_AREA", 1212: "Private Use Area"); 1213: 1214: /** 1215: * CJK Compatibility Ideographs. 1216: * 0xF900 - 0xFAFF. 
1217: */ 1218: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1219: = new UnicodeBlock(0xF900, 0xFAFF, 1220: "CJK_COMPATIBILITY_IDEOGRAPHS", 1221: "CJK Compatibility Ideographs"); 1222: 1223: /** 1224: * Alphabetic Presentation Forms. 1225: * 0xFB00 - 0xFB4F. 1226: */ 1227: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1228: = new UnicodeBlock(0xFB00, 0xFB4F, 1229: "ALPHABETIC_PRESENTATION_FORMS", 1230: "Alphabetic Presentation Forms"); 1231: 1232: /** 1233: * Arabic Presentation Forms-A. 1234: * 0xFB50 - 0xFDFF. 1235: */ 1236: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1237: = new UnicodeBlock(0xFB50, 0xFDFF, 1238: "ARABIC_PRESENTATION_FORMS_A", 1239: "Arabic Presentation Forms-A"); 1240: 1241: /** 1242: * Variation Selectors. 1243: * 0xFE00 - 0xFE0F. 1244: * @since 1.5 1245: */ 1246: public static final UnicodeBlock VARIATION_SELECTORS 1247: = new UnicodeBlock(0xFE00, 0xFE0F, 1248: "VARIATION_SELECTORS", 1249: "Variation Selectors"); 1250: 1251: /** 1252: * Combining Half Marks. 1253: * 0xFE20 - 0xFE2F. 1254: */ 1255: public static final UnicodeBlock COMBINING_HALF_MARKS 1256: = new UnicodeBlock(0xFE20, 0xFE2F, 1257: "COMBINING_HALF_MARKS", 1258: "Combining Half Marks"); 1259: 1260: /** 1261: * CJK Compatibility Forms. 1262: * 0xFE30 - 0xFE4F. 1263: */ 1264: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1265: = new UnicodeBlock(0xFE30, 0xFE4F, 1266: "CJK_COMPATIBILITY_FORMS", 1267: "CJK Compatibility Forms"); 1268: 1269: /** 1270: * Small Form Variants. 1271: * 0xFE50 - 0xFE6F. 1272: */ 1273: public static final UnicodeBlock SMALL_FORM_VARIANTS 1274: = new UnicodeBlock(0xFE50, 0xFE6F, 1275: "SMALL_FORM_VARIANTS", 1276: "Small Form Variants"); 1277: 1278: /** 1279: * Arabic Presentation Forms-B. 1280: * 0xFE70 - 0xFEFF. 
1281: */ 1282: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1283: = new UnicodeBlock(0xFE70, 0xFEFF, 1284: "ARABIC_PRESENTATION_FORMS_B", 1285: "Arabic Presentation Forms-B"); 1286: 1287: /** 1288: * Halfwidth and Fullwidth Forms. 1289: * 0xFF00 - 0xFFEF. 1290: */ 1291: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1292: = new UnicodeBlock(0xFF00, 0xFFEF, 1293: "HALFWIDTH_AND_FULLWIDTH_FORMS", 1294: "Halfwidth and Fullwidth Forms"); 1295: 1296: /** 1297: * Specials. 1298: * 0xFFF0 - 0xFFFF. 1299: */ 1300: public static final UnicodeBlock SPECIALS 1301: = new UnicodeBlock(0xFFF0, 0xFFFF, 1302: "SPECIALS", 1303: "Specials"); 1304: 1305: /** 1306: * Linear B Syllabary. 1307: * 0x10000 - 0x1007F. 1308: * @since 1.5 1309: */ 1310: public static final UnicodeBlock LINEAR_B_SYLLABARY 1311: = new UnicodeBlock(0x10000, 0x1007F, 1312: "LINEAR_B_SYLLABARY", 1313: "Linear B Syllabary"); 1314: 1315: /** 1316: * Linear B Ideograms. 1317: * 0x10080 - 0x100FF. 1318: * @since 1.5 1319: */ 1320: public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1321: = new UnicodeBlock(0x10080, 0x100FF, 1322: "LINEAR_B_IDEOGRAMS", 1323: "Linear B Ideograms"); 1324: 1325: /** 1326: * Aegean Numbers. 1327: * 0x10100 - 0x1013F. 1328: * @since 1.5 1329: */ 1330: public static final UnicodeBlock AEGEAN_NUMBERS 1331: = new UnicodeBlock(0x10100, 0x1013F, 1332: "AEGEAN_NUMBERS", 1333: "Aegean Numbers"); 1334: 1335: /** 1336: * Old Italic. 1337: * 0x10300 - 0x1032F. 1338: * @since 1.5 1339: */ 1340: public static final UnicodeBlock OLD_ITALIC 1341: = new UnicodeBlock(0x10300, 0x1032F, 1342: "OLD_ITALIC", 1343: "Old Italic"); 1344: 1345: /** 1346: * Gothic. 1347: * 0x10330 - 0x1034F. 1348: * @since 1.5 1349: */ 1350: public static final UnicodeBlock GOTHIC 1351: = new UnicodeBlock(0x10330, 0x1034F, 1352: "GOTHIC", 1353: "Gothic"); 1354: 1355: /** 1356: * Ugaritic. 1357: * 0x10380 - 0x1039F. 
1358: * @since 1.5 1359: */ 1360: public static final UnicodeBlock UGARITIC 1361: = new UnicodeBlock(0x10380, 0x1039F, 1362: "UGARITIC", 1363: "Ugaritic"); 1364: 1365: /** 1366: * Deseret. 1367: * 0x10400 - 0x1044F. 1368: * @since 1.5 1369: */ 1370: public static final UnicodeBlock DESERET 1371: = new UnicodeBlock(0x10400, 0x1044F, 1372: "DESERET", 1373: "Deseret"); 1374: 1375: /** 1376: * Shavian. 1377: * 0x10450 - 0x1047F. 1378: * @since 1.5 1379: */ 1380: public static final UnicodeBlock SHAVIAN 1381: = new UnicodeBlock(0x10450, 0x1047F, 1382: "SHAVIAN", 1383: "Shavian"); 1384: 1385: /** 1386: * Osmanya. 1387: * 0x10480 - 0x104AF. 1388: * @since 1.5 1389: */ 1390: public static final UnicodeBlock OSMANYA 1391: = new UnicodeBlock(0x10480, 0x104AF, 1392: "OSMANYA", 1393: "Osmanya"); 1394: 1395: /** 1396: * Cypriot Syllabary. 1397: * 0x10800 - 0x1083F. 1398: * @since 1.5 1399: */ 1400: public static final UnicodeBlock CYPRIOT_SYLLABARY 1401: = new UnicodeBlock(0x10800, 0x1083F, 1402: "CYPRIOT_SYLLABARY", 1403: "Cypriot Syllabary"); 1404: 1405: /** 1406: * Byzantine Musical Symbols. 1407: * 0x1D000 - 0x1D0FF. 1408: * @since 1.5 1409: */ 1410: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1411: = new UnicodeBlock(0x1D000, 0x1D0FF, 1412: "BYZANTINE_MUSICAL_SYMBOLS", 1413: "Byzantine Musical Symbols"); 1414: 1415: /** 1416: * Musical Symbols. 1417: * 0x1D100 - 0x1D1FF. 1418: * @since 1.5 1419: */ 1420: public static final UnicodeBlock MUSICAL_SYMBOLS 1421: = new UnicodeBlock(0x1D100, 0x1D1FF, 1422: "MUSICAL_SYMBOLS", 1423: "Musical Symbols"); 1424: 1425: /** 1426: * Tai Xuan Jing Symbols. 1427: * 0x1D300 - 0x1D35F. 1428: * @since 1.5 1429: */ 1430: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1431: = new UnicodeBlock(0x1D300, 0x1D35F, 1432: "TAI_XUAN_JING_SYMBOLS", 1433: "Tai Xuan Jing Symbols"); 1434: 1435: /** 1436: * Mathematical Alphanumeric Symbols. 1437: * 0x1D400 - 0x1D7FF. 
1438: * @since 1.5 1439: */ 1440: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1441: = new UnicodeBlock(0x1D400, 0x1D7FF, 1442: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1443: "Mathematical Alphanumeric Symbols"); 1444: 1445: /** 1446: * CJK Unified Ideographs Extension B. 1447: * 0x20000 - 0x2A6DF. 1448: * @since 1.5 1449: */ 1450: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1451: = new UnicodeBlock(0x20000, 0x2A6DF, 1452: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1453: "CJK Unified Ideographs Extension B"); 1454: 1455: /** 1456: * CJK Compatibility Ideographs Supplement. 1457: * 0x2F800 - 0x2FA1F. 1458: * @since 1.5 1459: */ 1460: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1461: = new UnicodeBlock(0x2F800, 0x2FA1F, 1462: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1463: "CJK Compatibility Ideographs Supplement"); 1464: 1465: /** 1466: * Tags. 1467: * 0xE0000 - 0xE007F. 1468: * @since 1.5 1469: */ 1470: public static final UnicodeBlock TAGS 1471: = new UnicodeBlock(0xE0000, 0xE007F, 1472: "TAGS", 1473: "Tags"); 1474: 1475: /** 1476: * Variation Selectors Supplement. 1477: * 0xE0100 - 0xE01EF. 1478: * @since 1.5 1479: */ 1480: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1481: = new UnicodeBlock(0xE0100, 0xE01EF, 1482: "VARIATION_SELECTORS_SUPPLEMENT", 1483: "Variation Selectors Supplement"); 1484: 1485: /** 1486: * Supplementary Private Use Area-A. 1487: * 0xF0000 - 0xFFFFF. 1488: * @since 1.5 1489: */ 1490: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1491: = new UnicodeBlock(0xF0000, 0xFFFFF, 1492: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1493: "Supplementary Private Use Area-A"); 1494: 1495: /** 1496: * Supplementary Private Use Area-B. 1497: * 0x100000 - 0x10FFFF. 
1498: * @since 1.5 1499: */ 1500: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1501: = new UnicodeBlock(0x100000, 0x10FFFF, 1502: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1503: "Supplementary Private Use Area-B"); 1504: 1505: /** 1506: * Surrogates Area. 1507: * 'D800' - 'DFFF'. 1508: * @deprecated As of 1.5, the three areas, 1509: * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1510: * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1511: * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1512: * by the Unicode standard, should be used in preference to 1513: * this. These are also returned from calls to <code>of(int)</code> 1514: * and <code>of(char)</code>. 1515: */ 1516: public static final UnicodeBlock SURROGATES_AREA 1517: = new UnicodeBlock(0xD800, 0xDFFF, 1518: "SURROGATES_AREA", 1519: "Surrogates Area"); 1520: 1521: /** 1522: * The defined subsets. 1523: */ 1524: private static final UnicodeBlock sets[] = { 1525: BASIC_LATIN, 1526: LATIN_1_SUPPLEMENT, 1527: LATIN_EXTENDED_A, 1528: LATIN_EXTENDED_B, 1529: IPA_EXTENSIONS, 1530: SPACING_MODIFIER_LETTERS, 1531: COMBINING_DIACRITICAL_MARKS, 1532: GREEK, 1533: CYRILLIC, 1534: CYRILLIC_SUPPLEMENTARY, 1535: ARMENIAN, 1536: HEBREW, 1537: ARABIC, 1538: SYRIAC, 1539: THAANA, 1540: DEVANAGARI, 1541: BENGALI, 1542: GURMUKHI, 1543: GUJARATI, 1544: ORIYA, 1545: TAMIL, 1546: TELUGU, 1547: KANNADA, 1548: MALAYALAM, 1549: SINHALA, 1550: THAI, 1551: LAO, 1552: TIBETAN, 1553: MYANMAR, 1554: GEORGIAN, 1555: HANGUL_JAMO, 1556: ETHIOPIC, 1557: CHEROKEE, 1558: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1559: OGHAM, 1560: RUNIC, 1561: TAGALOG, 1562: HANUNOO, 1563: BUHID, 1564: TAGBANWA, 1565: KHMER, 1566: MONGOLIAN, 1567: LIMBU, 1568: TAI_LE, 1569: KHMER_SYMBOLS, 1570: PHONETIC_EXTENSIONS, 1571: LATIN_EXTENDED_ADDITIONAL, 1572: GREEK_EXTENDED, 1573: GENERAL_PUNCTUATION, 1574: SUPERSCRIPTS_AND_SUBSCRIPTS, 1575: CURRENCY_SYMBOLS, 1576: COMBINING_MARKS_FOR_SYMBOLS, 1577: 
LETTERLIKE_SYMBOLS, 1578: NUMBER_FORMS, 1579: ARROWS, 1580: MATHEMATICAL_OPERATORS, 1581: MISCELLANEOUS_TECHNICAL, 1582: CONTROL_PICTURES, 1583: OPTICAL_CHARACTER_RECOGNITION, 1584: ENCLOSED_ALPHANUMERICS, 1585: BOX_DRAWING, 1586: BLOCK_ELEMENTS, 1587: GEOMETRIC_SHAPES, 1588: MISCELLANEOUS_SYMBOLS, 1589: DINGBATS, 1590: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 1591: SUPPLEMENTAL_ARROWS_A, 1592: BRAILLE_PATTERNS, 1593: SUPPLEMENTAL_ARROWS_B, 1594: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 1595: SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 1596: MISCELLANEOUS_SYMBOLS_AND_ARROWS, 1597: CJK_RADICALS_SUPPLEMENT, 1598: KANGXI_RADICALS, 1599: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 1600: CJK_SYMBOLS_AND_PUNCTUATION, 1601: HIRAGANA, 1602: KATAKANA, 1603: BOPOMOFO, 1604: HANGUL_COMPATIBILITY_JAMO, 1605: KANBUN, 1606: BOPOMOFO_EXTENDED, 1607: KATAKANA_PHONETIC_EXTENSIONS, 1608: ENCLOSED_CJK_LETTERS_AND_MONTHS, 1609: CJK_COMPATIBILITY, 1610: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 1611: YIJING_HEXAGRAM_SYMBOLS, 1612: CJK_UNIFIED_IDEOGRAPHS, 1613: YI_SYLLABLES, 1614: YI_RADICALS, 1615: HANGUL_SYLLABLES, 1616: HIGH_SURROGATES, 1617: HIGH_PRIVATE_USE_SURROGATES, 1618: LOW_SURROGATES, 1619: PRIVATE_USE_AREA, 1620: CJK_COMPATIBILITY_IDEOGRAPHS, 1621: ALPHABETIC_PRESENTATION_FORMS, 1622: ARABIC_PRESENTATION_FORMS_A, 1623: VARIATION_SELECTORS, 1624: COMBINING_HALF_MARKS, 1625: CJK_COMPATIBILITY_FORMS, 1626: SMALL_FORM_VARIANTS, 1627: ARABIC_PRESENTATION_FORMS_B, 1628: HALFWIDTH_AND_FULLWIDTH_FORMS, 1629: SPECIALS, 1630: LINEAR_B_SYLLABARY, 1631: LINEAR_B_IDEOGRAMS, 1632: AEGEAN_NUMBERS, 1633: OLD_ITALIC, 1634: GOTHIC, 1635: UGARITIC, 1636: DESERET, 1637: SHAVIAN, 1638: OSMANYA, 1639: CYPRIOT_SYLLABARY, 1640: BYZANTINE_MUSICAL_SYMBOLS, 1641: MUSICAL_SYMBOLS, 1642: TAI_XUAN_JING_SYMBOLS, 1643: MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 1644: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 1645: CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 1646: TAGS, 1647: VARIATION_SELECTORS_SUPPLEMENT, 1648: SUPPLEMENTARY_PRIVATE_USE_AREA_A, 
1649: SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1650: }; 1651: } // class UnicodeBlock 1652: 1653: /** 1654: * A class to encompass all the properties of characters in the 1655: * private use blocks in the Unicode standard. This class extends 1656: * UnassignedCharacters because the return type from getType() is 1657: * different. 1658: * @author Anthony Balkissoon abalkiss at redhat dot com 1659: * 1660: */ 1661: private static class PrivateUseCharacters extends UnassignedCharacters 1662: { 1663: /** 1664: * Returns the type of the character cp. 1665: */ 1666: static int getType(int cp) 1667: { 1668: // The upper 2 code points in any plane are considered unassigned, 1669: // even in the private-use planes. 1670: if ((cp & 0xffff) >= 0xfffe) 1671: return UnassignedCharacters.getType(cp); 1672: return PRIVATE_USE; 1673: } 1674: 1675: /** 1676: * Returns true if the character cp is defined. 1677: */ 1678: static boolean isDefined(int cp) 1679: { 1680: // The upper 2 code points in any plane are considered unassigned, 1681: // even in the private-use planes. 1682: if ((cp & 0xffff) >= 0xfffe) 1683: return UnassignedCharacters.isDefined(cp); 1684: return true; 1685: } 1686: 1687: /** 1688: * Gets the directionality for the character cp. 1689: */ 1690: static byte getDirectionality(int cp) 1691: { 1692: if ((cp & 0xffff) >= 0xfffe) 1693: return UnassignedCharacters.getDirectionality(cp); 1694: return DIRECTIONALITY_LEFT_TO_RIGHT; 1695: } 1696: } 1697: 1698: /** 1699: * A class to encompass all the properties of code points that are 1700: * currently undefined in the Unicode standard. 1701: * @author Anthony Balkissoon abalkiss at redhat dot com 1702: * 1703: */ 1704: private static class UnassignedCharacters 1705: { 1706: /** 1707: * Returns the numeric value for the unassigned characters. 
1708: * @param cp the character 1709: * @param radix the radix (not used) 1710: * @return the numeric value of this character in this radix 1711: */ 1712: static int digit(int cp, int radix) 1713: { 1714: return -1; 1715: } 1716: 1717: /** 1718: * Returns the Unicode directionality property for unassigned 1719: * characters. 1720: * @param cp the character 1721: * @return DIRECTIONALITY_UNDEFINED 1722: */ 1723: static byte getDirectionality(int cp) 1724: { 1725: return DIRECTIONALITY_UNDEFINED; 1726: } 1727: 1728: /** 1729: * Returns -1, the numeric value for unassigned Unicode characters. 1730: * @param cp the character 1731: * @return -1 1732: */ 1733: static int getNumericValue(int cp) 1734: { 1735: return -1; 1736: } 1737: 1738: /** 1739: * Returns UNASSIGNED, the type of unassigned Unicode characters. 1740: * @param cp the character 1741: * @return UNASSIGNED 1742: */ 1743: static int getType(int cp) 1744: { 1745: return UNASSIGNED; 1746: } 1747: 1748: /** 1749: * Returns false to indicate that the character is not defined in the 1750: * Unicode standard. 1751: * @param cp the character 1752: * @return false 1753: */ 1754: static boolean isDefined(int cp) 1755: { 1756: return false; 1757: } 1758: 1759: /** 1760: * Returns false to indicate that the character is not a digit. 1761: * @param cp the character 1762: * @return false 1763: */ 1764: static boolean isDigit(int cp) 1765: { 1766: return false; 1767: } 1768: 1769: /** 1770: * Returns false to indicate that the character cannot be ignored 1771: * within an identifier. 1772: * @param cp the character 1773: * @return false 1774: */ 1775: static boolean isIdentifierIgnorable(int cp) 1776: { 1777: return false; 1778: } 1779: 1780: /** 1781: * Returns false to indicate that the character cannot be part of a 1782: * Java identifier. 
1783: * @param cp the character 1784: * @return false 1785: */ 1786: static boolean isJavaIdentifierPart(int cp) 1787: { 1788: return false; 1789: } 1790: 1791: /** 1792: * Returns false to indicate that the character cannot start a 1793: * Java identifier. NOTE(review): the method name is spelled "Identifer" (sic); any rename must also update its callers. 1794: * @param cp the character 1795: * @return false 1796: */ 1797: static boolean isJavaIdentiferStart(int cp) 1798: { 1799: return false; 1800: } 1801: 1802: /** 1803: * Returns false to indicate that the character is not a letter. 1804: * @param cp the character 1805: * @return false 1806: */ 1807: static boolean isLetter(int cp) 1808: { 1809: return false; 1810: } 1811: 1812: /** 1813: * Returns false to indicate that the character is neither a letter 1814: * nor a digit. 1815: * @param cp the character 1816: * @return false 1817: */ 1818: static boolean isLetterOrDigit(int cp) 1819: { 1820: return false; 1821: } 1822: 1823: /** 1824: * Returns false to indicate that the character is not a lowercase letter. 1825: * @param cp the character 1826: * @return false 1827: */ 1828: static boolean isLowerCase(int cp) 1829: { 1830: return false; 1831: } 1832: 1833: /** 1834: * Returns false to indicate that the character is not mirrored. 1835: * @param cp the character 1836: * @return false 1837: */ 1838: static boolean isMirrored(int cp) 1839: { 1840: return false; 1841: } 1842: 1843: /** 1844: * Returns false to indicate that the character is not a space character. 1845: * @param cp the character 1846: * @return false 1847: */ 1848: static boolean isSpaceChar(int cp) 1849: { 1850: return false; 1851: } 1852: 1853: /** 1854: * Returns false to indicate that the character is not a titlecase letter. 1855: * @param cp the character 1856: * @return false 1857: */ 1858: static boolean isTitleCase(int cp) 1859: { 1860: return false; 1861: } 1862: 1863: /** 1864: * Returns false to indicate that the character cannot be part of a 1865: * Unicode identifier. 
1866: * @param cp the character 1867: * @return false 1868: */ 1869: static boolean isUnicodeIdentifierPart(int cp) 1870: { 1871: return false; 1872: } 1873: 1874: /** 1875: * Returns false to indicate that the character cannot start a 1876: * Unicode identifier. 1877: * @param cp the character 1878: * @return false 1879: */ 1880: static boolean isUnicodeIdentifierStart(int cp) 1881: { 1882: return false; 1883: } 1884: 1885: /** 1886: * Returns false to indicate that the character is not an uppercase letter. 1887: * @param cp the character 1888: * @return false 1889: */ 1890: static boolean isUpperCase(int cp) 1891: { 1892: return false; 1893: } 1894: 1895: /** 1896: * Returns false to indicate that the character is not a whitespace 1897: * character. 1898: * @param cp the character 1899: * @return false 1900: */ 1901: static boolean isWhiteSpace(int cp) 1902: { 1903: return false; 1904: } 1905: 1906: /** 1907: * Returns cp to indicate this character has no lowercase conversion. 1908: * @param cp the character 1909: * @return cp 1910: */ 1911: static int toLowerCase(int cp) 1912: { 1913: return cp; 1914: } 1915: 1916: /** 1917: * Returns cp to indicate this character has no titlecase conversion. 1918: * @param cp the character 1919: * @return cp 1920: */ 1921: static int toTitleCase(int cp) 1922: { 1923: return cp; 1924: } 1925: 1926: /** 1927: * Returns cp to indicate this character has no uppercase conversion. 1928: * @param cp the character 1929: * @return cp 1930: */ 1931: static int toUpperCase(int cp) 1932: { 1933: return cp; 1934: } 1935: } 1936: 1937: /** 1938: * The immutable value of this Character. 1939: * 1940: * @serial the value of this Character 1941: */ 1942: private final char value; 1943: 1944: /** 1945: * Compatible with JDK 1.0+. 1946: */ 1947: private static final long serialVersionUID = 3786198910865385080L; 1948: 1949: /** 1950: * Smallest value allowed for radix arguments in Java. This value is 2. 
1951: * 1952: * @see #digit(char, int) 1953: * @see #forDigit(int, int) 1954: * @see Integer#toString(int, int) 1955: * @see Integer#valueOf(String) 1956: */ 1957: public static final int MIN_RADIX = 2; 1958: 1959: /** 1960: * Largest value allowed for radix arguments in Java. This value is 36. 1961: * 1962: * @see #digit(char, int) 1963: * @see #forDigit(int, int) 1964: * @see Integer#toString(int, int) 1965: * @see Integer#valueOf(String) 1966: */ 1967: public static final int MAX_RADIX = 36; 1968: 1969: /** 1970: * The minimum value the char data type can hold. 1971: * This value is <code>'\\u0000'</code>. 1972: */ 1973: public static final char MIN_VALUE = '\u0000'; 1974: 1975: /** 1976: * The maximum value the char data type can hold. 1977: * This value is <code>'\\uFFFF'</code>. 1978: */ 1979: public static final char MAX_VALUE = '\uFFFF'; 1980: 1981: /** 1982: * Class object representing the primitive char data type. 1983: * 1984: * @since 1.1 1985: */ 1986: public static final Class TYPE = VMClassLoader.getPrimitiveClass('C'); 1987: 1988: /** 1989: * The number of bits needed to represent a <code>char</code>. 1990: * @since 1.5 1991: */ 1992: public static final int SIZE = 16; 1993: 1994: // This caches some Character values, and is used by boxing 1995: // conversions via valueOf(). We must cache at least 0..127; 1996: // this constant controls how much we actually cache. 1997: private static final int MAX_CACHE = 127; 1998: private static Character[] charCache = new Character[MAX_CACHE + 1]; 1999: 2000: /** 2001: * Lu = Letter, Uppercase (Informative). 2002: * 2003: * @since 1.1 2004: */ 2005: public static final byte UPPERCASE_LETTER = 1; 2006: 2007: /** 2008: * Ll = Letter, Lowercase (Informative). 2009: * 2010: * @since 1.1 2011: */ 2012: public static final byte LOWERCASE_LETTER = 2; 2013: 2014: /** 2015: * Lt = Letter, Titlecase (Informative). 
2016: * 2017: * @since 1.1 2018: */ 2019: public static final byte TITLECASE_LETTER = 3; 2020: 2021: /** 2022: * Mn = Mark, Non-Spacing (Normative). 2023: * 2024: * @since 1.1 2025: */ 2026: public static final byte NON_SPACING_MARK = 6; 2027: 2028: /** 2029: * Mc = Mark, Spacing Combining (Normative). 2030: * 2031: * @since 1.1 2032: */ 2033: public static final byte COMBINING_SPACING_MARK = 8; 2034: 2035: /** 2036: * Me = Mark, Enclosing (Normative). 2037: * 2038: * @since 1.1 2039: */ 2040: public static final byte ENCLOSING_MARK = 7; 2041: 2042: /** 2043: * Nd = Number, Decimal Digit (Normative). 2044: * 2045: * @since 1.1 2046: */ 2047: public static final byte DECIMAL_DIGIT_NUMBER = 9; 2048: 2049: /** 2050: * Nl = Number, Letter (Normative). 2051: * 2052: * @since 1.1 2053: */ 2054: public static final byte LETTER_NUMBER = 10; 2055: 2056: /** 2057: * No = Number, Other (Normative). 2058: * 2059: * @since 1.1 2060: */ 2061: public static final byte OTHER_NUMBER = 11; 2062: 2063: /** 2064: * Zs = Separator, Space (Normative). 2065: * 2066: * @since 1.1 2067: */ 2068: public static final byte SPACE_SEPARATOR = 12; 2069: 2070: /** 2071: * Zl = Separator, Line (Normative). 2072: * 2073: * @since 1.1 2074: */ 2075: public static final byte LINE_SEPARATOR = 13; 2076: 2077: /** 2078: * Zp = Separator, Paragraph (Normative). 2079: * 2080: * @since 1.1 2081: */ 2082: public static final byte PARAGRAPH_SEPARATOR = 14; 2083: 2084: /** 2085: * Cc = Other, Control (Normative). 2086: * 2087: * @since 1.1 2088: */ 2089: public static final byte CONTROL = 15; 2090: 2091: /** 2092: * Cf = Other, Format (Normative). 2093: * 2094: * @since 1.1 2095: */ 2096: public static final byte FORMAT = 16; 2097: 2098: /** 2099: * Cs = Other, Surrogate (Normative). 2100: * 2101: * @since 1.1 2102: */ 2103: public static final byte SURROGATE = 19; 2104: 2105: /** 2106: * Co = Other, Private Use (Normative). 
2107: * 2108: * @since 1.1 2109: */ 2110: public static final byte PRIVATE_USE = 18; 2111: 2112: /** 2113: * Cn = Other, Not Assigned (Normative). 2114: * 2115: * @since 1.1 2116: */ 2117: public static final byte UNASSIGNED = 0; 2118: 2119: /** 2120: * Lm = Letter, Modifier (Informative). 2121: * 2122: * @since 1.1 2123: */ 2124: public static final byte MODIFIER_LETTER = 4; 2125: 2126: /** 2127: * Lo = Letter, Other (Informative). 2128: * 2129: * @since 1.1 2130: */ 2131: public static final byte OTHER_LETTER = 5; 2132: 2133: /** 2134: * Pc = Punctuation, Connector (Informative). 2135: * 2136: * @since 1.1 2137: */ 2138: public static final byte CONNECTOR_PUNCTUATION = 23; 2139: 2140: /** 2141: * Pd = Punctuation, Dash (Informative). 2142: * 2143: * @since 1.1 2144: */ 2145: public static final byte DASH_PUNCTUATION = 20; 2146: 2147: /** 2148: * Ps = Punctuation, Open (Informative). 2149: * 2150: * @since 1.1 2151: */ 2152: public static final byte START_PUNCTUATION = 21; 2153: 2154: /** 2155: * Pe = Punctuation, Close (Informative). 2156: * 2157: * @since 1.1 2158: */ 2159: public static final byte END_PUNCTUATION = 22; 2160: 2161: /** 2162: * Pi = Punctuation, Initial Quote (Informative). 2163: * 2164: * @since 1.4 2165: */ 2166: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 2167: 2168: /** 2169: * Pf = Punctuation, Final Quote (Informative). 2170: * 2171: * @since 1.4 2172: */ 2173: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 2174: 2175: /** 2176: * Po = Punctuation, Other (Informative). 2177: * 2178: * @since 1.1 2179: */ 2180: public static final byte OTHER_PUNCTUATION = 24; 2181: 2182: /** 2183: * Sm = Symbol, Math (Informative). 2184: * 2185: * @since 1.1 2186: */ 2187: public static final byte MATH_SYMBOL = 25; 2188: 2189: /** 2190: * Sc = Symbol, Currency (Informative). 2191: * 2192: * @since 1.1 2193: */ 2194: public static final byte CURRENCY_SYMBOL = 26; 2195: 2196: /** 2197: * Sk = Symbol, Modifier (Informative). 
2198: * 2199: * @since 1.1 2200: */ 2201: public static final byte MODIFIER_SYMBOL = 27; 2202: 2203: /** 2204: * So = Symbol, Other (Informative). 2205: * 2206: * @since 1.1 2207: */ 2208: public static final byte OTHER_SYMBOL = 28; 2209: 2210: /** 2211: * Undefined bidirectional character type. Undefined char values have 2212: * undefined directionality in the Unicode specification. 2213: * 2214: * @since 1.4 2215: */ 2216: public static final byte DIRECTIONALITY_UNDEFINED = -1; 2217: 2218: /** 2219: * Strong bidirectional character type "L". 2220: * 2221: * @since 1.4 2222: */ 2223: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 2224: 2225: /** 2226: * Strong bidirectional character type "R". 2227: * 2228: * @since 1.4 2229: */ 2230: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 2231: 2232: /** 2233: * Strong bidirectional character type "AL". 2234: * 2235: * @since 1.4 2236: */ 2237: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 2238: 2239: /** 2240: * Weak bidirectional character type "EN". 2241: * 2242: * @since 1.4 2243: */ 2244: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 2245: 2246: /** 2247: * Weak bidirectional character type "ES". 2248: * 2249: * @since 1.4 2250: */ 2251: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 2252: 2253: /** 2254: * Weak bidirectional character type "ET". 2255: * 2256: * @since 1.4 2257: */ 2258: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 2259: 2260: /** 2261: * Weak bidirectional character type "AN". 2262: * 2263: * @since 1.4 2264: */ 2265: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 2266: 2267: /** 2268: * Weak bidirectional character type "CS". 2269: * 2270: * @since 1.4 2271: */ 2272: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 2273: 2274: /** 2275: * Weak bidirectional character type "NSM". 
2276: * 2277: * @since 1.4 2278: */ 2279: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 2280: 2281: /** 2282: * Weak bidirectional character type "BN". 2283: * 2284: * @since 1.4 2285: */ 2286: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 2287: 2288: /** 2289: * Neutral bidirectional character type "B". 2290: * 2291: * @since 1.4 2292: */ 2293: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 2294: 2295: /** 2296: * Neutral bidirectional character type "S". 2297: * 2298: * @since 1.4 2299: */ 2300: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 2301: 2302: /** 2303: * Neutral bidirectional character type "WS". 2304: * 2305: * @since 1.4 2306: */ 2307: public static final byte DIRECTIONALITY_WHITESPACE = 12; 2308: 2309: /** 2310: * Neutral bidirectional character type "ON". 2311: * 2312: * @since 1.4 2313: */ 2314: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 2315: 2316: /** 2317: * Strong bidirectional character type "LRE". 2318: * 2319: * @since 1.4 2320: */ 2321: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 2322: 2323: /** 2324: * Strong bidirectional character type "LRO". 2325: * 2326: * @since 1.4 2327: */ 2328: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 2329: 2330: /** 2331: * Strong bidirectional character type "RLE". 2332: * 2333: * @since 1.4 2334: */ 2335: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 2336: 2337: /** 2338: * Strong bidirectional character type "RLO". 2339: * 2340: * @since 1.4 2341: */ 2342: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 2343: 2344: /** 2345: * Weak bidirectional character type "PDF". 2346: * 2347: * @since 1.4 2348: */ 2349: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2350: 2351: /** 2352: * Stores unicode block offset lookup table. Exploit package visibility of 2353: * String.value to avoid copying the array. 
2354: * @see #readCodePoint(int) 2355: * @see CharData#BLOCKS 2356: */ 2357: private static final char[][] blocks = 2358: new char[][]{ 2359: String.zeroBasedStringValue(CharData.BLOCKS[0]), 2360: String.zeroBasedStringValue(CharData.BLOCKS[1]), 2361: String.zeroBasedStringValue(CharData.BLOCKS[2]), 2362: String.zeroBasedStringValue(CharData.BLOCKS[3]), 2363: String.zeroBasedStringValue(CharData.BLOCKS[4]), 2364: String.zeroBasedStringValue(CharData.BLOCKS[5]), 2365: String.zeroBasedStringValue(CharData.BLOCKS[6]), 2366: String.zeroBasedStringValue(CharData.BLOCKS[7]), 2367: String.zeroBasedStringValue(CharData.BLOCKS[8]), 2368: String.zeroBasedStringValue(CharData.BLOCKS[9]), 2369: String.zeroBasedStringValue(CharData.BLOCKS[10]), 2370: String.zeroBasedStringValue(CharData.BLOCKS[11]), 2371: String.zeroBasedStringValue(CharData.BLOCKS[12]), 2372: String.zeroBasedStringValue(CharData.BLOCKS[13]), 2373: String.zeroBasedStringValue(CharData.BLOCKS[14]), 2374: String.zeroBasedStringValue(CharData.BLOCKS[15]), 2375: String.zeroBasedStringValue(CharData.BLOCKS[16])}; 2376: 2377: /** 2378: * Stores unicode attribute offset lookup table. Exploit package visibility 2379: * of String.value to avoid copying the array. 
2380: * @see CharData#DATA 2381: */ 2382: private static final char[][] data = 2383: new char[][]{ 2384: String.zeroBasedStringValue(CharData.DATA[0]), 2385: String.zeroBasedStringValue(CharData.DATA[1]), 2386: String.zeroBasedStringValue(CharData.DATA[2]), 2387: String.zeroBasedStringValue(CharData.DATA[3]), 2388: String.zeroBasedStringValue(CharData.DATA[4]), 2389: String.zeroBasedStringValue(CharData.DATA[5]), 2390: String.zeroBasedStringValue(CharData.DATA[6]), 2391: String.zeroBasedStringValue(CharData.DATA[7]), 2392: String.zeroBasedStringValue(CharData.DATA[8]), 2393: String.zeroBasedStringValue(CharData.DATA[9]), 2394: String.zeroBasedStringValue(CharData.DATA[10]), 2395: String.zeroBasedStringValue(CharData.DATA[11]), 2396: String.zeroBasedStringValue(CharData.DATA[12]), 2397: String.zeroBasedStringValue(CharData.DATA[13]), 2398: String.zeroBasedStringValue(CharData.DATA[14]), 2399: String.zeroBasedStringValue(CharData.DATA[15]), 2400: String.zeroBasedStringValue(CharData.DATA[16])}; 2401: 2402: /** 2403: * Stores unicode numeric value attribute table. Exploit package visibility 2404: * of String.value to avoid copying the array. 
2405: * @see CharData#NUM_VALUE 2406: */ 2407: private static final char[][] numValue = 2408: new char[][]{ 2409: String.zeroBasedStringValue(CharData.NUM_VALUE[0]), 2410: String.zeroBasedStringValue(CharData.NUM_VALUE[1]), 2411: String.zeroBasedStringValue(CharData.NUM_VALUE[2]), 2412: String.zeroBasedStringValue(CharData.NUM_VALUE[3]), 2413: String.zeroBasedStringValue(CharData.NUM_VALUE[4]), 2414: String.zeroBasedStringValue(CharData.NUM_VALUE[5]), 2415: String.zeroBasedStringValue(CharData.NUM_VALUE[6]), 2416: String.zeroBasedStringValue(CharData.NUM_VALUE[7]), 2417: String.zeroBasedStringValue(CharData.NUM_VALUE[8]), 2418: String.zeroBasedStringValue(CharData.NUM_VALUE[9]), 2419: String.zeroBasedStringValue(CharData.NUM_VALUE[10]), 2420: String.zeroBasedStringValue(CharData.NUM_VALUE[11]), 2421: String.zeroBasedStringValue(CharData.NUM_VALUE[12]), 2422: String.zeroBasedStringValue(CharData.NUM_VALUE[13]), 2423: String.zeroBasedStringValue(CharData.NUM_VALUE[14]), 2424: String.zeroBasedStringValue(CharData.NUM_VALUE[15]), 2425: String.zeroBasedStringValue(CharData.NUM_VALUE[16])}; 2426: 2427: /** 2428: * Stores unicode uppercase attribute table. Exploit package visibility 2429: * of String.value to avoid copying the array. 
2430: * @see CharData#UPPER 2431: */ /* NOTE(review): seventeen entries (0-16), presumably one per plane like data -- confirm against CharData. */ 2432: private static final char[][] upper = 2433: new char[][]{ 2434: String.zeroBasedStringValue(CharData.UPPER[0]), 2435: String.zeroBasedStringValue(CharData.UPPER[1]), 2436: String.zeroBasedStringValue(CharData.UPPER[2]), 2437: String.zeroBasedStringValue(CharData.UPPER[3]), 2438: String.zeroBasedStringValue(CharData.UPPER[4]), 2439: String.zeroBasedStringValue(CharData.UPPER[5]), 2440: String.zeroBasedStringValue(CharData.UPPER[6]), 2441: String.zeroBasedStringValue(CharData.UPPER[7]), 2442: String.zeroBasedStringValue(CharData.UPPER[8]), 2443: String.zeroBasedStringValue(CharData.UPPER[9]), 2444: String.zeroBasedStringValue(CharData.UPPER[10]), 2445: String.zeroBasedStringValue(CharData.UPPER[11]), 2446: String.zeroBasedStringValue(CharData.UPPER[12]), 2447: String.zeroBasedStringValue(CharData.UPPER[13]), 2448: String.zeroBasedStringValue(CharData.UPPER[14]), 2449: String.zeroBasedStringValue(CharData.UPPER[15]), 2450: String.zeroBasedStringValue(CharData.UPPER[16])}; 2451: 2452: /** 2453: * Stores the Unicode lowercase attribute table. Exploits package visibility 2454: * of String.value to avoid copying the array. 
2455: * @see CharData#LOWER 2456: */ /* NOTE(review): seventeen entries (0-16), presumably one per plane like data -- confirm against CharData. */ 2457: private static final char[][] lower = 2458: new char[][]{ 2459: String.zeroBasedStringValue(CharData.LOWER[0]), 2460: String.zeroBasedStringValue(CharData.LOWER[1]), 2461: String.zeroBasedStringValue(CharData.LOWER[2]), 2462: String.zeroBasedStringValue(CharData.LOWER[3]), 2463: String.zeroBasedStringValue(CharData.LOWER[4]), 2464: String.zeroBasedStringValue(CharData.LOWER[5]), 2465: String.zeroBasedStringValue(CharData.LOWER[6]), 2466: String.zeroBasedStringValue(CharData.LOWER[7]), 2467: String.zeroBasedStringValue(CharData.LOWER[8]), 2468: String.zeroBasedStringValue(CharData.LOWER[9]), 2469: String.zeroBasedStringValue(CharData.LOWER[10]), 2470: String.zeroBasedStringValue(CharData.LOWER[11]), 2471: String.zeroBasedStringValue(CharData.LOWER[12]), 2472: String.zeroBasedStringValue(CharData.LOWER[13]), 2473: String.zeroBasedStringValue(CharData.LOWER[14]), 2474: String.zeroBasedStringValue(CharData.LOWER[15]), 2475: String.zeroBasedStringValue(CharData.LOWER[16])}; 2476: 2477: /** 2478: * Stores the Unicode direction attribute table. Exploits package visibility 2479: * of String.value to avoid copying the array. 2480: * @see CharData#DIRECTION 2481: */ 2482: // Package visible for use by String. 
2483: static final char[][] direction = 2484: new char[][]{ 2485: String.zeroBasedStringValue(CharData.DIRECTION[0]), 2486: String.zeroBasedStringValue(CharData.DIRECTION[1]), 2487: String.zeroBasedStringValue(CharData.DIRECTION[2]), 2488: String.zeroBasedStringValue(CharData.DIRECTION[3]), 2489: String.zeroBasedStringValue(CharData.DIRECTION[4]), 2490: String.zeroBasedStringValue(CharData.DIRECTION[5]), 2491: String.zeroBasedStringValue(CharData.DIRECTION[6]), 2492: String.zeroBasedStringValue(CharData.DIRECTION[7]), 2493: String.zeroBasedStringValue(CharData.DIRECTION[8]), 2494: String.zeroBasedStringValue(CharData.DIRECTION[9]), 2495: String.zeroBasedStringValue(CharData.DIRECTION[10]), 2496: String.zeroBasedStringValue(CharData.DIRECTION[11]), 2497: String.zeroBasedStringValue(CharData.DIRECTION[12]), 2498: String.zeroBasedStringValue(CharData.DIRECTION[13]), 2499: String.zeroBasedStringValue(CharData.DIRECTION[14]), 2500: String.zeroBasedStringValue(CharData.DIRECTION[15]), 2501: String.zeroBasedStringValue(CharData.DIRECTION[16])}; 2502: 2503: /** 2504: * Stores the Unicode titlecase table. Exploits package visibility of 2505: * String.value to avoid copying the array. 2506: * @see CharData#TITLE 2507: */ 2508: private static final char[] title = String.zeroBasedStringValue(CharData.TITLE); 2509: 2510: /** 2511: * Mask for grabbing the type (the low five bits, 0x1F) out of the contents of data. 2512: * @see CharData#DATA 2513: */ 2514: private static final int TYPE_MASK = 0x1F; 2515: 2516: /** 2517: * Mask for grabbing the non-breaking space flag (bit 5, 0x20) out of the contents of 2518: * data. 2519: * @see CharData#DATA 2520: */ 2521: private static final int NO_BREAK_MASK = 0x20; 2522: 2523: /** 2524: * Mask for grabbing the mirrored directionality flag (bit 6, 0x40) out of the contents 2525: * of data. 2526: * @see CharData#DATA 2527: */ 2528: private static final int MIRROR_MASK = 0x40; 2529: 2530: /** 2531: * Min value for supplementary code point. 
2532: * 2533: * @since 1.5 2534: */ 2535: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 2536: 2537: /** 2538: * Min value for code point. 2539: * 2540: * @since 1.5 2541: */ 2542: public static final int MIN_CODE_POINT = 0; 2543: 2544: 2545: /** 2546: * Max value for code point. 2547: * 2548: * @since 1.5 2549: */ 2550: public static final int MAX_CODE_POINT = 0x010ffff; 2551: 2552: 2553: /** 2554: * Minimum high surrogate code in UTF-16 encoding. 2555: * 2556: * @since 1.5 2557: */ 2558: public static final char MIN_HIGH_SURROGATE = '\ud800'; 2559: 2560: /** 2561: * Maximum high surrogate code in UTF-16 encoding. 2562: * 2563: * @since 1.5 2564: */ 2565: public static final char MAX_HIGH_SURROGATE = '\udbff'; 2566: 2567: /** 2568: * Minimum low surrogate code in UTF-16 encoding. 2569: * 2570: * @since 1.5 2571: */ 2572: public static final char MIN_LOW_SURROGATE = '\udc00'; 2573: 2574: /** 2575: * Maximum low surrogate code in UTF-16 encoding. 2576: * 2577: * @since 1.5 2578: */ 2579: public static final char MAX_LOW_SURROGATE = '\udfff'; 2580: 2581: /** 2582: * Minimum surrogate code in UTF-16 encoding (same as MIN_HIGH_SURROGATE). 2583: * 2584: * @since 1.5 2585: */ 2586: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 2587: 2588: /** 2589: * Maximum surrogate code in UTF-16 encoding (same as MAX_LOW_SURROGATE). 2590: * 2591: * @since 1.5 2592: */ 2593: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 2594: 2595: /** 2596: * Grabs an attribute offset from the Unicode attribute database. The lower 2597: * 5 bits are the character type, the next 2 bits are flags, and the top 2598: * 9 bits are the offset into the attribute tables. 
2599: * 2600: * @param codePoint the character to look up 2601: * @return the character's attribute offset and type 2602: * @see #TYPE_MASK 2603: * @see #NO_BREAK_MASK 2604: * @see #MIRROR_MASK 2605: * @see CharData#DATA 2606: * @see CharData#SHIFT 2607: */ 2608: static char readCodePoint(int codePoint) 2609: { 2610: int plane = codePoint >>> 16; 2611: char offset = (char) (codePoint & 0xffff); 2612: return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)]; 2613: } 2614: 2615: /** 2616: * Wraps up a character. 2617: * 2618: * @param value the character to wrap 2619: */ 2620: public Character(char value) 2621: { 2622: this.value = value; 2623: } 2624: 2625: /** 2626: * Returns the character which has been wrapped by this class. 2627: * 2628: * @return the character wrapped 2629: */ 2630: public char charValue() 2631: { 2632: return value; 2633: } 2634: 2635: /** 2636: * Returns the numerical value (unsigned) of the wrapped character. 2637: * Range of returned values: 0x0000-0xFFFF. 2638: * 2639: * @return the value of the wrapped character 2640: */ 2641: public int hashCode() 2642: { 2643: return value; 2644: } 2645: 2646: /** 2647: * Determines if an object is equal to this object. This is only true for 2648: * another Character object wrapping the same value. 2649: * 2650: * @param o object to compare 2651: * @return true if o is a Character with the same value 2652: */ 2653: public boolean equals(Object o) 2654: { 2655: return o instanceof Character && value == ((Character) o).value; 2656: } 2657: 2658: /** 2659: * Converts the wrapped character into a String. 2660: * 2661: * @return a String containing one character -- the wrapped character 2662: * of this instance 2663: */ 2664: public String toString() 2665: { 2666: // Package constructor avoids an array copy. 2667: return new String(new char[] { value }, 0, 1, true); 2668: } 2669: 2670: /** 2671: * Returns a String of length 1 representing the specified character. 
2672: * 2673: * @param ch the character to convert 2674: * @return a String containing the character 2675: * @since 1.4 2676: */ 2677: public static String toString(char ch) 2678: { 2679: // Package constructor avoids an array copy. 2680: return new String(new char[] { ch }, 0, 1, true); 2681: } 2682: 2683: /** 2684: * Determines if a character is a Unicode lowercase letter. For example, 2685: * <code>'a'</code> is lowercase. Returns true if getType() returns 2686: * LOWERCASE_LETTER. 2687: * <br> 2688: * lowercase = [Ll] 2689: * 2690: * @param ch character to test 2691: * @return true if ch is a Unicode lowercase letter, else false 2692: * @see #isUpperCase(char) 2693: * @see #isTitleCase(char) 2694: * @see #toLowerCase(char) 2695: * @see #getType(char) 2696: */ 2697: public static boolean isLowerCase(char ch) 2698: { 2699: return isLowerCase((int)ch); 2700: } 2701: 2702: /** 2703: * Determines if a character is a Unicode lowercase letter. For example, 2704: * <code>'a'</code> is lowercase. Returns true if getType() returns 2705: * LOWERCASE_LETTER. 2706: * <br> 2707: * lowercase = [Ll] 2708: * 2709: * @param codePoint character to test 2710: * @return true if ch is a Unicode lowercase letter, else false 2711: * @see #isUpperCase(char) 2712: * @see #isTitleCase(char) 2713: * @see #toLowerCase(char) 2714: * @see #getType(char) 2715: * 2716: * @since 1.5 2717: */ 2718: public static boolean isLowerCase(int codePoint) 2719: { 2720: return getType(codePoint) == LOWERCASE_LETTER; 2721: } 2722: 2723: /** 2724: * Determines if a character is a Unicode uppercase letter. For example, 2725: * <code>'A'</code> is uppercase. Returns true if getType() returns 2726: * UPPERCASE_LETTER. 
2727: * <br> 2728: * uppercase = [Lu] 2729: * 2730: * @param ch character to test 2731: * @return true if ch is a Unicode uppercase letter, else false 2732: * @see #isLowerCase(char) 2733: * @see #isTitleCase(char) 2734: * @see #toUpperCase(char) 2735: * @see #getType(char) 2736: */ 2737: public static boolean isUpperCase(char ch) 2738: { 2739: return isUpperCase((int)ch); 2740: } 2741: 2742: /** 2743: * Determines if a character is a Unicode uppercase letter. For example, 2744: * <code>'A'</code> is uppercase. Returns true if getType() returns 2745: * UPPERCASE_LETTER. 2746: * <br> 2747: * uppercase = [Lu] 2748: * 2749: * @param codePoint character to test 2750: * @return true if ch is a Unicode uppercase letter, else false 2751: * @see #isLowerCase(char) 2752: * @see #isTitleCase(char) 2753: * @see #toUpperCase(char) 2754: * @see #getType(char) 2755: * 2756: * @since 1.5 2757: */ 2758: public static boolean isUpperCase(int codePoint) 2759: { 2760: return getType(codePoint) == UPPERCASE_LETTER; 2761: } 2762: 2763: /** 2764: * Determines if a character is a Unicode titlecase letter. For example, 2765: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2766: * True if getType() returns TITLECASE_LETTER. 2767: * <br> 2768: * titlecase = [Lt] 2769: * 2770: * @param ch character to test 2771: * @return true if ch is a Unicode titlecase letter, else false 2772: * @see #isLowerCase(char) 2773: * @see #isUpperCase(char) 2774: * @see #toTitleCase(char) 2775: * @see #getType(char) 2776: */ 2777: public static boolean isTitleCase(char ch) 2778: { 2779: return isTitleCase((int)ch); 2780: } 2781: 2782: /** 2783: * Determines if a character is a Unicode titlecase letter. For example, 2784: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2785: * True if getType() returns TITLECASE_LETTER. 
2786: * <br> 2787: * titlecase = [Lt] 2788: * 2789: * @param codePoint character to test 2790: * @return true if ch is a Unicode titlecase letter, else false 2791: * @see #isLowerCase(char) 2792: * @see #isUpperCase(char) 2793: * @see #toTitleCase(char) 2794: * @see #getType(char) 2795: * 2796: * @since 1.5 2797: */ 2798: public static boolean isTitleCase(int codePoint) 2799: { 2800: return getType(codePoint) == TITLECASE_LETTER; 2801: } 2802: 2803: 2804: /** 2805: * Determines if a character is a Unicode decimal digit. For example, 2806: * <code>'0'</code> is a digit. A character is a Unicode digit if 2807: * getType() returns DECIMAL_DIGIT_NUMBER. 2808: * <br> 2809: * Unicode decimal digit = [Nd] 2810: * 2811: * @param ch character to test 2812: * @return true if ch is a Unicode decimal digit, else false 2813: * @see #digit(char, int) 2814: * @see #forDigit(int, int) 2815: * @see #getType(char) 2816: */ 2817: public static boolean isDigit(char ch) 2818: { 2819: return isDigit((int)ch); 2820: } 2821: 2822: /** 2823: * Determines if a character is a Unicode decimal digit. For example, 2824: * <code>'0'</code> is a digit. A character is a Unicode digit if 2825: * getType() returns DECIMAL_DIGIT_NUMBER. 2826: * <br> 2827: * Unicode decimal digit = [Nd] 2828: * 2829: * @param codePoint character to test 2830: * @return true if ch is a Unicode decimal digit, else false 2831: * @see #digit(char, int) 2832: * @see #forDigit(int, int) 2833: * @see #getType(char) 2834: * 2835: * @since 1.5 2836: */ 2837: 2838: public static boolean isDigit(int codePoint) 2839: { 2840: return getType(codePoint) == DECIMAL_DIGIT_NUMBER; 2841: } 2842: 2843: /** 2844: * Determines if a character is part of the Unicode Standard. This is an 2845: * evolving standard, but covers every character in the data file. 
2846: * <br> 2847: * defined = not [Cn] 2848: * 2849: * @param ch character to test 2850: * @return true if ch is a Unicode character, else false 2851: * @see #isDigit(char) 2852: * @see #isLetter(char) 2853: * @see #isLetterOrDigit(char) 2854: * @see #isLowerCase(char) 2855: * @see #isTitleCase(char) 2856: * @see #isUpperCase(char) 2857: */ 2858: public static boolean isDefined(char ch) 2859: { 2860: return isDefined((int)ch); 2861: } 2862: 2863: /** 2864: * Determines if a character is part of the Unicode Standard. This is an 2865: * evolving standard, but covers every character in the data file. 2866: * <br> 2867: * defined = not [Cn] 2868: * 2869: * @param codePoint character to test 2870: * @return true if ch is a Unicode character, else false 2871: * @see #isDigit(char) 2872: * @see #isLetter(char) 2873: * @see #isLetterOrDigit(char) 2874: * @see #isLowerCase(char) 2875: * @see #isTitleCase(char) 2876: * @see #isUpperCase(char) 2877: * 2878: * @since 1.5 2879: */ 2880: public static boolean isDefined(int codePoint) 2881: { 2882: return getType(codePoint) != UNASSIGNED; 2883: } 2884: 2885: /** 2886: * Determines if a character is a Unicode letter. Not all letters have case, 2887: * so this may return true when isLowerCase and isUpperCase return false. 2888: * A character is a Unicode letter if getType() returns one of 2889: * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2890: * or OTHER_LETTER. 
2891: * <br> 2892: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2893: * 2894: * @param ch character to test 2895: * @return true if ch is a Unicode letter, else false 2896: * @see #isDigit(char) 2897: * @see #isJavaIdentifierStart(char) 2898: * @see #isJavaLetter(char) 2899: * @see #isJavaLetterOrDigit(char) 2900: * @see #isLetterOrDigit(char) 2901: * @see #isLowerCase(char) 2902: * @see #isTitleCase(char) 2903: * @see #isUnicodeIdentifierStart(char) 2904: * @see #isUpperCase(char) 2905: */ 2906: public static boolean isLetter(char ch) 2907: { 2908: return isLetter((int)ch); 2909: } 2910: 2911: /** 2912: * Determines if a character is a Unicode letter. Not all letters have case, 2913: * so this may return true when isLowerCase and isUpperCase return false. 2914: * A character is a Unicode letter if getType() returns one of 2915: * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2916: * or OTHER_LETTER. 2917: * <br> 2918: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2919: * 2920: * @param codePoint character to test 2921: * @return true if ch is a Unicode letter, else false 2922: * @see #isDigit(char) 2923: * @see #isJavaIdentifierStart(char) 2924: * @see #isJavaLetter(char) 2925: * @see #isJavaLetterOrDigit(char) 2926: * @see #isLetterOrDigit(char) 2927: * @see #isLowerCase(char) 2928: * @see #isTitleCase(char) 2929: * @see #isUnicodeIdentifierStart(char) 2930: * @see #isUpperCase(char) 2931: * 2932: * @since 1.5 2933: */ 2934: public static boolean isLetter(int codePoint) 2935: { 2936: return ((1 << getType(codePoint)) 2937: & ((1 << UPPERCASE_LETTER) 2938: | (1 << LOWERCASE_LETTER) 2939: | (1 << TITLECASE_LETTER) 2940: | (1 << MODIFIER_LETTER) 2941: | (1 << OTHER_LETTER))) != 0; 2942: } 2943: /** 2944: * Returns the index into the given CharSequence that is offset 2945: * <code>codePointOffset</code> code points from <code>index</code>. 
2946: * @param seq the CharSequence 2947: * @param index the start position in the CharSequence 2948: * @param codePointOffset the number of code points offset from the start 2949: * position 2950: * @return the index into the CharSequence that is codePointOffset code 2951: * points offset from index 2952: * 2953: * @throws NullPointerException if seq is null 2954: * @throws IndexOutOfBoundsException if index is negative or greater than the 2955: * length of the sequence. 2956: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 2957: * subsequence from index to the end of seq has fewer than codePointOffset 2958: * code points 2959: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 2960: * subsequence from the start of seq to index has fewer than 2961: * (-codePointOffset) code points 2962: * @since 1.5 2963: */ 2964: public static int offsetByCodePoints(CharSequence seq, 2965: int index, 2966: int codePointOffset) 2967: { 2968: int len = seq.length(); 2969: if (index < 0 || index > len) 2970: throw new IndexOutOfBoundsException(); 2971: 2972: int numToGo = codePointOffset; 2973: int offset = index; 2974: int adjust = 1; 2975: if (numToGo >= 0) 2976: { 2977: for (; numToGo > 0; offset++) 2978: { 2979: numToGo--; 2980: if (Character.isHighSurrogate(seq.charAt(offset)) 2981: && (offset + 1) < len 2982: && Character.isLowSurrogate(seq.charAt(offset + 1))) 2983: offset++; 2984: } 2985: return offset; 2986: } 2987: else 2988: { 2989: numToGo *= -1; 2990: for (; numToGo > 0;) 2991: { 2992: numToGo--; 2993: offset--; 2994: if (Character.isLowSurrogate(seq.charAt(offset)) 2995: && (offset - 1) >= 0 2996: && Character.isHighSurrogate(seq.charAt(offset - 1))) 2997: offset--; 2998: } 2999: return offset; 3000: } 3001: } 3002: 3003: /** 3004: * Returns the index into the given char subarray that is offset 3005: * <code>codePointOffset</code> code points from <code>index</code>. 
3006: * @param a the char array 3007: * @param start the start index of the subarray 3008: * @param count the length of the subarray 3009: * @param index the index to be offset 3010: * @param codePointOffset the number of code points offset from <code>index 3011: * </code> 3012: * @return the index into the char array 3013: * 3014: * @throws NullPointerException if a is null 3015: * @throws IndexOutOfBoundsException if start or count is negative or if 3016: * start + count is greater than the length of the array 3017: * @throws IndexOutOfBoundsException if index is less than start or larger 3018: * than start + count 3019: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 3020: * subarray from index to start + count - 1 has fewer than codePointOffset 3021: * code points. 3022: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 3023: * subarray from start to index - 1 has fewer than (-codePointOffset) code 3024: * points 3025: * 3026: * @since 1.5 3027: */ 3028: public static int offsetByCodePoints(char[] a, 3029: int start, 3030: int count, 3031: int index, 3032: int codePointOffset) 3033: { 3034: int len = a.length; 3035: int end = start + count; 3036: if (start < 0 || count < 0 || end > len || index < start || index > end) 3037: throw new IndexOutOfBoundsException(); 3038: 3039: int numToGo = codePointOffset; 3040: int offset = index; 3041: int adjust = 1; 3042: if (numToGo >= 0) 3043: { 3044: for (; numToGo > 0; offset++) 3045: { 3046: numToGo--; 3047: if (Character.isHighSurrogate(a[offset]) 3048: && (offset + 1) < len 3049: && Character.isLowSurrogate(a[offset + 1])) 3050: offset++; 3051: } 3052: return offset; 3053: } 3054: else 3055: { 3056: numToGo *= -1; 3057: for (; numToGo > 0;) 3058: { 3059: numToGo--; 3060: offset--; 3061: if (Character.isLowSurrogate(a[offset]) 3062: && (offset - 1) >= 0 3063: && Character.isHighSurrogate(a[offset - 1])) 3064: offset--; 3065: if (offset < start) 3066: throw new 
IndexOutOfBoundsException(); 3067: } 3068: return offset; 3069: } 3070: 3071: } 3072: 3073: /** 3074: * Returns the number of Unicode code points in the specified range of the 3075: * given CharSequence. The first char in the range is at position 3076: * beginIndex and the last one is at position endIndex - 1. Paired 3077: * surrogates (supplementary characters are represented by a pair of chars - 3078: * one from the high surrogates and one from the low surrogates) 3079: * count as just one code point. 3080: * @param seq the CharSequence to inspect 3081: * @param beginIndex the beginning of the range 3082: * @param endIndex the end of the range 3083: * @return the number of Unicode code points in the given range of the 3084: * sequence 3085: * @throws NullPointerException if seq is null 3086: * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is 3087: * larger than the length of seq, or if beginIndex is greater than endIndex. 3088: * @since 1.5 3089: */ 3090: public static int codePointCount(CharSequence seq, int beginIndex, 3091: int endIndex) 3092: { 3093: int len = seq.length(); 3094: if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) 3095: throw new IndexOutOfBoundsException(); 3096: 3097: int count = 0; 3098: for (int i = beginIndex; i < endIndex; i++) 3099: { 3100: count++; 3101: // If there is a pairing, count it only once. 3102: if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex 3103: && isLowSurrogate(seq.charAt(i + 1))) 3104: i ++; 3105: } 3106: return count; 3107: } 3108: 3109: /** 3110: * Returns the number of Unicode code points in the specified range of the 3111: * given char array. The first char in the range is at position 3112: * offset and the length of the range is count. Paired surrogates 3113: * (supplementary characters are represented by a pair of chars - 3114: * one from the high surrogates and one from the low surrogates) 3115: * count as just one code point. 
3116: * @param a the char array to inspect 3117: * @param offset the beginning of the range 3118: * @param count the length of the range 3119: * @return the number of Unicode code points in the given range of the 3120: * array 3121: * @throws NullPointerException if a is null 3122: * @throws IndexOutOfBoundsException if offset or count is negative or if 3123: * offset + countendIndex is larger than the length of a. 3124: * @since 1.5 3125: */ 3126: public static int codePointCount(char[] a, int offset, 3127: int count) 3128: { 3129: int len = a.length; 3130: int end = offset + count; 3131: if (offset < 0 || count < 0 || end > len) 3132: throw new IndexOutOfBoundsException(); 3133: 3134: int counter = 0; 3135: for (int i = offset; i < end; i++) 3136: { 3137: counter++; 3138: // If there is a pairing, count it only once. 3139: if (isHighSurrogate(a[i]) && (i + 1) < end 3140: && isLowSurrogate(a[i + 1])) 3141: i ++; 3142: } 3143: return counter; 3144: } 3145: 3146: /** 3147: * Determines if a character is a Unicode letter or a Unicode digit. This 3148: * is the combination of isLetter and isDigit. 3149: * <br> 3150: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3151: * 3152: * @param ch character to test 3153: * @return true if ch is a Unicode letter or a Unicode digit, else false 3154: * @see #isDigit(char) 3155: * @see #isJavaIdentifierPart(char) 3156: * @see #isJavaLetter(char) 3157: * @see #isJavaLetterOrDigit(char) 3158: * @see #isLetter(char) 3159: * @see #isUnicodeIdentifierPart(char) 3160: */ 3161: public static boolean isLetterOrDigit(char ch) 3162: { 3163: return isLetterOrDigit((int)ch); 3164: } 3165: 3166: /** 3167: * Determines if a character is a Unicode letter or a Unicode digit. This 3168: * is the combination of isLetter and isDigit. 
3169: * <br> 3170: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3171: * 3172: * @param codePoint character to test 3173: * @return true if ch is a Unicode letter or a Unicode digit, else false 3174: * @see #isDigit(char) 3175: * @see #isJavaIdentifierPart(char) 3176: * @see #isJavaLetter(char) 3177: * @see #isJavaLetterOrDigit(char) 3178: * @see #isLetter(char) 3179: * @see #isUnicodeIdentifierPart(char) 3180: * 3181: * @since 1.5 3182: */ 3183: public static boolean isLetterOrDigit(int codePoint) 3184: { 3185: return ((1 << getType(codePoint)) 3186: & ((1 << UPPERCASE_LETTER) 3187: | (1 << LOWERCASE_LETTER) 3188: | (1 << TITLECASE_LETTER) 3189: | (1 << MODIFIER_LETTER) 3190: | (1 << OTHER_LETTER) 3191: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 3192: } 3193: 3194: /** 3195: * Determines if a character can start a Java identifier. This is the 3196: * combination of isLetter, any character where getType returns 3197: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3198: * (like '_'). 3199: * 3200: * @param ch character to test 3201: * @return true if ch can start a Java identifier, else false 3202: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 3203: * @see #isJavaLetterOrDigit(char) 3204: * @see #isJavaIdentifierStart(char) 3205: * @see #isJavaIdentifierPart(char) 3206: * @see #isLetter(char) 3207: * @see #isLetterOrDigit(char) 3208: * @see #isUnicodeIdentifierStart(char) 3209: */ 3210: public static boolean isJavaLetter(char ch) 3211: { 3212: return isJavaIdentifierStart(ch); 3213: } 3214: 3215: /** 3216: * Determines if a character can follow the first letter in 3217: * a Java identifier. This is the combination of isJavaLetter (isLetter, 3218: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3219: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3220: * or isIdentifierIgnorable. 
3221: * 3222: * @param ch character to test 3223: * @return true if ch can follow the first letter in a Java identifier 3224: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 3225: * @see #isJavaLetter(char) 3226: * @see #isJavaIdentifierStart(char) 3227: * @see #isJavaIdentifierPart(char) 3228: * @see #isLetter(char) 3229: * @see #isLetterOrDigit(char) 3230: * @see #isUnicodeIdentifierPart(char) 3231: * @see #isIdentifierIgnorable(char) 3232: */ 3233: public static boolean isJavaLetterOrDigit(char ch) 3234: { 3235: return isJavaIdentifierPart(ch); 3236: } 3237: 3238: /** 3239: * Determines if a character can start a Java identifier. This is the 3240: * combination of isLetter, any character where getType returns 3241: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3242: * (like '_'). 3243: * <br> 3244: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3245: * 3246: * @param ch character to test 3247: * @return true if ch can start a Java identifier, else false 3248: * @see #isJavaIdentifierPart(char) 3249: * @see #isLetter(char) 3250: * @see #isUnicodeIdentifierStart(char) 3251: * @since 1.1 3252: */ 3253: public static boolean isJavaIdentifierStart(char ch) 3254: { 3255: return isJavaIdentifierStart((int)ch); 3256: } 3257: 3258: /** 3259: * Determines if a character can start a Java identifier. This is the 3260: * combination of isLetter, any character where getType returns 3261: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3262: * (like '_'). 
3263: * <br> 3264: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3265: * 3266: * @param codePoint character to test 3267: * @return true if ch can start a Java identifier, else false 3268: * @see #isJavaIdentifierPart(char) 3269: * @see #isLetter(char) 3270: * @see #isUnicodeIdentifierStart(char) 3271: * @since 1.5 3272: */ 3273: public static boolean isJavaIdentifierStart(int codePoint) 3274: { 3275: return ((1 << getType(codePoint)) 3276: & ((1 << UPPERCASE_LETTER) 3277: | (1 << LOWERCASE_LETTER) 3278: | (1 << TITLECASE_LETTER) 3279: | (1 << MODIFIER_LETTER) 3280: | (1 << OTHER_LETTER) 3281: | (1 << LETTER_NUMBER) 3282: | (1 << CURRENCY_SYMBOL) 3283: | (1 << CONNECTOR_PUNCTUATION))) != 0; 3284: } 3285: 3286: /** 3287: * Determines if a character can follow the first letter in 3288: * a Java identifier. This is the combination of isJavaLetter (isLetter, 3289: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3290: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3291: * or isIdentifierIgnorable. 3292: * <br> 3293: * Java identifier extender = 3294: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3295: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3296: * 3297: * @param ch character to test 3298: * @return true if ch can follow the first letter in a Java identifier 3299: * @see #isIdentifierIgnorable(char) 3300: * @see #isJavaIdentifierStart(char) 3301: * @see #isLetterOrDigit(char) 3302: * @see #isUnicodeIdentifierPart(char) 3303: * @since 1.1 3304: */ 3305: public static boolean isJavaIdentifierPart(char ch) 3306: { 3307: return isJavaIdentifierPart((int)ch); 3308: } 3309: 3310: /** 3311: * Determines if a character can follow the first letter in 3312: * a Java identifier. 
This is the combination of isJavaLetter (isLetter, 3313: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3314: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3315: * or isIdentifierIgnorable. 3316: * <br> 3317: * Java identifier extender = 3318: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3319: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3320: * 3321: * @param codePoint character to test 3322: * @return true if ch can follow the first letter in a Java identifier 3323: * @see #isIdentifierIgnorable(char) 3324: * @see #isJavaIdentifierStart(char) 3325: * @see #isLetterOrDigit(char) 3326: * @see #isUnicodeIdentifierPart(char) 3327: * @since 1.5 3328: */ 3329: public static boolean isJavaIdentifierPart(int codePoint) 3330: { 3331: int category = getType(codePoint); 3332: return ((1 << category) 3333: & ((1 << UPPERCASE_LETTER) 3334: | (1 << LOWERCASE_LETTER) 3335: | (1 << TITLECASE_LETTER) 3336: | (1 << MODIFIER_LETTER) 3337: | (1 << OTHER_LETTER) 3338: | (1 << NON_SPACING_MARK) 3339: | (1 << COMBINING_SPACING_MARK) 3340: | (1 << DECIMAL_DIGIT_NUMBER) 3341: | (1 << LETTER_NUMBER) 3342: | (1 << CURRENCY_SYMBOL) 3343: | (1 << CONNECTOR_PUNCTUATION) 3344: | (1 << FORMAT))) != 0 3345: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3346: } 3347: 3348: /** 3349: * Determines if a character can start a Unicode identifier. Only 3350: * letters can start a Unicode identifier, but this includes characters 3351: * in LETTER_NUMBER. 
3352: * <br> 3353: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3354: * 3355: * @param ch character to test 3356: * @return true if ch can start a Unicode identifier, else false 3357: * @see #isJavaIdentifierStart(char) 3358: * @see #isLetter(char) 3359: * @see #isUnicodeIdentifierPart(char) 3360: * @since 1.1 3361: */ 3362: public static boolean isUnicodeIdentifierStart(char ch) 3363: { 3364: return isUnicodeIdentifierStart((int)ch); 3365: } 3366: 3367: /** 3368: * Determines if a character can start a Unicode identifier. Only 3369: * letters can start a Unicode identifier, but this includes characters 3370: * in LETTER_NUMBER. 3371: * <br> 3372: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3373: * 3374: * @param codePoint character to test 3375: * @return true if ch can start a Unicode identifier, else false 3376: * @see #isJavaIdentifierStart(char) 3377: * @see #isLetter(char) 3378: * @see #isUnicodeIdentifierPart(char) 3379: * @since 1.5 3380: */ 3381: public static boolean isUnicodeIdentifierStart(int codePoint) 3382: { 3383: return ((1 << getType(codePoint)) 3384: & ((1 << UPPERCASE_LETTER) 3385: | (1 << LOWERCASE_LETTER) 3386: | (1 << TITLECASE_LETTER) 3387: | (1 << MODIFIER_LETTER) 3388: | (1 << OTHER_LETTER) 3389: | (1 << LETTER_NUMBER))) != 0; 3390: } 3391: 3392: /** 3393: * Determines if a character can follow the first letter in 3394: * a Unicode identifier. This includes letters, connecting punctuation, 3395: * digits, numeric letters, combining marks, non-spacing marks, and 3396: * isIdentifierIgnorable. 
3397: * <br> 3398: * Unicode identifier extender = 3399: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3400: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3401: * 3402: * @param ch character to test 3403: * @return true if ch can follow the first letter in a Unicode identifier 3404: * @see #isIdentifierIgnorable(char) 3405: * @see #isJavaIdentifierPart(char) 3406: * @see #isLetterOrDigit(char) 3407: * @see #isUnicodeIdentifierStart(char) 3408: * @since 1.1 3409: */ 3410: public static boolean isUnicodeIdentifierPart(char ch) 3411: { 3412: return isUnicodeIdentifierPart((int)ch); 3413: } 3414: 3415: /** 3416: * Determines if a character can follow the first letter in 3417: * a Unicode identifier. This includes letters, connecting punctuation, 3418: * digits, numeric letters, combining marks, non-spacing marks, and 3419: * isIdentifierIgnorable. 3420: * <br> 3421: * Unicode identifier extender = 3422: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3423: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3424: * 3425: * @param codePoint character to test 3426: * @return true if ch can follow the first letter in a Unicode identifier 3427: * @see #isIdentifierIgnorable(char) 3428: * @see #isJavaIdentifierPart(char) 3429: * @see #isLetterOrDigit(char) 3430: * @see #isUnicodeIdentifierStart(char) 3431: * @since 1.5 3432: */ 3433: public static boolean isUnicodeIdentifierPart(int codePoint) 3434: { 3435: int category = getType(codePoint); 3436: return ((1 << category) 3437: & ((1 << UPPERCASE_LETTER) 3438: | (1 << LOWERCASE_LETTER) 3439: | (1 << TITLECASE_LETTER) 3440: | (1 << MODIFIER_LETTER) 3441: | (1 << OTHER_LETTER) 3442: | (1 << NON_SPACING_MARK) 3443: | (1 << COMBINING_SPACING_MARK) 3444: | (1 << DECIMAL_DIGIT_NUMBER) 3445: | (1 << LETTER_NUMBER) 3446: | (1 << CONNECTOR_PUNCTUATION) 3447: | (1 << FORMAT))) != 0 3448: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3449: } 3450: 3451: /** 3452: * Determines if a character is ignorable in 
a Unicode identifier. This 3453: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3454: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3455: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3456: * <code>'\u009F'</code>), and FORMAT characters. 3457: * <br> 3458: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3459: * |U+007F-U+009F 3460: * 3461: * @param ch character to test 3462: * @return true if ch is ignorable in a Unicode or Java identifier 3463: * @see #isJavaIdentifierPart(char) 3464: * @see #isUnicodeIdentifierPart(char) 3465: * @since 1.1 3466: */ 3467: public static boolean isIdentifierIgnorable(char ch) 3468: { 3469: return isIdentifierIgnorable((int)ch); 3470: } 3471: /** 3472: * Determines if a character is ignorable in a Unicode identifier. This 3473: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3474: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3475: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3476: * <code>'\u009F'</code>), and FORMAT characters. 3477: * <br> 3478: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3479: * |U+007F-U+009F 3480: * 3481: * @param codePoint character to test 3482: * @return true if ch is ignorable in a Unicode or Java identifier 3483: * @see #isJavaIdentifierPart(char) 3484: * @see #isUnicodeIdentifierPart(char) 3485: * @since 1.5 3486: */ 3487: public static boolean isIdentifierIgnorable(int codePoint) 3488: { 3489: if ((codePoint >= 0 && codePoint <= 0x0008) 3490: || (codePoint >= 0x000E && codePoint <= 0x001B) 3491: || (codePoint >= 0x007F && codePoint <= 0x009F) 3492: || getType(codePoint) == FORMAT) 3493: return true; 3494: return false; 3495: } 3496: 3497: /** 3498: * Converts a Unicode character into its lowercase equivalent mapping. 3499: * If a mapping does not exist, then the character passed is returned. 
3500: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3501: * 3502: * @param ch character to convert to lowercase 3503: * @return lowercase mapping of ch, or ch if lowercase mapping does 3504: * not exist 3505: * @see #isLowerCase(char) 3506: * @see #isUpperCase(char) 3507: * @see #toTitleCase(char) 3508: * @see #toUpperCase(char) 3509: */ 3510: public static char toLowerCase(char ch) 3511: { 3512: return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch); 3513: } 3514: 3515: /** 3516: * Converts a Unicode character into its lowercase equivalent mapping. 3517: * If a mapping does not exist, then the character passed is returned. 3518: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3519: * 3520: * @param codePoint character to convert to lowercase 3521: * @return lowercase mapping of ch, or ch if lowercase mapping does 3522: * not exist 3523: * @see #isLowerCase(char) 3524: * @see #isUpperCase(char) 3525: * @see #toTitleCase(char) 3526: * @see #toUpperCase(char) 3527: * 3528: * @since 1.5 3529: */ 3530: public static int toLowerCase(int codePoint) 3531: { 3532: // If the code point is unassigned or in one of the private use areas 3533: // then we delegate the call to the appropriate private static inner class. 3534: int plane = codePoint >>> 16; 3535: if (plane > 2 && plane < 14) 3536: return UnassignedCharacters.toLowerCase(codePoint); 3537: if (plane > 14) 3538: return PrivateUseCharacters.toLowerCase(codePoint); 3539: 3540: // The short value stored in lower[plane] is the signed difference between 3541: // codePoint and its lowercase conversion. 3542: return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3543: } 3544: 3545: /** 3546: * Converts a Unicode character into its uppercase equivalent mapping. 3547: * If a mapping does not exist, then the character passed is returned. 3548: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
3549: * 3550: * @param ch character to convert to uppercase 3551: * @return uppercase mapping of ch, or ch if uppercase mapping does 3552: * not exist 3553: * @see #isLowerCase(char) 3554: * @see #isUpperCase(char) 3555: * @see #toLowerCase(char) 3556: * @see #toTitleCase(char) 3557: */ 3558: public static char toUpperCase(char ch) 3559: { 3560: return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch); 3561: } 3562: 3563: /** 3564: * Converts a Unicode character into its uppercase equivalent mapping. 3565: * If a mapping does not exist, then the character passed is returned. 3566: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 3567: * 3568: * @param codePoint character to convert to uppercase 3569: * @return uppercase mapping of ch, or ch if uppercase mapping does 3570: * not exist 3571: * @see #isLowerCase(char) 3572: * @see #isUpperCase(char) 3573: * @see #toLowerCase(char) 3574: * @see #toTitleCase(char) 3575: * 3576: * @since 1.5 3577: */ 3578: public static int toUpperCase(int codePoint) 3579: { 3580: // If the code point is unassigned or in one of the private use areas 3581: // then we delegate the call to the appropriate private static inner class. 3582: int plane = codePoint >>> 16; 3583: if (plane > 2 && plane < 14) 3584: return UnassignedCharacters.toUpperCase(codePoint); 3585: if (plane > 14) 3586: return PrivateUseCharacters.toUpperCase(codePoint); 3587: 3588: // The short value stored in upper[plane] is the signed difference between 3589: // codePoint and its uppercase conversion. 3590: return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3591: } 3592: 3593: /** 3594: * Converts a Unicode character into its titlecase equivalent mapping. 3595: * If a mapping does not exist, then the character passed is returned. 3596: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 
3597: * 3598: * @param ch character to convert to titlecase 3599: * @return titlecase mapping of ch, or ch if titlecase mapping does 3600: * not exist 3601: * @see #isTitleCase(char) 3602: * @see #toLowerCase(char) 3603: * @see #toUpperCase(char) 3604: */ 3605: public static char toTitleCase(char ch) 3606: { 3607: // As title is short, it doesn't hurt to exhaustively iterate over it. 3608: for (int i = title.length - 2; i >= 0; i -= 2) 3609: if (title[i] == ch) 3610: return title[i + 1]; 3611: return toUpperCase(ch); 3612: } 3613: 3614: /** 3615: * Converts a Unicode character into its titlecase equivalent mapping. 3616: * If a mapping does not exist, then the character passed is returned. 3617: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 3618: * 3619: * @param codePoint character to convert to titlecase 3620: * @return titlecase mapping of ch, or ch if titlecase mapping does 3621: * not exist 3622: * @see #isTitleCase(char) 3623: * @see #toLowerCase(char) 3624: * @see #toUpperCase(char) 3625: * 3626: * @since 1.5 3627: */ 3628: public static int toTitleCase(int codePoint) 3629: { 3630: // As of Unicode 4.0.0 no characters outside of plane 0 have 3631: // titlecase mappings that are different from their uppercase 3632: // mapping. 3633: if (codePoint < 0x10000) 3634: return (int) toTitleCase((char)codePoint); 3635: return toUpperCase(codePoint); 3636: } 3637: 3638: /** 3639: * Converts a character into a digit of the specified radix. If the radix 3640: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3641: * exceeds the radix, or if ch is not a decimal digit or in the case 3642: * insensitive set of 'a'-'z', the result is -1. 
3643: * <br> 3644: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3645: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3646: * 3647: * @param ch character to convert into a digit 3648: * @param radix radix in which ch is a digit 3649: * @return digit which ch represents in radix, or -1 not a valid digit 3650: * @see #MIN_RADIX 3651: * @see #MAX_RADIX 3652: * @see #forDigit(int, int) 3653: * @see #isDigit(char) 3654: * @see #getNumericValue(char) 3655: */ 3656: public static int digit(char ch, int radix) 3657: { 3658: if (radix < MIN_RADIX || radix > MAX_RADIX) 3659: return -1; 3660: char attr = readCodePoint((int)ch); 3661: if (((1 << (attr & TYPE_MASK)) 3662: & ((1 << UPPERCASE_LETTER) 3663: | (1 << LOWERCASE_LETTER) 3664: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3665: { 3666: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3667: int digit = numValue[0][attr >> 7]; 3668: return (digit < radix) ? digit : -1; 3669: } 3670: return -1; 3671: } 3672: 3673: /** 3674: * Converts a character into a digit of the specified radix. If the radix 3675: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3676: * exceeds the radix, or if ch is not a decimal digit or in the case 3677: * insensitive set of 'a'-'z', the result is -1. 
3678: * <br> 3679: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3680: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3681: * 3682: * @param codePoint character to convert into a digit 3683: * @param radix radix in which ch is a digit 3684: * @return digit which ch represents in radix, or -1 not a valid digit 3685: * @see #MIN_RADIX 3686: * @see #MAX_RADIX 3687: * @see #forDigit(int, int) 3688: * @see #isDigit(char) 3689: * @see #getNumericValue(char) 3690: */ 3691: public static int digit(int codePoint, int radix) 3692: { 3693: if (radix < MIN_RADIX || radix > MAX_RADIX) 3694: return -1; 3695: 3696: // If the code point is unassigned or in one of the private use areas 3697: // then we delegate the call to the appropriate private static inner class. 3698: int plane = codePoint >>> 16; 3699: if (plane > 2 && plane < 14) 3700: return UnassignedCharacters.digit(codePoint, radix); 3701: if (plane > 14) 3702: return PrivateUseCharacters.digit(codePoint, radix); 3703: char attr = readCodePoint(codePoint); 3704: if (((1 << (attr & TYPE_MASK)) 3705: & ((1 << UPPERCASE_LETTER) 3706: | (1 << LOWERCASE_LETTER) 3707: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3708: { 3709: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3710: int digit = numValue[plane][attr >> 7]; 3711: 3712: // If digit is less than or equal to -3 then the numerical value was 3713: // too large to fit into numValue and is stored in CharData.LARGENUMS. 3714: if (digit <= -3) 3715: digit = CharData.LARGENUMS[-digit - 3]; 3716: return (digit < radix) ? digit : -1; 3717: } 3718: return -1; 3719: } 3720: 3721: /** 3722: * Returns the Unicode numeric value property of a character. For example, 3723: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 
3724: * 3725: * <p>This method also returns values for the letters A through Z, (not 3726: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3727: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3728: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3729: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3730: * <code>'\uFF5A'</code> (full width variants). 3731: * 3732: * <p>If the character lacks a numeric value property, -1 is returned. 3733: * If the character has a numeric value property which is not representable 3734: * as a nonnegative integer, such as a fraction, -2 is returned. 3735: * 3736: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3737: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3738: * 3739: * @param ch character from which the numeric value property will 3740: * be retrieved 3741: * @return the numeric value property of ch, or -1 if it does not exist, or 3742: * -2 if it is not representable as a nonnegative integer 3743: * @see #forDigit(int, int) 3744: * @see #digit(char, int) 3745: * @see #isDigit(char) 3746: * @since 1.1 3747: */ 3748: public static int getNumericValue(char ch) 3749: { 3750: // Treat numValue as signed. 3751: return (short) numValue[0][readCodePoint((int)ch) >> 7]; 3752: } 3753: 3754: /** 3755: * Returns the Unicode numeric value property of a character. For example, 3756: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 3757: * 3758: * <p>This method also returns values for the letters A through Z, (not 3759: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3760: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3761: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3762: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3763: * <code>'\uFF5A'</code> (full width variants). 3764: * 3765: * <p>If the character lacks a numeric value property, -1 is returned. 
3766: * If the character has a numeric value property which is not representable 3767: * as a nonnegative integer, such as a fraction, -2 is returned. 3768: * 3769: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3770: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3771: * 3772: * @param codePoint character from which the numeric value property will 3773: * be retrieved 3774: * @return the numeric value property of ch, or -1 if it does not exist, or 3775: * -2 if it is not representable as a nonnegative integer 3776: * @see #forDigit(int, int) 3777: * @see #digit(char, int) 3778: * @see #isDigit(char) 3779: * @since 1.5 3780: */ 3781: public static int getNumericValue(int codePoint) 3782: { 3783: // If the code point is unassigned or in one of the private use areas 3784: // then we delegate the call to the appropriate private static inner class. 3785: int plane = codePoint >>> 16; 3786: if (plane > 2 && plane < 14) 3787: return UnassignedCharacters.getNumericValue(codePoint); 3788: if (plane > 14) 3789: return PrivateUseCharacters.getNumericValue(codePoint); 3790: 3791: // If the value N found in numValue[plane] is less than or equal to -3 3792: // then the numeric value was too big to fit into 16 bits and is 3793: // stored in CharData.LARGENUMS at offset (-N - 3). 3794: short num = (short)numValue[plane][readCodePoint(codePoint) >> 7]; 3795: if (num <= -3) 3796: return CharData.LARGENUMS[-num - 3]; 3797: return num; 3798: } 3799: 3800: /** 3801: * Determines if a character is a ISO-LATIN-1 space. This is only the five 3802: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 3803: * <code>'\r'</code>, and <code>' '</code>. 
3804: * <br> 3805: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 3806: * 3807: * @param ch character to test 3808: * @return true if ch is a space, else false 3809: * @deprecated Replaced by {@link #isWhitespace(char)} 3810: * @see #isSpaceChar(char) 3811: * @see #isWhitespace(char) 3812: */ 3813: public static boolean isSpace(char ch) 3814: { 3815: // Performing the subtraction up front alleviates need to compare longs. 3816: return ch-- <= ' ' && ((1 << ch) 3817: & ((1 << (' ' - 1)) 3818: | (1 << ('\t' - 1)) 3819: | (1 << ('\n' - 1)) 3820: | (1 << ('\r' - 1)) 3821: | (1 << ('\f' - 1)))) != 0; 3822: } 3823: 3824: /** 3825: * Determines if a character is a Unicode space character. This includes 3826: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3827: * <br> 3828: * Unicode space = [Zs]|[Zp]|[Zl] 3829: * 3830: * @param ch character to test 3831: * @return true if ch is a Unicode space, else false 3832: * @see #isWhitespace(char) 3833: * @since 1.1 3834: */ 3835: public static boolean isSpaceChar(char ch) 3836: { 3837: return isSpaceChar((int)ch); 3838: } 3839: 3840: /** 3841: * Determines if a character is a Unicode space character. This includes 3842: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3843: * <br> 3844: * Unicode space = [Zs]|[Zp]|[Zl] 3845: * 3846: * @param codePoint character to test 3847: * @return true if ch is a Unicode space, else false 3848: * @see #isWhitespace(char) 3849: * @since 1.5 3850: */ 3851: public static boolean isSpaceChar(int codePoint) 3852: { 3853: return ((1 << getType(codePoint)) 3854: & ((1 << SPACE_SEPARATOR) 3855: | (1 << LINE_SEPARATOR) 3856: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3857: } 3858: 3859: /** 3860: * Determines if a character is Java whitespace. 
This includes Unicode 3861: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3862: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3863: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3864: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3865: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3866: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3867: * and <code>'\u001F'</code>. 3868: * <br> 3869: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3870: * 3871: * @param ch character to test 3872: * @return true if ch is Java whitespace, else false 3873: * @see #isSpaceChar(char) 3874: * @since 1.1 3875: */ 3876: public static boolean isWhitespace(char ch) 3877: { 3878: return isWhitespace((int) ch); 3879: } 3880: 3881: /** 3882: * Determines if a character is Java whitespace. This includes Unicode 3883: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3884: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3885: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3886: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3887: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3888: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3889: * and <code>'\u001F'</code>. 
3890: * <br> 3891: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3892: * 3893: * @param codePoint character to test 3894: * @return true if ch is Java whitespace, else false 3895: * @see #isSpaceChar(char) 3896: * @since 1.5 3897: */ 3898: public static boolean isWhitespace(int codePoint) 3899: { 3900: int plane = codePoint >>> 16; 3901: if (plane > 2 && plane < 14) 3902: return UnassignedCharacters.isWhiteSpace(codePoint); 3903: if (plane > 14) 3904: return PrivateUseCharacters.isWhiteSpace(codePoint); 3905: 3906: int attr = readCodePoint(codePoint); 3907: return ((((1 << (attr & TYPE_MASK)) 3908: & ((1 << SPACE_SEPARATOR) 3909: | (1 << LINE_SEPARATOR) 3910: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3911: && (attr & NO_BREAK_MASK) == 0) 3912: || (codePoint <= '\u001F' && ((1 << codePoint) 3913: & ((1 << '\t') 3914: | (1 << '\n') 3915: | (1 << '\u000B') 3916: | (1 << '\u000C') 3917: | (1 << '\r') 3918: | (1 << '\u001C') 3919: | (1 << '\u001D') 3920: | (1 << '\u001E') 3921: | (1 << '\u001F'))) != 0); 3922: } 3923: 3924: /** 3925: * Determines if a character has the ISO Control property. 3926: * <br> 3927: * ISO Control = [Cc] 3928: * 3929: * @param ch character to test 3930: * @return true if ch is an ISO Control character, else false 3931: * @see #isSpaceChar(char) 3932: * @see #isWhitespace(char) 3933: * @since 1.1 3934: */ 3935: public static boolean isISOControl(char ch) 3936: { 3937: return isISOControl((int)ch); 3938: } 3939: 3940: /** 3941: * Determines if the character is an ISO Control character. This is true 3942: * if the code point is in the range [0, 0x001F] or if it is in the range 3943: * [0x007F, 0x009F]. 
3944: * @param codePoint the character to check 3945: * @return true if the character is in one of the above ranges 3946: * 3947: * @since 1.5 3948: */ 3949: public static boolean isISOControl(int codePoint) 3950: { 3951: if ((codePoint >= 0 && codePoint <= 0x001F) 3952: || (codePoint >= 0x007F && codePoint <= 0x009F)) 3953: return true; 3954: return false; 3955: } 3956: 3957: /** 3958: * Returns the Unicode general category property of a character. 3959: * 3960: * @param ch character from which the general category property will 3961: * be retrieved 3962: * @return the character category property of ch as an integer 3963: * @see #UNASSIGNED 3964: * @see #UPPERCASE_LETTER 3965: * @see #LOWERCASE_LETTER 3966: * @see #TITLECASE_LETTER 3967: * @see #MODIFIER_LETTER 3968: * @see #OTHER_LETTER 3969: * @see #NON_SPACING_MARK 3970: * @see #ENCLOSING_MARK 3971: * @see #COMBINING_SPACING_MARK 3972: * @see #DECIMAL_DIGIT_NUMBER 3973: * @see #LETTER_NUMBER 3974: * @see #OTHER_NUMBER 3975: * @see #SPACE_SEPARATOR 3976: * @see #LINE_SEPARATOR 3977: * @see #PARAGRAPH_SEPARATOR 3978: * @see #CONTROL 3979: * @see #FORMAT 3980: * @see #PRIVATE_USE 3981: * @see #SURROGATE 3982: * @see #DASH_PUNCTUATION 3983: * @see #START_PUNCTUATION 3984: * @see #END_PUNCTUATION 3985: * @see #CONNECTOR_PUNCTUATION 3986: * @see #OTHER_PUNCTUATION 3987: * @see #MATH_SYMBOL 3988: * @see #CURRENCY_SYMBOL 3989: * @see #MODIFIER_SYMBOL 3990: * @see #INITIAL_QUOTE_PUNCTUATION 3991: * @see #FINAL_QUOTE_PUNCTUATION 3992: * @since 1.1 3993: */ 3994: public static int getType(char ch) 3995: { 3996: return getType((int)ch); 3997: } 3998: 3999: /** 4000: * Returns the Unicode general category property of a character. 
4001: * 4002: * @param codePoint character from which the general category property will 4003: * be retrieved 4004: * @return the character category property of ch as an integer 4005: * @see #UNASSIGNED 4006: * @see #UPPERCASE_LETTER 4007: * @see #LOWERCASE_LETTER 4008: * @see #TITLECASE_LETTER 4009: * @see #MODIFIER_LETTER 4010: * @see #OTHER_LETTER 4011: * @see #NON_SPACING_MARK 4012: * @see #ENCLOSING_MARK 4013: * @see #COMBINING_SPACING_MARK 4014: * @see #DECIMAL_DIGIT_NUMBER 4015: * @see #LETTER_NUMBER 4016: * @see #OTHER_NUMBER 4017: * @see #SPACE_SEPARATOR 4018: * @see #LINE_SEPARATOR 4019: * @see #PARAGRAPH_SEPARATOR 4020: * @see #CONTROL 4021: * @see #FORMAT 4022: * @see #PRIVATE_USE 4023: * @see #SURROGATE 4024: * @see #DASH_PUNCTUATION 4025: * @see #START_PUNCTUATION 4026: * @see #END_PUNCTUATION 4027: * @see #CONNECTOR_PUNCTUATION 4028: * @see #OTHER_PUNCTUATION 4029: * @see #MATH_SYMBOL 4030: * @see #CURRENCY_SYMBOL 4031: * @see #MODIFIER_SYMBOL 4032: * @see #INITIAL_QUOTE_PUNCTUATION 4033: * @see #FINAL_QUOTE_PUNCTUATION 4034: * 4035: * @since 1.5 4036: */ 4037: public static int getType(int codePoint) 4038: { 4039: // If the codePoint is unassigned or in one of the private use areas 4040: // then we delegate the call to the appropriate private static inner class. 4041: int plane = codePoint >>> 16; 4042: if (plane > 2 && plane < 14) 4043: return UnassignedCharacters.getType(codePoint); 4044: if (plane > 14) 4045: return PrivateUseCharacters.getType(codePoint); 4046: 4047: return readCodePoint(codePoint) & TYPE_MASK; 4048: } 4049: 4050: /** 4051: * Converts a digit into a character which represents that digit 4052: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 4053: * or the digit exceeds the radix, then the null character <code>'\0'</code> 4054: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 
4055: * <br> 4056: * return value boundary = U+0030-U+0039|U+0061-U+007A 4057: * 4058: * @param digit digit to be converted into a character 4059: * @param radix radix of digit 4060: * @return character representing digit in radix, or '\0' 4061: * @see #MIN_RADIX 4062: * @see #MAX_RADIX 4063: * @see #digit(char, int) 4064: */ 4065: public static char forDigit(int digit, int radix) 4066: { 4067: if (radix < MIN_RADIX || radix > MAX_RADIX 4068: || digit < 0 || digit >= radix) 4069: return '\0'; 4070: return Number.digits[digit]; 4071: } 4072: 4073: /** 4074: * Returns the Unicode directionality property of the character. This 4075: * is used in the visual ordering of text. 4076: * 4077: * @param ch the character to look up 4078: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4079: * @see #DIRECTIONALITY_UNDEFINED 4080: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4081: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4082: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4083: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4084: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4085: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4086: * @see #DIRECTIONALITY_ARABIC_NUMBER 4087: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4088: * @see #DIRECTIONALITY_NONSPACING_MARK 4089: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4090: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4091: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4092: * @see #DIRECTIONALITY_WHITESPACE 4093: * @see #DIRECTIONALITY_OTHER_NEUTRALS 4094: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4095: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4096: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4097: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4098: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4099: * @since 1.4 4100: */ 4101: public static byte getDirectionality(char ch) 4102: { 4103: // The result will correctly be signed. 
4104: return getDirectionality((int)ch); 4105: } 4106: 4107: /** 4108: * Returns the Unicode directionality property of the character. This 4109: * is used in the visual ordering of text. 4110: * 4111: * @param codePoint the character to look up 4112: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4113: * @see #DIRECTIONALITY_UNDEFINED 4114: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4115: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4116: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4117: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4118: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4119: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4120: * @see #DIRECTIONALITY_ARABIC_NUMBER 4121: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4122: * @see #DIRECTIONALITY_NONSPACING_MARK 4123: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4124: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4125: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4126: * @see #DIRECTIONALITY_WHITESPACE 4127: * @see #DIRECTIONALITY_OTHER_NEUTRALS 4128: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4129: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4130: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4131: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4132: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4133: * @since 1.5 4134: */ 4135: public static byte getDirectionality(int codePoint) 4136: { 4137: // If the code point is unassigned or in one of the private use areas 4138: // then we delegate the call to the appropriate private static inner class. 4139: int plane = codePoint >>> 16; 4140: if (plane > 2 && plane < 14) 4141: return UnassignedCharacters.getDirectionality(codePoint); 4142: if (plane > 14) 4143: return PrivateUseCharacters.getDirectionality(codePoint); 4144: 4145: // The result will correctly be signed. 4146: return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2); 4147: } 4148: 4149: /** 4150: * Determines whether the character is mirrored according to Unicode. 
For 4151: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4152: * left-to-right text, but ')' in right-to-left text. 4153: * 4154: * @param ch the character to look up 4155: * @return true if the character is mirrored 4156: * @since 1.4 4157: */ 4158: public static boolean isMirrored(char ch) 4159: { 4160: return (readCodePoint((int)ch) & MIRROR_MASK) != 0; 4161: } 4162: 4163: /** 4164: * Determines whether the character is mirrored according to Unicode. For 4165: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4166: * left-to-right text, but ')' in right-to-left text. 4167: * 4168: * @param codePoint the character to look up 4169: * @return true if the character is mirrored 4170: * @since 1.5 4171: */ 4172: public static boolean isMirrored(int codePoint) 4173: { 4174: // If the code point is unassigned or part of one of the private use areas 4175: // then we delegate the call to the appropriate private static inner class. 4176: int plane = codePoint >>> 16; 4177: if (plane > 2 && plane < 14) 4178: return UnassignedCharacters.isMirrored(codePoint); 4179: if (plane > 14) 4180: return PrivateUseCharacters.isMirrored(codePoint); 4181: 4182: return (readCodePoint(codePoint) & MIRROR_MASK) != 0; 4183: } 4184: 4185: /** 4186: * Compares another Character to this Character, numerically. 4187: * 4188: * @param anotherCharacter Character to compare with this Character 4189: * @return a negative integer if this Character is less than 4190: * anotherCharacter, zero if this Character is equal, and 4191: * a positive integer if this Character is greater 4192: * @throws NullPointerException if anotherCharacter is null 4193: * @since 1.2 4194: */ 4195: public int compareTo(Character anotherCharacter) 4196: { 4197: return value - anotherCharacter.value; 4198: } 4199: 4200: /** 4201: * Compares an object to this Character. 
Assuming the object is a 4202: * Character object, this method performs the same comparison as 4203: * compareTo(Character). 4204: * 4205: * @param o object to compare 4206: * @return the comparison value 4207: * @throws ClassCastException if o is not a Character object 4208: * @throws NullPointerException if o is null 4209: * @see #compareTo(Character) 4210: * @since 1.2 4211: */ 4212: public int compareTo(Object o) 4213: { 4214: return compareTo((Character) o); 4215: } 4216: 4217: /** 4218: * Returns an <code>Character</code> object wrapping the value. 4219: * In contrast to the <code>Character</code> constructor, this method 4220: * will cache some values. It is used by boxing conversion. 4221: * 4222: * @param val the value to wrap 4223: * @return the <code>Character</code> 4224: * 4225: * @since 1.5 4226: */ 4227: public static Character valueOf(char val) 4228: { 4229: if (val > MAX_CACHE) 4230: return new Character(val); 4231: synchronized (charCache) 4232: { 4233: if (charCache[val - MIN_VALUE] == null) 4234: charCache[val - MIN_VALUE] = new Character(val); 4235: return charCache[val - MIN_VALUE]; 4236: } 4237: } 4238: 4239: /** 4240: * Reverse the bytes in val. 4241: * @since 1.5 4242: */ 4243: public static char reverseBytes(char val) 4244: { 4245: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 4246: } 4247: 4248: /** 4249: * Converts a unicode code point to a UTF-16 representation of that 4250: * code point. 
4251: * 4252: * @param codePoint the unicode code point 4253: * 4254: * @return the UTF-16 representation of that code point 4255: * 4256: * @throws IllegalArgumentException if the code point is not a valid 4257: * unicode code point 4258: * 4259: * @since 1.5 4260: */ 4261: public static char[] toChars(int codePoint) 4262: { 4263: if (!isValidCodePoint(codePoint)) 4264: throw new IllegalArgumentException("Illegal Unicode code point : " 4265: + codePoint); 4266: char[] result = new char[charCount(codePoint)]; 4267: int ignore = toChars(codePoint, result, 0); 4268: return result; 4269: } 4270: 4271: /** 4272: * Converts a unicode code point to its UTF-16 representation. 4273: * 4274: * @param codePoint the unicode code point 4275: * @param dst the target char array 4276: * @param dstIndex the start index for the target 4277: * 4278: * @return number of characters written to <code>dst</code> 4279: * 4280: * @throws IllegalArgumentException if <code>codePoint</code> is not a 4281: * valid unicode code point 4282: * @throws NullPointerException if <code>dst</code> is <code>null</code> 4283: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 4284: * in <code>dst</code> or if the UTF-16 representation does not 4285: * fit into <code>dst</code> 4286: * 4287: * @since 1.5 4288: */ 4289: public static int toChars(int codePoint, char[] dst, int dstIndex) 4290: { 4291: if (!isValidCodePoint(codePoint)) 4292: { 4293: throw new IllegalArgumentException("not a valid code point: " 4294: + codePoint); 4295: } 4296: 4297: int result; 4298: if (isSupplementaryCodePoint(codePoint)) 4299: { 4300: // Write second char first to cause IndexOutOfBoundsException 4301: // immediately. 
4302: final int cp2 = codePoint - 0x10000; 4303: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 4304: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 4305: result = 2; 4306: } 4307: else 4308: { 4309: dst[dstIndex] = (char) codePoint; 4310: result = 1; 4311: } 4312: return result; 4313: } 4314: 4315: /** 4316: * Return number of 16-bit characters required to represent the given 4317: * code point. 4318: * 4319: * @param codePoint a unicode code point 4320: * 4321: * @return 2 if codePoint >= 0x10000, 1 otherwise. 4322: * 4323: * @since 1.5 4324: */ 4325: public static int charCount(int codePoint) 4326: { 4327: return 4328: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 4329: ? 2 4330: : 1; 4331: } 4332: 4333: /** 4334: * Determines whether the specified code point is 4335: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 4336: * supplementary character range. 4337: * 4338: * @param codePoint a Unicode code point 4339: * 4340: * @return <code>true</code> if code point is in supplementary range 4341: * 4342: * @since 1.5 4343: */ 4344: public static boolean isSupplementaryCodePoint(int codePoint) 4345: { 4346: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 4347: && codePoint <= MAX_CODE_POINT; 4348: } 4349: 4350: /** 4351: * Determines whether the specified code point is 4352: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 4353: * 4354: * @param codePoint a Unicode code point 4355: * 4356: * @return <code>true</code> if code point is valid 4357: * 4358: * @since 1.5 4359: */ 4360: public static boolean isValidCodePoint(int codePoint) 4361: { 4362: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 4363: } 4364: 4365: /** 4366: * Return true if the given character is a high surrogate. 
4367: * @param ch the character 4368: * @return true if the character is a high surrogate character 4369: * 4370: * @since 1.5 4371: */ 4372: public static boolean isHighSurrogate(char ch) 4373: { 4374: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 4375: } 4376: 4377: /** 4378: * Return true if the given character is a low surrogate. 4379: * @param ch the character 4380: * @return true if the character is a low surrogate character 4381: * 4382: * @since 1.5 4383: */ 4384: public static boolean isLowSurrogate(char ch) 4385: { 4386: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 4387: } 4388: 4389: /** 4390: * Return true if the given characters compose a surrogate pair. 4391: * This is true if the first character is a high surrogate and the 4392: * second character is a low surrogate. 4393: * @param ch1 the first character 4394: * @param ch2 the first character 4395: * @return true if the characters compose a surrogate pair 4396: * 4397: * @since 1.5 4398: */ 4399: public static boolean isSurrogatePair(char ch1, char ch2) 4400: { 4401: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 4402: } 4403: 4404: /** 4405: * Given a valid surrogate pair, this returns the corresponding 4406: * code point. 4407: * @param high the high character of the pair 4408: * @param low the low character of the pair 4409: * @return the corresponding code point 4410: * 4411: * @since 1.5 4412: */ 4413: public static int toCodePoint(char high, char low) 4414: { 4415: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 4416: (low - MIN_LOW_SURROGATE) + 0x10000; 4417: } 4418: 4419: /** 4420: * Get the code point at the specified index in the CharSequence. 4421: * This is like CharSequence#charAt(int), but if the character is 4422: * the start of a surrogate pair, and there is a following 4423: * character, and this character completes the pair, then the 4424: * corresponding supplementary code point is returned. 
Otherwise, 4425: * the character at the index is returned. 4426: * 4427: * @param sequence the CharSequence 4428: * @param index the index of the codepoint to get, starting at 0 4429: * @return the codepoint at the specified index 4430: * @throws IndexOutOfBoundsException if index is negative or >= length() 4431: * @since 1.5 4432: */ 4433: public static int codePointAt(CharSequence sequence, int index) 4434: { 4435: int len = sequence.length(); 4436: if (index < 0 || index >= len) 4437: throw new IndexOutOfBoundsException(); 4438: char high = sequence.charAt(index); 4439: if (! isHighSurrogate(high) || ++index >= len) 4440: return high; 4441: char low = sequence.charAt(index); 4442: if (! isLowSurrogate(low)) 4443: return high; 4444: return toCodePoint(high, low); 4445: } 4446: 4447: /** 4448: * Get the code point at the specified index in the CharSequence. 4449: * If the character is the start of a surrogate pair, and there is a 4450: * following character, and this character completes the pair, then 4451: * the corresponding supplementary code point is returned. 4452: * Otherwise, the character at the index is returned. 4453: * 4454: * @param chars the character array in which to look 4455: * @param index the index of the codepoint to get, starting at 0 4456: * @return the codepoint at the specified index 4457: * @throws IndexOutOfBoundsException if index is negative or >= length() 4458: * @since 1.5 4459: */ 4460: public static int codePointAt(char[] chars, int index) 4461: { 4462: return codePointAt(chars, index, chars.length); 4463: } 4464: 4465: /** 4466: * Get the code point at the specified index in the CharSequence. 4467: * If the character is the start of a surrogate pair, and there is a 4468: * following character within the specified range, and this 4469: * character completes the pair, then the corresponding 4470: * supplementary code point is returned. Otherwise, the character 4471: * at the index is returned. 
4472: * 4473: * @param chars the character array in which to look 4474: * @param index the index of the codepoint to get, starting at 0 4475: * @param limit the limit past which characters should not be examined 4476: * @return the codepoint at the specified index 4477: * @throws IndexOutOfBoundsException if index is negative or >= 4478: * limit, or if limit is negative or >= the length of the array 4479: * @since 1.5 4480: */ 4481: public static int codePointAt(char[] chars, int index, int limit) 4482: { 4483: if (index < 0 || index >= limit || limit < 0 || limit > chars.length) 4484: throw new IndexOutOfBoundsException(); 4485: char high = chars[index]; 4486: if (! isHighSurrogate(high) || ++index >= limit) 4487: return high; 4488: char low = chars[index]; 4489: if (! isLowSurrogate(low)) 4490: return high; 4491: return toCodePoint(high, low); 4492: } 4493: 4494: /** 4495: * Get the code point before the specified index. This is like 4496: * #codePointAt(char[], int), but checks the characters at 4497: * <code>index-1</code> and <code>index-2</code> to see if they form 4498: * a supplementary code point. If they do not, the character at 4499: * <code>index-1</code> is returned. 4500: * 4501: * @param chars the character array 4502: * @param index the index just past the codepoint to get, starting at 0 4503: * @return the codepoint at the specified index 4504: * @throws IndexOutOfBoundsException if index is negative or >= length() 4505: * @since 1.5 4506: */ 4507: public static int codePointBefore(char[] chars, int index) 4508: { 4509: return codePointBefore(chars, index, 1); 4510: } 4511: 4512: /** 4513: * Get the code point before the specified index. This is like 4514: * #codePointAt(char[], int), but checks the characters at 4515: * <code>index-1</code> and <code>index-2</code> to see if they form 4516: * a supplementary code point. If they do not, the character at 4517: * <code>index-1</code> is returned. 
The start parameter is used to 4518: * limit the range of the array which may be examined. 4519: * 4520: * @param chars the character array 4521: * @param index the index just past the codepoint to get, starting at 0 4522: * @param start the index before which characters should not be examined 4523: * @return the codepoint at the specified index 4524: * @throws IndexOutOfBoundsException if index is > start or > 4525: * the length of the array, or if limit is negative or >= the 4526: * length of the array 4527: * @since 1.5 4528: */ 4529: public static int codePointBefore(char[] chars, int index, int start) 4530: { 4531: if (index < start || index > chars.length 4532: || start < 0 || start >= chars.length) 4533: throw new IndexOutOfBoundsException(); 4534: --index; 4535: char low = chars[index]; 4536: if (! isLowSurrogate(low) || --index < start) 4537: return low; 4538: char high = chars[index]; 4539: if (! isHighSurrogate(high)) 4540: return low; 4541: return toCodePoint(high, low); 4542: } 4543: 4544: /** 4545: * Get the code point before the specified index. This is like 4546: * #codePointAt(CharSequence, int), but checks the characters at 4547: * <code>index-1</code> and <code>index-2</code> to see if they form 4548: * a supplementary code point. If they do not, the character at 4549: * <code>index-1</code> is returned. 4550: * 4551: * @param sequence the CharSequence 4552: * @param index the index just past the codepoint to get, starting at 0 4553: * @return the codepoint at the specified index 4554: * @throws IndexOutOfBoundsException if index is negative or >= length() 4555: * @since 1.5 4556: */ 4557: public static int codePointBefore(CharSequence sequence, int index) 4558: { 4559: int len = sequence.length(); 4560: if (index < 1 || index > len) 4561: throw new IndexOutOfBoundsException(); 4562: --index; 4563: char low = sequence.charAt(index); 4564: if (! 
isLowSurrogate(low) || --index < 0) 4565: return low; 4566: char high = sequence.charAt(index); 4567: if (! isHighSurrogate(high)) 4568: return low; 4569: return toCodePoint(high, low); 4570: } 4571: } // class Character
GNU Classpath (0.92) |