GNU Classpath (0.18) | ||
Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package java.lang; 40: 41: import gnu.java.lang.CharData; 42: 43: import java.io.Serializable; 44: 45: /** 46: * Wrapper class for the primitive char data type. In addition, this class 47: * allows one to retrieve property information and perform transformations 48: * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0. 49: * java.lang.Character is designed to be very dynamic, and as such, it 50: * retrieves information on the Unicode character set from a separate 51: * database, gnu.java.lang.CharData, which can be easily upgraded. 52: * 53: * <p>For predicates, boundaries are used to describe 54: * the set of characters for which the method will return true. 55: * This syntax uses fairly normal regular expression notation. 56: * See 5.13 of the Unicode Standard, Version 3.0, for the 57: * boundary specification. 58: * 59: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 60: * for more information on the Unicode Standard. 61: * 62: * @author Tom Tromey (tromey@cygnus.com) 63: * @author Paul N. Fisher 64: * @author Jochen Hoenicke 65: * @author Eric Blake (ebb9@email.byu.edu) 66: * @see CharData 67: * @since 1.0 68: * @status updated to 1.4 69: */ 70: public final class Character implements Serializable, Comparable 71: { 72: /** 73: * A subset of Unicode blocks. 74: * 75: * @author Paul N. Fisher 76: * @author Eric Blake (ebb9@email.byu.edu) 77: * @since 1.2 78: */ 79: public static class Subset 80: { 81: /** The name of the subset. */ 82: private final String name; 83: 84: /** 85: * Construct a new subset of characters. 86: * 87: * @param name the name of the subset 88: * @throws NullPointerException if name is null 89: */ 90: protected Subset(String name) 91: { 92: // name.toString() returns name itself; it is called only to force the documented NullPointerException when name is null. 93: this.name = name.toString(); 94: } 95: 96: /** 97: * Compares two Subsets for equality. 
This is <code>final</code>, and 98: * restricts the comparison on the <code>==</code> operator, so it returns 99: * true only for the same object. 100: * 101: * @param o the object to compare 102: * @return true if o is this 103: */ 104: public final boolean equals(Object o) 105: { 106: return o == this; 107: } 108: 109: /** 110: * Makes the original hashCode of Object final, to be consistent with 111: * equals. 112: * 113: * @return the hash code for this object 114: */ 115: public final int hashCode() 116: { 117: return super.hashCode(); 118: } 119: 120: /** 121: * Returns the name of the subset. 122: * 123: * @return the name 124: */ 125: public final String toString() 126: { 127: return name; 128: } 129: } // class Subset 130: 131: /** 132: * A family of character subsets in the Unicode specification. A character 133: * is in at most one of these blocks. 134: * 135: * This nested class was generated automatically from 136: * <code>doc/unicode/Block-3.txt</code>, by some perl scripts. 137: * This Unicode definition file can be found on the 138: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 139: * JDK 1.4 uses Unicode version 3.0.0. 140: * 141: * @author scripts/unicode-blocks.pl (written by Eric Blake) 142: * @since 1.2 143: */ 144: public static final class UnicodeBlock extends Subset 145: { 146: /** The first character of the block (inclusive). */ 147: private final char start; 148: 149: /** The last character of the block (inclusive). */ 150: private final char end; 151: 152: /** 153: * Constructor for strictly defined blocks. 154: * 155: * @param start the first character of the range, inclusive 156: * @param end the last character of the range, inclusive 157: * @param name the block name 158: */ 159: private UnicodeBlock(char start, char end, String name) 160: { 161: super(name); 162: this.start = start; 163: this.end = end; 164: } 165: 166: /** 167: * Returns the Unicode character block which a character belongs to. 
168: * 169: * @param ch the character to look up 170: * @return the set it belongs to, or null if it is not in one 171: */ 172: public static UnicodeBlock of(char ch) 173: { 174: // Special case: SPECIALS is documented as U+FEFF plus U+FFF0-U+FFFD, but its start/end fields hold only one range. 175: if (ch == '\uFEFF') 176: return SPECIALS; 177: // Binary search for the containing block; relies on sets being ordered by ascending character range. 178: int low = 0; 179: int hi = sets.length - 1; 180: while (low <= hi) 181: { 182: int mid = (low + hi) >> 1; 183: UnicodeBlock b = sets[mid]; 184: if (ch < b.start) 185: hi = mid - 1; 186: else if (ch > b.end) 187: low = mid + 1; 188: else 189: return b; 190: } 191: return null; 192: } 193: 194: /** 195: * Basic Latin. 196: * '\u0000' - '\u007F'. 197: */ 198: public static final UnicodeBlock BASIC_LATIN 199: = new UnicodeBlock('\u0000', '\u007F', 200: "BASIC_LATIN"); 201: 202: /** 203: * Latin-1 Supplement. 204: * '\u0080' - '\u00FF'. 205: */ 206: public static final UnicodeBlock LATIN_1_SUPPLEMENT 207: = new UnicodeBlock('\u0080', '\u00FF', 208: "LATIN_1_SUPPLEMENT"); 209: 210: /** 211: * Latin Extended-A. 212: * '\u0100' - '\u017F'. 213: */ 214: public static final UnicodeBlock LATIN_EXTENDED_A 215: = new UnicodeBlock('\u0100', '\u017F', 216: "LATIN_EXTENDED_A"); 217: 218: /** 219: * Latin Extended-B. 220: * '\u0180' - '\u024F'. 221: */ 222: public static final UnicodeBlock LATIN_EXTENDED_B 223: = new UnicodeBlock('\u0180', '\u024F', 224: "LATIN_EXTENDED_B"); 225: 226: /** 227: * IPA Extensions. 228: * '\u0250' - '\u02AF'. 229: */ 230: public static final UnicodeBlock IPA_EXTENSIONS 231: = new UnicodeBlock('\u0250', '\u02AF', 232: "IPA_EXTENSIONS"); 233: 234: /** 235: * Spacing Modifier Letters. 236: * '\u02B0' - '\u02FF'. 237: */ 238: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 239: = new UnicodeBlock('\u02B0', '\u02FF', 240: "SPACING_MODIFIER_LETTERS"); 241: 242: /** 243: * Combining Diacritical Marks. 244: * '\u0300' - '\u036F'. 
245: */ 246: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 247: = new UnicodeBlock('\u0300', '\u036F', 248: "COMBINING_DIACRITICAL_MARKS"); 249: 250: /** 251: * Greek. 252: * '\u0370' - '\u03FF'. 253: */ 254: public static final UnicodeBlock GREEK 255: = new UnicodeBlock('\u0370', '\u03FF', 256: "GREEK"); 257: 258: /** 259: * Cyrillic. 260: * '\u0400' - '\u04FF'. 261: */ 262: public static final UnicodeBlock CYRILLIC 263: = new UnicodeBlock('\u0400', '\u04FF', 264: "CYRILLIC"); 265: 266: /** 267: * Armenian. 268: * '\u0530' - '\u058F'. 269: */ 270: public static final UnicodeBlock ARMENIAN 271: = new UnicodeBlock('\u0530', '\u058F', 272: "ARMENIAN"); 273: 274: /** 275: * Hebrew. 276: * '\u0590' - '\u05FF'. 277: */ 278: public static final UnicodeBlock HEBREW 279: = new UnicodeBlock('\u0590', '\u05FF', 280: "HEBREW"); 281: 282: /** 283: * Arabic. 284: * '\u0600' - '\u06FF'. 285: */ 286: public static final UnicodeBlock ARABIC 287: = new UnicodeBlock('\u0600', '\u06FF', 288: "ARABIC"); 289: 290: /** 291: * Syriac. 292: * '\u0700' - '\u074F'. 293: * @since 1.4 294: */ 295: public static final UnicodeBlock SYRIAC 296: = new UnicodeBlock('\u0700', '\u074F', 297: "SYRIAC"); 298: 299: /** 300: * Thaana. 301: * '\u0780' - '\u07BF'. 302: * @since 1.4 303: */ 304: public static final UnicodeBlock THAANA 305: = new UnicodeBlock('\u0780', '\u07BF', 306: "THAANA"); 307: 308: /** 309: * Devanagari. 310: * '\u0900' - '\u097F'. 311: */ 312: public static final UnicodeBlock DEVANAGARI 313: = new UnicodeBlock('\u0900', '\u097F', 314: "DEVANAGARI"); 315: 316: /** 317: * Bengali. 318: * '\u0980' - '\u09FF'. 319: */ 320: public static final UnicodeBlock BENGALI 321: = new UnicodeBlock('\u0980', '\u09FF', 322: "BENGALI"); 323: 324: /** 325: * Gurmukhi. 326: * '\u0A00' - '\u0A7F'. 327: */ 328: public static final UnicodeBlock GURMUKHI 329: = new UnicodeBlock('\u0A00', '\u0A7F', 330: "GURMUKHI"); 331: 332: /** 333: * Gujarati. 334: * '\u0A80' - '\u0AFF'. 
335: */ 336: public static final UnicodeBlock GUJARATI 337: = new UnicodeBlock('\u0A80', '\u0AFF', 338: "GUJARATI"); 339: 340: /** 341: * Oriya. 342: * '\u0B00' - '\u0B7F'. 343: */ 344: public static final UnicodeBlock ORIYA 345: = new UnicodeBlock('\u0B00', '\u0B7F', 346: "ORIYA"); 347: 348: /** 349: * Tamil. 350: * '\u0B80' - '\u0BFF'. 351: */ 352: public static final UnicodeBlock TAMIL 353: = new UnicodeBlock('\u0B80', '\u0BFF', 354: "TAMIL"); 355: 356: /** 357: * Telugu. 358: * '\u0C00' - '\u0C7F'. 359: */ 360: public static final UnicodeBlock TELUGU 361: = new UnicodeBlock('\u0C00', '\u0C7F', 362: "TELUGU"); 363: 364: /** 365: * Kannada. 366: * '\u0C80' - '\u0CFF'. 367: */ 368: public static final UnicodeBlock KANNADA 369: = new UnicodeBlock('\u0C80', '\u0CFF', 370: "KANNADA"); 371: 372: /** 373: * Malayalam. 374: * '\u0D00' - '\u0D7F'. 375: */ 376: public static final UnicodeBlock MALAYALAM 377: = new UnicodeBlock('\u0D00', '\u0D7F', 378: "MALAYALAM"); 379: 380: /** 381: * Sinhala. 382: * '\u0D80' - '\u0DFF'. 383: * @since 1.4 384: */ 385: public static final UnicodeBlock SINHALA 386: = new UnicodeBlock('\u0D80', '\u0DFF', 387: "SINHALA"); 388: 389: /** 390: * Thai. 391: * '\u0E00' - '\u0E7F'. 392: */ 393: public static final UnicodeBlock THAI 394: = new UnicodeBlock('\u0E00', '\u0E7F', 395: "THAI"); 396: 397: /** 398: * Lao. 399: * '\u0E80' - '\u0EFF'. 400: */ 401: public static final UnicodeBlock LAO 402: = new UnicodeBlock('\u0E80', '\u0EFF', 403: "LAO"); 404: 405: /** 406: * Tibetan. 407: * '\u0F00' - '\u0FFF'. 408: */ 409: public static final UnicodeBlock TIBETAN 410: = new UnicodeBlock('\u0F00', '\u0FFF', 411: "TIBETAN"); 412: 413: /** 414: * Myanmar. 415: * '\u1000' - '\u109F'. 416: * @since 1.4 417: */ 418: public static final UnicodeBlock MYANMAR 419: = new UnicodeBlock('\u1000', '\u109F', 420: "MYANMAR"); 421: 422: /** 423: * Georgian. 424: * '\u10A0' - '\u10FF'. 
425: */ 426: public static final UnicodeBlock GEORGIAN 427: = new UnicodeBlock('\u10A0', '\u10FF', 428: "GEORGIAN"); 429: 430: /** 431: * Hangul Jamo. 432: * '\u1100' - '\u11FF'. 433: */ 434: public static final UnicodeBlock HANGUL_JAMO 435: = new UnicodeBlock('\u1100', '\u11FF', 436: "HANGUL_JAMO"); 437: 438: /** 439: * Ethiopic. 440: * '\u1200' - '\u137F'. 441: * @since 1.4 442: */ 443: public static final UnicodeBlock ETHIOPIC 444: = new UnicodeBlock('\u1200', '\u137F', 445: "ETHIOPIC"); 446: 447: /** 448: * Cherokee. 449: * '\u13A0' - '\u13FF'. 450: * @since 1.4 451: */ 452: public static final UnicodeBlock CHEROKEE 453: = new UnicodeBlock('\u13A0', '\u13FF', 454: "CHEROKEE"); 455: 456: /** 457: * Unified Canadian Aboriginal Syllabics. 458: * '\u1400' - '\u167F'. 459: * @since 1.4 460: */ 461: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 462: = new UnicodeBlock('\u1400', '\u167F', 463: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS"); 464: 465: /** 466: * Ogham. 467: * '\u1680' - '\u169F'. 468: * @since 1.4 469: */ 470: public static final UnicodeBlock OGHAM 471: = new UnicodeBlock('\u1680', '\u169F', 472: "OGHAM"); 473: 474: /** 475: * Runic. 476: * '\u16A0' - '\u16FF'. 477: * @since 1.4 478: */ 479: public static final UnicodeBlock RUNIC 480: = new UnicodeBlock('\u16A0', '\u16FF', 481: "RUNIC"); 482: 483: /** 484: * Khmer. 485: * '\u1780' - '\u17FF'. 486: * @since 1.4 487: */ 488: public static final UnicodeBlock KHMER 489: = new UnicodeBlock('\u1780', '\u17FF', 490: "KHMER"); 491: 492: /** 493: * Mongolian. 494: * '\u1800' - '\u18AF'. 495: * @since 1.4 496: */ 497: public static final UnicodeBlock MONGOLIAN 498: = new UnicodeBlock('\u1800', '\u18AF', 499: "MONGOLIAN"); 500: 501: /** 502: * Latin Extended Additional. 503: * '\u1E00' - '\u1EFF'. 504: */ 505: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 506: = new UnicodeBlock('\u1E00', '\u1EFF', 507: "LATIN_EXTENDED_ADDITIONAL"); 508: 509: /** 510: * Greek Extended. 
511: * '\u1F00' - '\u1FFF'. 512: */ 513: public static final UnicodeBlock GREEK_EXTENDED 514: = new UnicodeBlock('\u1F00', '\u1FFF', 515: "GREEK_EXTENDED"); 516: 517: /** 518: * General Punctuation. 519: * '\u2000' - '\u206F'. 520: */ 521: public static final UnicodeBlock GENERAL_PUNCTUATION 522: = new UnicodeBlock('\u2000', '\u206F', 523: "GENERAL_PUNCTUATION"); 524: 525: /** 526: * Superscripts and Subscripts. 527: * '\u2070' - '\u209F'. 528: */ 529: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 530: = new UnicodeBlock('\u2070', '\u209F', 531: "SUPERSCRIPTS_AND_SUBSCRIPTS"); 532: 533: /** 534: * Currency Symbols. 535: * '\u20A0' - '\u20CF'. 536: */ 537: public static final UnicodeBlock CURRENCY_SYMBOLS 538: = new UnicodeBlock('\u20A0', '\u20CF', 539: "CURRENCY_SYMBOLS"); 540: 541: /** 542: * Combining Marks for Symbols. 543: * '\u20D0' - '\u20FF'. 544: */ 545: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 546: = new UnicodeBlock('\u20D0', '\u20FF', 547: "COMBINING_MARKS_FOR_SYMBOLS"); 548: 549: /** 550: * Letterlike Symbols. 551: * '\u2100' - '\u214F'. 552: */ 553: public static final UnicodeBlock LETTERLIKE_SYMBOLS 554: = new UnicodeBlock('\u2100', '\u214F', 555: "LETTERLIKE_SYMBOLS"); 556: 557: /** 558: * Number Forms. 559: * '\u2150' - '\u218F'. 560: */ 561: public static final UnicodeBlock NUMBER_FORMS 562: = new UnicodeBlock('\u2150', '\u218F', 563: "NUMBER_FORMS"); 564: 565: /** 566: * Arrows. 567: * '\u2190' - '\u21FF'. 568: */ 569: public static final UnicodeBlock ARROWS 570: = new UnicodeBlock('\u2190', '\u21FF', 571: "ARROWS"); 572: 573: /** 574: * Mathematical Operators. 575: * '\u2200' - '\u22FF'. 576: */ 577: public static final UnicodeBlock MATHEMATICAL_OPERATORS 578: = new UnicodeBlock('\u2200', '\u22FF', 579: "MATHEMATICAL_OPERATORS"); 580: 581: /** 582: * Miscellaneous Technical. 583: * '\u2300' - '\u23FF'. 
584: */ 585: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 586: = new UnicodeBlock('\u2300', '\u23FF', 587: "MISCELLANEOUS_TECHNICAL"); 588: 589: /** 590: * Control Pictures. 591: * '\u2400' - '\u243F'. 592: */ 593: public static final UnicodeBlock CONTROL_PICTURES 594: = new UnicodeBlock('\u2400', '\u243F', 595: "CONTROL_PICTURES"); 596: 597: /** 598: * Optical Character Recognition. 599: * '\u2440' - '\u245F'. 600: */ 601: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 602: = new UnicodeBlock('\u2440', '\u245F', 603: "OPTICAL_CHARACTER_RECOGNITION"); 604: 605: /** 606: * Enclosed Alphanumerics. 607: * '\u2460' - '\u24FF'. 608: */ 609: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 610: = new UnicodeBlock('\u2460', '\u24FF', 611: "ENCLOSED_ALPHANUMERICS"); 612: 613: /** 614: * Box Drawing. 615: * '\u2500' - '\u257F'. 616: */ 617: public static final UnicodeBlock BOX_DRAWING 618: = new UnicodeBlock('\u2500', '\u257F', 619: "BOX_DRAWING"); 620: 621: /** 622: * Block Elements. 623: * '\u2580' - '\u259F'. 624: */ 625: public static final UnicodeBlock BLOCK_ELEMENTS 626: = new UnicodeBlock('\u2580', '\u259F', 627: "BLOCK_ELEMENTS"); 628: 629: /** 630: * Geometric Shapes. 631: * '\u25A0' - '\u25FF'. 632: */ 633: public static final UnicodeBlock GEOMETRIC_SHAPES 634: = new UnicodeBlock('\u25A0', '\u25FF', 635: "GEOMETRIC_SHAPES"); 636: 637: /** 638: * Miscellaneous Symbols. 639: * '\u2600' - '\u26FF'. 640: */ 641: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 642: = new UnicodeBlock('\u2600', '\u26FF', 643: "MISCELLANEOUS_SYMBOLS"); 644: 645: /** 646: * Dingbats. 647: * '\u2700' - '\u27BF'. 648: */ 649: public static final UnicodeBlock DINGBATS 650: = new UnicodeBlock('\u2700', '\u27BF', 651: "DINGBATS"); 652: 653: /** 654: * Braille Patterns. 655: * '\u2800' - '\u28FF'. 
656: * @since 1.4 657: */ 658: public static final UnicodeBlock BRAILLE_PATTERNS 659: = new UnicodeBlock('\u2800', '\u28FF', 660: "BRAILLE_PATTERNS"); 661: 662: /** 663: * CJK Radicals Supplement. 664: * '\u2E80' - '\u2EFF'. 665: * @since 1.4 666: */ 667: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 668: = new UnicodeBlock('\u2E80', '\u2EFF', 669: "CJK_RADICALS_SUPPLEMENT"); 670: 671: /** 672: * Kangxi Radicals. 673: * '\u2F00' - '\u2FDF'. 674: * @since 1.4 675: */ 676: public static final UnicodeBlock KANGXI_RADICALS 677: = new UnicodeBlock('\u2F00', '\u2FDF', 678: "KANGXI_RADICALS"); 679: 680: /** 681: * Ideographic Description Characters. 682: * '\u2FF0' - '\u2FFF'. 683: * @since 1.4 684: */ 685: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 686: = new UnicodeBlock('\u2FF0', '\u2FFF', 687: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS"); 688: 689: /** 690: * CJK Symbols and Punctuation. 691: * '\u3000' - '\u303F'. 692: */ 693: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 694: = new UnicodeBlock('\u3000', '\u303F', 695: "CJK_SYMBOLS_AND_PUNCTUATION"); 696: 697: /** 698: * Hiragana. 699: * '\u3040' - '\u309F'. 700: */ 701: public static final UnicodeBlock HIRAGANA 702: = new UnicodeBlock('\u3040', '\u309F', 703: "HIRAGANA"); 704: 705: /** 706: * Katakana. 707: * '\u30A0' - '\u30FF'. 708: */ 709: public static final UnicodeBlock KATAKANA 710: = new UnicodeBlock('\u30A0', '\u30FF', 711: "KATAKANA"); 712: 713: /** 714: * Bopomofo. 715: * '\u3100' - '\u312F'. 716: */ 717: public static final UnicodeBlock BOPOMOFO 718: = new UnicodeBlock('\u3100', '\u312F', 719: "BOPOMOFO"); 720: 721: /** 722: * Hangul Compatibility Jamo. 723: * '\u3130' - '\u318F'. 724: */ 725: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 726: = new UnicodeBlock('\u3130', '\u318F', 727: "HANGUL_COMPATIBILITY_JAMO"); 728: 729: /** 730: * Kanbun. 731: * '\u3190' - '\u319F'. 
732: */ 733: public static final UnicodeBlock KANBUN 734: = new UnicodeBlock('\u3190', '\u319F', 735: "KANBUN"); 736: 737: /** 738: * Bopomofo Extended. 739: * '\u31A0' - '\u31BF'. 740: * @since 1.4 741: */ 742: public static final UnicodeBlock BOPOMOFO_EXTENDED 743: = new UnicodeBlock('\u31A0', '\u31BF', 744: "BOPOMOFO_EXTENDED"); 745: 746: /** 747: * Enclosed CJK Letters and Months. 748: * '\u3200' - '\u32FF'. 749: */ 750: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 751: = new UnicodeBlock('\u3200', '\u32FF', 752: "ENCLOSED_CJK_LETTERS_AND_MONTHS"); 753: 754: /** 755: * CJK Compatibility. 756: * '\u3300' - '\u33FF'. 757: */ 758: public static final UnicodeBlock CJK_COMPATIBILITY 759: = new UnicodeBlock('\u3300', '\u33FF', 760: "CJK_COMPATIBILITY"); 761: 762: /** 763: * CJK Unified Ideographs Extension A. 764: * '\u3400' - '\u4DB5'. 765: * @since 1.4 766: */ 767: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 768: = new UnicodeBlock('\u3400', '\u4DB5', 769: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A"); 770: 771: /** 772: * CJK Unified Ideographs. 773: * '\u4E00' - '\u9FFF'. 774: */ 775: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 776: = new UnicodeBlock('\u4E00', '\u9FFF', 777: "CJK_UNIFIED_IDEOGRAPHS"); 778: 779: /** 780: * Yi Syllables. 781: * '\uA000' - '\uA48F'. 782: * @since 1.4 783: */ 784: public static final UnicodeBlock YI_SYLLABLES 785: = new UnicodeBlock('\uA000', '\uA48F', 786: "YI_SYLLABLES"); 787: 788: /** 789: * Yi Radicals. 790: * '\uA490' - '\uA4CF'. 791: * @since 1.4 792: */ 793: public static final UnicodeBlock YI_RADICALS 794: = new UnicodeBlock('\uA490', '\uA4CF', 795: "YI_RADICALS"); 796: 797: /** 798: * Hangul Syllables. 799: * '\uAC00' - '\uD7A3'. 800: */ 801: public static final UnicodeBlock HANGUL_SYLLABLES 802: = new UnicodeBlock('\uAC00', '\uD7A3', 803: "HANGUL_SYLLABLES"); 804: 805: /** 806: * Surrogates Area. 807: * '\uD800' - '\uDFFF'. 
808: */ 809: public static final UnicodeBlock SURROGATES_AREA 810: = new UnicodeBlock('\uD800', '\uDFFF', 811: "SURROGATES_AREA"); 812: 813: /** 814: * Private Use Area. 815: * '\uE000' - '\uF8FF'. 816: */ 817: public static final UnicodeBlock PRIVATE_USE_AREA 818: = new UnicodeBlock('\uE000', '\uF8FF', 819: "PRIVATE_USE_AREA"); 820: 821: /** 822: * CJK Compatibility Ideographs. 823: * '\uF900' - '\uFAFF'. 824: */ 825: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 826: = new UnicodeBlock('\uF900', '\uFAFF', 827: "CJK_COMPATIBILITY_IDEOGRAPHS"); 828: 829: /** 830: * Alphabetic Presentation Forms. 831: * '\uFB00' - '\uFB4F'. 832: */ 833: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 834: = new UnicodeBlock('\uFB00', '\uFB4F', 835: "ALPHABETIC_PRESENTATION_FORMS"); 836: 837: /** 838: * Arabic Presentation Forms-A. 839: * '\uFB50' - '\uFDFF'. 840: */ 841: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 842: = new UnicodeBlock('\uFB50', '\uFDFF', 843: "ARABIC_PRESENTATION_FORMS_A"); 844: 845: /** 846: * Combining Half Marks. 847: * '\uFE20' - '\uFE2F'. 848: */ 849: public static final UnicodeBlock COMBINING_HALF_MARKS 850: = new UnicodeBlock('\uFE20', '\uFE2F', 851: "COMBINING_HALF_MARKS"); 852: 853: /** 854: * CJK Compatibility Forms. 855: * '\uFE30' - '\uFE4F'. 856: */ 857: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 858: = new UnicodeBlock('\uFE30', '\uFE4F', 859: "CJK_COMPATIBILITY_FORMS"); 860: 861: /** 862: * Small Form Variants. 863: * '\uFE50' - '\uFE6F'. 864: */ 865: public static final UnicodeBlock SMALL_FORM_VARIANTS 866: = new UnicodeBlock('\uFE50', '\uFE6F', 867: "SMALL_FORM_VARIANTS"); 868: 869: /** 870: * Arabic Presentation Forms-B. 871: * '\uFE70' - '\uFEFE'. 872: */ 873: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 874: = new UnicodeBlock('\uFE70', '\uFEFE', 875: "ARABIC_PRESENTATION_FORMS_B"); 876: 877: /** 878: * Halfwidth and Fullwidth Forms. 879: * '\uFF00' - '\uFFEF'. 
880: */ 881: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 882: = new UnicodeBlock('\uFF00', '\uFFEF', 883: "HALFWIDTH_AND_FULLWIDTH_FORMS"); 884: 885: /** 886: * Specials. 887: * '\uFEFF', '\uFFF0' - '\uFFFD'. * Only the second range is stored in start/end; <code>of(char)</code> * tests for the single character of the first range explicitly. 888: */ 889: public static final UnicodeBlock SPECIALS 890: = new UnicodeBlock('\uFFF0', '\uFFFD', 891: "SPECIALS"); 892: 893: /** 894: * The defined subsets, listed in ascending order of their character * ranges. <code>of(char)</code> binary-searches this table, so that * order must be preserved when entries are added or changed. 895: */ 896: private static final UnicodeBlock sets[] = { 897: BASIC_LATIN, 898: LATIN_1_SUPPLEMENT, 899: LATIN_EXTENDED_A, 900: LATIN_EXTENDED_B, 901: IPA_EXTENSIONS, 902: SPACING_MODIFIER_LETTERS, 903: COMBINING_DIACRITICAL_MARKS, 904: GREEK, 905: CYRILLIC, 906: ARMENIAN, 907: HEBREW, 908: ARABIC, 909: SYRIAC, 910: THAANA, 911: DEVANAGARI, 912: BENGALI, 913: GURMUKHI, 914: GUJARATI, 915: ORIYA, 916: TAMIL, 917: TELUGU, 918: KANNADA, 919: MALAYALAM, 920: SINHALA, 921: THAI, 922: LAO, 923: TIBETAN, 924: MYANMAR, 925: GEORGIAN, 926: HANGUL_JAMO, 927: ETHIOPIC, 928: CHEROKEE, 929: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 930: OGHAM, 931: RUNIC, 932: KHMER, 933: MONGOLIAN, 934: LATIN_EXTENDED_ADDITIONAL, 935: GREEK_EXTENDED, 936: GENERAL_PUNCTUATION, 937: SUPERSCRIPTS_AND_SUBSCRIPTS, 938: CURRENCY_SYMBOLS, 939: COMBINING_MARKS_FOR_SYMBOLS, 940: LETTERLIKE_SYMBOLS, 941: NUMBER_FORMS, 942: ARROWS, 943: MATHEMATICAL_OPERATORS, 944: MISCELLANEOUS_TECHNICAL, 945: CONTROL_PICTURES, 946: OPTICAL_CHARACTER_RECOGNITION, 947: ENCLOSED_ALPHANUMERICS, 948: BOX_DRAWING, 949: BLOCK_ELEMENTS, 950: GEOMETRIC_SHAPES, 951: MISCELLANEOUS_SYMBOLS, 952: DINGBATS, 953: BRAILLE_PATTERNS, 954: CJK_RADICALS_SUPPLEMENT, 955: KANGXI_RADICALS, 956: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 957: CJK_SYMBOLS_AND_PUNCTUATION, 958: HIRAGANA, 959: KATAKANA, 960: BOPOMOFO, 961: HANGUL_COMPATIBILITY_JAMO, 962: KANBUN, 963: BOPOMOFO_EXTENDED, 964: ENCLOSED_CJK_LETTERS_AND_MONTHS, 965: CJK_COMPATIBILITY, 966: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 967: CJK_UNIFIED_IDEOGRAPHS, 968: YI_SYLLABLES, 969: YI_RADICALS, 970: HANGUL_SYLLABLES, 
971: SURROGATES_AREA, 972: PRIVATE_USE_AREA, 973: CJK_COMPATIBILITY_IDEOGRAPHS, 974: ALPHABETIC_PRESENTATION_FORMS, 975: ARABIC_PRESENTATION_FORMS_A, 976: COMBINING_HALF_MARKS, 977: CJK_COMPATIBILITY_FORMS, 978: SMALL_FORM_VARIANTS, 979: ARABIC_PRESENTATION_FORMS_B, 980: HALFWIDTH_AND_FULLWIDTH_FORMS, 981: SPECIALS, 982: }; 983: } // class UnicodeBlock 984: 985: /** 986: * The immutable value of this Character. 987: * 988: * @serial the value of this Character 989: */ 990: private final char value; 991: 992: /** 993: * Compatible with JDK 1.0+. 994: */ 995: private static final long serialVersionUID = 3786198910865385080L; 996: 997: /** 998: * Smallest value allowed for radix arguments in Java. This value is 2. 999: * 1000: * @see #digit(char, int) 1001: * @see #forDigit(int, int) 1002: * @see Integer#toString(int, int) 1003: * @see Integer#valueOf(String) 1004: */ 1005: public static final int MIN_RADIX = 2; 1006: 1007: /** 1008: * Largest value allowed for radix arguments in Java. This value is 36. 1009: * 1010: * @see #digit(char, int) 1011: * @see #forDigit(int, int) 1012: * @see Integer#toString(int, int) 1013: * @see Integer#valueOf(String) 1014: */ 1015: public static final int MAX_RADIX = 36; 1016: 1017: /** 1018: * The minimum value the char data type can hold. 1019: * This value is <code>'\\u0000'</code>. 1020: */ 1021: public static final char MIN_VALUE = '\u0000'; 1022: 1023: /** 1024: * The maximum value the char data type can hold. 1025: * This value is <code>'\\uFFFF'</code>. 1026: */ 1027: public static final char MAX_VALUE = '\uFFFF'; 1028: 1029: /** 1030: * Class object representing the primitive char data type. 1031: * 1032: * @since 1.1 1033: */ 1034: public static final Class TYPE = VMClassLoader.getPrimitiveClass('C'); 1035: 1036: /** 1037: * Lu = Letter, Uppercase (Informative). 1038: * 1039: * @since 1.1 1040: */ 1041: public static final byte UPPERCASE_LETTER = 1; 1042: 1043: /** 1044: * Ll = Letter, Lowercase (Informative). 
1045: * 1046: * @since 1.1 1047: */ 1048: public static final byte LOWERCASE_LETTER = 2; 1049: 1050: /** 1051: * Lt = Letter, Titlecase (Informative). 1052: * 1053: * @since 1.1 1054: */ 1055: public static final byte TITLECASE_LETTER = 3; 1056: 1057: /** 1058: * Mn = Mark, Non-Spacing (Normative). 1059: * 1060: * @since 1.1 1061: */ 1062: public static final byte NON_SPACING_MARK = 6; 1063: 1064: /** 1065: * Mc = Mark, Spacing Combining (Normative). 1066: * 1067: * @since 1.1 1068: */ 1069: public static final byte COMBINING_SPACING_MARK = 8; 1070: 1071: /** 1072: * Me = Mark, Enclosing (Normative). 1073: * 1074: * @since 1.1 1075: */ 1076: public static final byte ENCLOSING_MARK = 7; 1077: 1078: /** 1079: * Nd = Number, Decimal Digit (Normative). 1080: * 1081: * @since 1.1 1082: */ 1083: public static final byte DECIMAL_DIGIT_NUMBER = 9; 1084: 1085: /** 1086: * Nl = Number, Letter (Normative). 1087: * 1088: * @since 1.1 1089: */ 1090: public static final byte LETTER_NUMBER = 10; 1091: 1092: /** 1093: * No = Number, Other (Normative). 1094: * 1095: * @since 1.1 1096: */ 1097: public static final byte OTHER_NUMBER = 11; 1098: 1099: /** 1100: * Zs = Separator, Space (Normative). 1101: * 1102: * @since 1.1 1103: */ 1104: public static final byte SPACE_SEPARATOR = 12; 1105: 1106: /** 1107: * Zl = Separator, Line (Normative). 1108: * 1109: * @since 1.1 1110: */ 1111: public static final byte LINE_SEPARATOR = 13; 1112: 1113: /** 1114: * Zp = Separator, Paragraph (Normative). 1115: * 1116: * @since 1.1 1117: */ 1118: public static final byte PARAGRAPH_SEPARATOR = 14; 1119: 1120: /** 1121: * Cc = Other, Control (Normative). 1122: * 1123: * @since 1.1 1124: */ 1125: public static final byte CONTROL = 15; 1126: 1127: /** 1128: * Cf = Other, Format (Normative). 1129: * 1130: * @since 1.1 1131: */ 1132: public static final byte FORMAT = 16; 1133: 1134: /** 1135: * Cs = Other, Surrogate (Normative). 
1136: * 1137: * @since 1.1 1138: */ 1139: public static final byte SURROGATE = 19; 1140: 1141: /** 1142: * Co = Other, Private Use (Normative). 1143: * 1144: * @since 1.1 1145: */ 1146: public static final byte PRIVATE_USE = 18; 1147: 1148: /** 1149: * Cn = Other, Not Assigned (Normative). 1150: * 1151: * @since 1.1 1152: */ 1153: public static final byte UNASSIGNED = 0; 1154: 1155: /** 1156: * Lm = Letter, Modifier (Informative). 1157: * 1158: * @since 1.1 1159: */ 1160: public static final byte MODIFIER_LETTER = 4; 1161: 1162: /** 1163: * Lo = Letter, Other (Informative). 1164: * 1165: * @since 1.1 1166: */ 1167: public static final byte OTHER_LETTER = 5; 1168: 1169: /** 1170: * Pc = Punctuation, Connector (Informative). 1171: * 1172: * @since 1.1 1173: */ 1174: public static final byte CONNECTOR_PUNCTUATION = 23; 1175: 1176: /** 1177: * Pd = Punctuation, Dash (Informative). 1178: * 1179: * @since 1.1 1180: */ 1181: public static final byte DASH_PUNCTUATION = 20; 1182: 1183: /** 1184: * Ps = Punctuation, Open (Informative). 1185: * 1186: * @since 1.1 1187: */ 1188: public static final byte START_PUNCTUATION = 21; 1189: 1190: /** 1191: * Pe = Punctuation, Close (Informative). 1192: * 1193: * @since 1.1 1194: */ 1195: public static final byte END_PUNCTUATION = 22; 1196: 1197: /** 1198: * Pi = Punctuation, Initial Quote (Informative). 1199: * 1200: * @since 1.4 1201: */ 1202: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 1203: 1204: /** 1205: * Pf = Punctuation, Final Quote (Informative). 1206: * 1207: * @since 1.4 1208: */ 1209: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 1210: 1211: /** 1212: * Po = Punctuation, Other (Informative). 1213: * 1214: * @since 1.1 1215: */ 1216: public static final byte OTHER_PUNCTUATION = 24; 1217: 1218: /** 1219: * Sm = Symbol, Math (Informative). 1220: * 1221: * @since 1.1 1222: */ 1223: public static final byte MATH_SYMBOL = 25; 1224: 1225: /** 1226: * Sc = Symbol, Currency (Informative). 
1227: * 1228: * @since 1.1 1229: */ 1230: public static final byte CURRENCY_SYMBOL = 26; 1231: 1232: /** 1233: * Sk = Symbol, Modifier (Informative). 1234: * 1235: * @since 1.1 1236: */ 1237: public static final byte MODIFIER_SYMBOL = 27; 1238: 1239: /** 1240: * So = Symbol, Other (Informative). 1241: * 1242: * @since 1.1 1243: */ 1244: public static final byte OTHER_SYMBOL = 28; 1245: 1246: /** 1247: * Undefined bidirectional character type. Undefined char values have 1248: * undefined directionality in the Unicode specification. 1249: * 1250: * @since 1.4 1251: */ 1252: public static final byte DIRECTIONALITY_UNDEFINED = -1; 1253: 1254: /** 1255: * Strong bidirectional character type "L". 1256: * 1257: * @since 1.4 1258: */ 1259: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 1260: 1261: /** 1262: * Strong bidirectional character type "R". 1263: * 1264: * @since 1.4 1265: */ 1266: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 1267: 1268: /** 1269: * Strong bidirectional character type "AL". 1270: * 1271: * @since 1.4 1272: */ 1273: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 1274: 1275: /** 1276: * Weak bidirectional character type "EN". 1277: * 1278: * @since 1.4 1279: */ 1280: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 1281: 1282: /** 1283: * Weak bidirectional character type "ES". 1284: * 1285: * @since 1.4 1286: */ 1287: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 1288: 1289: /** 1290: * Weak bidirectional character type "ET". 1291: * 1292: * @since 1.4 1293: */ 1294: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 1295: 1296: /** 1297: * Weak bidirectional character type "AN". 1298: * 1299: * @since 1.4 1300: */ 1301: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 1302: 1303: /** 1304: * Weak bidirectional character type "CS". 
1305: * 1306: * @since 1.4 1307: */ 1308: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 1309: 1310: /** 1311: * Weak bidirectional character type "NSM". 1312: * 1313: * @since 1.4 1314: */ 1315: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 1316: 1317: /** 1318: * Weak bidirectional character type "BN". 1319: * 1320: * @since 1.4 1321: */ 1322: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 1323: 1324: /** 1325: * Neutral bidirectional character type "B". 1326: * 1327: * @since 1.4 1328: */ 1329: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 1330: 1331: /** 1332: * Neutral bidirectional character type "S". 1333: * 1334: * @since 1.4 1335: */ 1336: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 1337: 1338: /** 1339: * Neutral bidirectional character type "WS". 1340: * 1341: * @since 1.4 1342: */ 1343: public static final byte DIRECTIONALITY_WHITESPACE = 12; 1344: 1345: /** 1346: * Neutral bidirectional character type "ON". 1347: * 1348: * @since 1.4 1349: */ 1350: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 1351: 1352: /** 1353: * Strong bidirectional character type "LRE". 1354: * 1355: * @since 1.4 1356: */ 1357: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 1358: 1359: /** 1360: * Strong bidirectional character type "LRO". 1361: * 1362: * @since 1.4 1363: */ 1364: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 1365: 1366: /** 1367: * Strong bidirectional character type "RLE". 1368: * 1369: * @since 1.4 1370: */ 1371: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 1372: 1373: /** 1374: * Strong bidirectional character type "RLO". 1375: * 1376: * @since 1.4 1377: */ 1378: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 1379: 1380: /** 1381: * Weak bidirectional character type "PDF". 
1382: * 1383: * @since 1.4 1384: */ 1385: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 1386: 1387: /** 1388: * Stores unicode block offset lookup table. Exploit package visibility of 1389: * String.value to avoid copying the array. 1390: * @see #readChar(char) 1391: * @see CharData#BLOCKS 1392: */ 1393: private static final char[] blocks = String.zeroBasedStringValue(CharData.BLOCKS); 1394: 1395: /** 1396: * Stores unicode attribute offset lookup table. Exploit package visibility 1397: * of String.value to avoid copying the array. 1398: * @see CharData#DATA 1399: */ 1400: private static final char[] data = String.zeroBasedStringValue(CharData.DATA); 1401: 1402: /** 1403: * Stores unicode numeric value attribute table. Exploit package visibility 1404: * of String.value to avoid copying the array. 1405: * @see CharData#NUM_VALUE 1406: */ 1407: private static final char[] numValue 1408: = String.zeroBasedStringValue(CharData.NUM_VALUE); 1409: 1410: /** 1411: * Stores unicode uppercase attribute table. Exploit package visibility 1412: * of String.value to avoid copying the array. 1413: * @see CharData#UPPER 1414: */ 1415: private static final char[] upper = String.zeroBasedStringValue(CharData.UPPER); 1416: 1417: /** 1418: * Stores unicode lowercase attribute table. Exploit package visibility 1419: * of String.value to avoid copying the array. 1420: * @see CharData#LOWER 1421: */ 1422: private static final char[] lower = String.zeroBasedStringValue(CharData.LOWER); 1423: 1424: /** 1425: * Stores unicode direction attribute table. Exploit package visibility 1426: * of String.value to avoid copying the array. 1427: * @see CharData#DIRECTION 1428: */ 1429: // Package visible for use by String. 1430: static final char[] direction = String.zeroBasedStringValue(CharData.DIRECTION); 1431: 1432: /** 1433: * Stores unicode titlecase table. Exploit package visibility of 1434: * String.value to avoid copying the array. 
1435: * @see CharData#TITLE 1436: */ 1437: private static final char[] title = String.zeroBasedStringValue(CharData.TITLE); 1438: 1439: /** 1440: * Mask for grabbing the type out of the contents of data. 1441: * @see CharData#DATA 1442: */ 1443: private static final int TYPE_MASK = 0x1F; 1444: 1445: /** 1446: * Mask for grabbing the non-breaking space flag out of the contents of 1447: * data. 1448: * @see CharData#DATA 1449: */ 1450: private static final int NO_BREAK_MASK = 0x20; 1451: 1452: /** 1453: * Mask for grabbing the mirrored directionality flag out of the contents 1454: * of data. 1455: * @see CharData#DATA 1456: */ 1457: private static final int MIRROR_MASK = 0x40; 1458: 1459: /** 1460: * Min value for supplementary code point. 1461: * 1462: * @since 1.5 1463: */ 1464: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 1465: 1466: /** 1467: * Min value for code point. 1468: * 1469: * @since 1.5 1470: */ 1471: public static final int MIN_CODE_POINT = 0; 1472: 1473: 1474: /** 1475: * Max value for code point. 1476: * 1477: * @since 1.5 1478: */ 1479: public static final int MAX_CODE_POINT = 0x010ffff; 1480: 1481: 1482: /** 1483: * Minimum high surrogate code in UTF-16 encoding. 1484: * 1485: * @since 1.5 1486: */ 1487: public static final char MIN_HIGH_SURROGATE = '\ud800'; 1488: 1489: /** 1490: * Maximum high surrogate code in UTF-16 encoding. 1491: * 1492: * @since 1.5 1493: */ 1494: public static final char MAX_HIGH_SURROGATE = '\udbff'; 1495: 1496: /** 1497: * Minimum low surrogate code in UTF-16 encoding. 1498: * 1499: * @since 1.5 1500: */ 1501: public static final char MIN_LOW_SURROGATE = '\udc00'; 1502: 1503: /** 1504: * Maximum low surrogate code in UTF-16 encoding. 1505: * 1506: * @since 1.5 1507: */ 1508: public static final char MAX_LOW_SURROGATE = '\udfff'; 1509: 1510: /** 1511: * Grabs an attribute offset from the Unicode attribute database. 
The lower 1512: * 5 bits are the character type, the next 2 bits are flags, and the top 1513: * 9 bits are the offset into the attribute tables. 1514: * 1515: * @param ch the character to look up 1516: * @return the character's attribute offset and type 1517: * @see #TYPE_MASK 1518: * @see #NO_BREAK_MASK 1519: * @see #MIRROR_MASK 1520: * @see CharData#DATA 1521: * @see CharData#SHIFT 1522: */ 1523: // Package visible for use in String. 1524: static char readChar(char ch) 1525: { 1526: // Perform 16-bit addition to find the correct entry in data. 1527: return data[(char) (blocks[ch >> CharData.SHIFT] + ch)]; 1528: } 1529: 1530: /** 1531: * Wraps up a character. 1532: * 1533: * @param value the character to wrap 1534: */ 1535: public Character(char value) 1536: { 1537: this.value = value; 1538: } 1539: 1540: /** 1541: * Returns the character which has been wrapped by this class. 1542: * 1543: * @return the character wrapped 1544: */ 1545: public char charValue() 1546: { 1547: return value; 1548: } 1549: 1550: /** 1551: * Returns the numerical value (unsigned) of the wrapped character. 1552: * Range of returned values: 0x0000-0xFFFF. 1553: * 1554: * @return the value of the wrapped character 1555: */ 1556: public int hashCode() 1557: { 1558: return value; 1559: } 1560: 1561: /** 1562: * Determines if an object is equal to this object. This is only true for 1563: * another Character object wrapping the same value. 1564: * 1565: * @param o object to compare 1566: * @return true if o is a Character with the same value 1567: */ 1568: public boolean equals(Object o) 1569: { 1570: return o instanceof Character && value == ((Character) o).value; 1571: } 1572: 1573: /** 1574: * Converts the wrapped character into a String. 1575: * 1576: * @return a String containing one character -- the wrapped character 1577: * of this instance 1578: */ 1579: public String toString() 1580: { 1581: // Package constructor avoids an array copy. 
1582: return new String(new char[] { value }, 0, 1, true); 1583: } 1584: 1585: /** 1586: * Returns a String of length 1 representing the specified character. 1587: * 1588: * @param ch the character to convert 1589: * @return a String containing the character 1590: * @since 1.4 1591: */ 1592: public static String toString(char ch) 1593: { 1594: // Package constructor avoids an array copy. 1595: return new String(new char[] { ch }, 0, 1, true); 1596: } 1597: 1598: /** 1599: * Determines if a character is a Unicode lowercase letter. For example, 1600: * <code>'a'</code> is lowercase. 1601: * <br> 1602: * lowercase = [Ll] 1603: * 1604: * @param ch character to test 1605: * @return true if ch is a Unicode lowercase letter, else false 1606: * @see #isUpperCase(char) 1607: * @see #isTitleCase(char) 1608: * @see #toLowerCase(char) 1609: * @see #getType(char) 1610: */ 1611: public static boolean isLowerCase(char ch) 1612: { 1613: return getType(ch) == LOWERCASE_LETTER; 1614: } 1615: 1616: /** 1617: * Determines if a character is a Unicode uppercase letter. For example, 1618: * <code>'A'</code> is uppercase. 1619: * <br> 1620: * uppercase = [Lu] 1621: * 1622: * @param ch character to test 1623: * @return true if ch is a Unicode uppercase letter, else false 1624: * @see #isLowerCase(char) 1625: * @see #isTitleCase(char) 1626: * @see #toUpperCase(char) 1627: * @see #getType(char) 1628: */ 1629: public static boolean isUpperCase(char ch) 1630: { 1631: return getType(ch) == UPPERCASE_LETTER; 1632: } 1633: 1634: /** 1635: * Determines if a character is a Unicode titlecase letter. For example, 1636: * the character "Lj" (Latin capital L with small letter j) is titlecase. 
1637: * <br> 1638: * titlecase = [Lt] 1639: * 1640: * @param ch character to test 1641: * @return true if ch is a Unicode titlecase letter, else false 1642: * @see #isLowerCase(char) 1643: * @see #isUpperCase(char) 1644: * @see #toTitleCase(char) 1645: * @see #getType(char) 1646: */ 1647: public static boolean isTitleCase(char ch) 1648: { 1649: return getType(ch) == TITLECASE_LETTER; 1650: } 1651: 1652: /** 1653: * Determines if a character is a Unicode decimal digit. For example, 1654: * <code>'0'</code> is a digit. 1655: * <br> 1656: * Unicode decimal digit = [Nd] 1657: * 1658: * @param ch character to test 1659: * @return true if ch is a Unicode decimal digit, else false 1660: * @see #digit(char, int) 1661: * @see #forDigit(int, int) 1662: * @see #getType(char) 1663: */ 1664: public static boolean isDigit(char ch) 1665: { 1666: return getType(ch) == DECIMAL_DIGIT_NUMBER; 1667: } 1668: 1669: /** 1670: * Determines if a character is part of the Unicode Standard. This is an 1671: * evolving standard, but covers every character in the data file. 1672: * <br> 1673: * defined = not [Cn] 1674: * 1675: * @param ch character to test 1676: * @return true if ch is a Unicode character, else false 1677: * @see #isDigit(char) 1678: * @see #isLetter(char) 1679: * @see #isLetterOrDigit(char) 1680: * @see #isLowerCase(char) 1681: * @see #isTitleCase(char) 1682: * @see #isUpperCase(char) 1683: */ 1684: public static boolean isDefined(char ch) 1685: { 1686: return getType(ch) != UNASSIGNED; 1687: } 1688: 1689: /** 1690: * Determines if a character is a Unicode letter. Not all letters have case, 1691: * so this may return true when isLowerCase and isUpperCase return false. 
1692: * <br> 1693: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 1694: * 1695: * @param ch character to test 1696: * @return true if ch is a Unicode letter, else false 1697: * @see #isDigit(char) 1698: * @see #isJavaIdentifierStart(char) 1699: * @see #isJavaLetter(char) 1700: * @see #isJavaLetterOrDigit(char) 1701: * @see #isLetterOrDigit(char) 1702: * @see #isLowerCase(char) 1703: * @see #isTitleCase(char) 1704: * @see #isUnicodeIdentifierStart(char) 1705: * @see #isUpperCase(char) 1706: */ 1707: public static boolean isLetter(char ch) 1708: { 1709: return ((1 << getType(ch)) 1710: & ((1 << UPPERCASE_LETTER) 1711: | (1 << LOWERCASE_LETTER) 1712: | (1 << TITLECASE_LETTER) 1713: | (1 << MODIFIER_LETTER) 1714: | (1 << OTHER_LETTER))) != 0; 1715: } 1716: 1717: /** 1718: * Determines if a character is a Unicode letter or a Unicode digit. This 1719: * is the combination of isLetter and isDigit. 1720: * <br> 1721: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 1722: * 1723: * @param ch character to test 1724: * @return true if ch is a Unicode letter or a Unicode digit, else false 1725: * @see #isDigit(char) 1726: * @see #isJavaIdentifierPart(char) 1727: * @see #isJavaLetter(char) 1728: * @see #isJavaLetterOrDigit(char) 1729: * @see #isLetter(char) 1730: * @see #isUnicodeIdentifierPart(char) 1731: */ 1732: public static boolean isLetterOrDigit(char ch) 1733: { 1734: return ((1 << getType(ch)) 1735: & ((1 << UPPERCASE_LETTER) 1736: | (1 << LOWERCASE_LETTER) 1737: | (1 << TITLECASE_LETTER) 1738: | (1 << MODIFIER_LETTER) 1739: | (1 << OTHER_LETTER) 1740: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 1741: } 1742: 1743: /** 1744: * Determines if a character can start a Java identifier. This is the 1745: * combination of isLetter, any character where getType returns 1746: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 1747: * (like '_'). 
1748: * 1749: * @param ch character to test 1750: * @return true if ch can start a Java identifier, else false 1751: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 1752: * @see #isJavaLetterOrDigit(char) 1753: * @see #isJavaIdentifierStart(char) 1754: * @see #isJavaIdentifierPart(char) 1755: * @see #isLetter(char) 1756: * @see #isLetterOrDigit(char) 1757: * @see #isUnicodeIdentifierStart(char) 1758: */ 1759: public static boolean isJavaLetter(char ch) 1760: { 1761: return isJavaIdentifierStart(ch); 1762: } 1763: 1764: /** 1765: * Determines if a character can follow the first letter in 1766: * a Java identifier. This is the combination of isJavaLetter (isLetter, 1767: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 1768: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 1769: * or isIdentifierIgnorable. 1770: * 1771: * @param ch character to test 1772: * @return true if ch can follow the first letter in a Java identifier 1773: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 1774: * @see #isJavaLetter(char) 1775: * @see #isJavaIdentifierStart(char) 1776: * @see #isJavaIdentifierPart(char) 1777: * @see #isLetter(char) 1778: * @see #isLetterOrDigit(char) 1779: * @see #isUnicodeIdentifierPart(char) 1780: * @see #isIdentifierIgnorable(char) 1781: */ 1782: public static boolean isJavaLetterOrDigit(char ch) 1783: { 1784: return isJavaIdentifierPart(ch); 1785: } 1786: 1787: /** 1788: * Determines if a character can start a Java identifier. This is the 1789: * combination of isLetter, any character where getType returns 1790: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 1791: * (like '_'). 
1792: * <br> 1793: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 1794: * 1795: * @param ch character to test 1796: * @return true if ch can start a Java identifier, else false 1797: * @see #isJavaIdentifierPart(char) 1798: * @see #isLetter(char) 1799: * @see #isUnicodeIdentifierStart(char) 1800: * @since 1.1 1801: */ 1802: public static boolean isJavaIdentifierStart(char ch) 1803: { 1804: return ((1 << getType(ch)) 1805: & ((1 << UPPERCASE_LETTER) 1806: | (1 << LOWERCASE_LETTER) 1807: | (1 << TITLECASE_LETTER) 1808: | (1 << MODIFIER_LETTER) 1809: | (1 << OTHER_LETTER) 1810: | (1 << LETTER_NUMBER) 1811: | (1 << CURRENCY_SYMBOL) 1812: | (1 << CONNECTOR_PUNCTUATION))) != 0; 1813: } 1814: 1815: /** 1816: * Determines if a character can follow the first letter in 1817: * a Java identifier. This is the combination of isJavaLetter (isLetter, 1818: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 1819: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 1820: * or isIdentifierIgnorable. 
1821: * <br> 1822: * Java identifier extender = 1823: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 1824: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 1825: * 1826: * @param ch character to test 1827: * @return true if ch can follow the first letter in a Java identifier 1828: * @see #isIdentifierIgnorable(char) 1829: * @see #isJavaIdentifierStart(char) 1830: * @see #isLetterOrDigit(char) 1831: * @see #isUnicodeIdentifierPart(char) 1832: * @since 1.1 1833: */ 1834: public static boolean isJavaIdentifierPart(char ch) 1835: { 1836: int category = getType(ch); 1837: return ((1 << category) 1838: & ((1 << UPPERCASE_LETTER) 1839: | (1 << LOWERCASE_LETTER) 1840: | (1 << TITLECASE_LETTER) 1841: | (1 << MODIFIER_LETTER) 1842: | (1 << OTHER_LETTER) 1843: | (1 << NON_SPACING_MARK) 1844: | (1 << COMBINING_SPACING_MARK) 1845: | (1 << DECIMAL_DIGIT_NUMBER) 1846: | (1 << LETTER_NUMBER) 1847: | (1 << CURRENCY_SYMBOL) 1848: | (1 << CONNECTOR_PUNCTUATION) 1849: | (1 << FORMAT))) != 0 1850: || (category == CONTROL && isIdentifierIgnorable(ch)); 1851: } 1852: 1853: /** 1854: * Determines if a character can start a Unicode identifier. Only 1855: * letters can start a Unicode identifier, but this includes characters 1856: * in LETTER_NUMBER. 
1857: * <br> 1858: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 1859: * 1860: * @param ch character to test 1861: * @return true if ch can start a Unicode identifier, else false 1862: * @see #isJavaIdentifierStart(char) 1863: * @see #isLetter(char) 1864: * @see #isUnicodeIdentifierPart(char) 1865: * @since 1.1 1866: */ 1867: public static boolean isUnicodeIdentifierStart(char ch) 1868: { 1869: return ((1 << getType(ch)) 1870: & ((1 << UPPERCASE_LETTER) 1871: | (1 << LOWERCASE_LETTER) 1872: | (1 << TITLECASE_LETTER) 1873: | (1 << MODIFIER_LETTER) 1874: | (1 << OTHER_LETTER) 1875: | (1 << LETTER_NUMBER))) != 0; 1876: } 1877: 1878: /** 1879: * Determines if a character can follow the first letter in 1880: * a Unicode identifier. This includes letters, connecting punctuation, 1881: * digits, numeric letters, combining marks, non-spacing marks, and 1882: * isIdentifierIgnorable. 1883: * <br> 1884: * Unicode identifier extender = 1885: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf] 1886: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 1887: * 1888: * @param ch character to test 1889: * @return true if ch can follow the first letter in a Unicode identifier 1890: * @see #isIdentifierIgnorable(char) 1891: * @see #isJavaIdentifierPart(char) 1892: * @see #isLetterOrDigit(char) 1893: * @see #isUnicodeIdentifierStart(char) 1894: * @since 1.1 1895: */ 1896: public static boolean isUnicodeIdentifierPart(char ch) 1897: { 1898: int category = getType(ch); 1899: return ((1 << category) 1900: & ((1 << UPPERCASE_LETTER) 1901: | (1 << LOWERCASE_LETTER) 1902: | (1 << TITLECASE_LETTER) 1903: | (1 << MODIFIER_LETTER) 1904: | (1 << OTHER_LETTER) 1905: | (1 << NON_SPACING_MARK) 1906: | (1 << COMBINING_SPACING_MARK) 1907: | (1 << DECIMAL_DIGIT_NUMBER) 1908: | (1 << LETTER_NUMBER) 1909: | (1 << CONNECTOR_PUNCTUATION) 1910: | (1 << FORMAT))) != 0 1911: || (category == CONTROL && isIdentifierIgnorable(ch)); 1912: } 1913: 1914: /** 1915: * Determines if a character is 
ignorable in a Unicode identifier. This 1916: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 1917: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 1918: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 1919: * <code>'\u009F'</code>), and FORMAT characters. 1920: * <br> 1921: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 1922: * |U+007F-U+009F 1923: * 1924: * @param ch character to test 1925: * @return true if ch is ignorable in a Unicode or Java identifier 1926: * @see #isJavaIdentifierPart(char) 1927: * @see #isUnicodeIdentifierPart(char) 1928: * @since 1.1 1929: */ 1930: public static boolean isIdentifierIgnorable(char ch) 1931: { 1932: return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F' 1933: || (ch <= '\u001B' && ch >= '\u000E'))) 1934: || getType(ch) == FORMAT; 1935: } 1936: 1937: /** 1938: * Converts a Unicode character into its lowercase equivalent mapping. 1939: * If a mapping does not exist, then the character passed is returned. 1940: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 1941: * 1942: * @param ch character to convert to lowercase 1943: * @return lowercase mapping of ch, or ch if lowercase mapping does 1944: * not exist 1945: * @see #isLowerCase(char) 1946: * @see #isUpperCase(char) 1947: * @see #toTitleCase(char) 1948: * @see #toUpperCase(char) 1949: */ 1950: public static char toLowerCase(char ch) 1951: { 1952: // Signedness doesn't matter, as result is cast back to char. 1953: return (char) (ch + lower[readChar(ch) >> 7]); 1954: } 1955: 1956: /** 1957: * Converts a Unicode character into its uppercase equivalent mapping. 1958: * If a mapping does not exist, then the character passed is returned. 1959: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
1960: * 1961: * @param ch character to convert to uppercase 1962: * @return uppercase mapping of ch, or ch if uppercase mapping does 1963: * not exist 1964: * @see #isLowerCase(char) 1965: * @see #isUpperCase(char) 1966: * @see #toLowerCase(char) 1967: * @see #toTitleCase(char) 1968: */ 1969: public static char toUpperCase(char ch) 1970: { 1971: // Signedness doesn't matter, as result is cast back to char. 1972: return (char) (ch + upper[readChar(ch) >> 7]); 1973: } 1974: 1975: /** 1976: * Converts a Unicode character into its titlecase equivalent mapping. 1977: * If a mapping does not exist, then the character passed is returned. 1978: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 1979: * 1980: * @param ch character to convert to titlecase 1981: * @return titlecase mapping of ch, or ch if titlecase mapping does 1982: * not exist 1983: * @see #isTitleCase(char) 1984: * @see #toLowerCase(char) 1985: * @see #toUpperCase(char) 1986: */ 1987: public static char toTitleCase(char ch) 1988: { 1989: // As title is short, it doesn't hurt to exhaustively iterate over it. 1990: for (int i = title.length - 2; i >= 0; i -= 2) 1991: if (title[i] == ch) 1992: return title[i + 1]; 1993: return toUpperCase(ch); 1994: } 1995: 1996: /** 1997: * Converts a character into a digit of the specified radix. If the radix 1998: * is less than MIN_RADIX or greater than MAX_RADIX, or if the numeric value 1999: * of ch is not less than the radix, or if ch is not a decimal digit or in the 2000: * case insensitive set of 'a'-'z', the result is -1. 
2001: * <br> 2002: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 2003: * |U+FF21-U+FF3A|U+FF41-U+FF5A 2004: * 2005: * @param ch character to convert into a digit 2006: * @param radix radix in which ch is a digit 2007: * @return digit which ch represents in radix, or -1 not a valid digit 2008: * @see #MIN_RADIX 2009: * @see #MAX_RADIX 2010: * @see #forDigit(int, int) 2011: * @see #isDigit(char) 2012: * @see #getNumericValue(char) 2013: */ 2014: public static int digit(char ch, int radix) 2015: { 2016: if (radix < MIN_RADIX || radix > MAX_RADIX) 2017: return -1; 2018: char attr = readChar(ch); 2019: if (((1 << (attr & TYPE_MASK)) 2020: & ((1 << UPPERCASE_LETTER) 2021: | (1 << LOWERCASE_LETTER) 2022: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 2023: { 2024: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 2025: int digit = numValue[attr >> 7]; 2026: return (digit < radix) ? digit : -1; 2027: } 2028: return -1; 2029: } 2030: 2031: /** 2032: * Returns the Unicode numeric value property of a character. For example, 2033: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 2034: * 2035: * <p>This method also returns values for the letters A through Z, (not 2036: * specified by Unicode), in these ranges: <code>'\u0041'</code> 2037: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 2038: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 2039: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 2040: * <code>'\uFF5A'</code> (full width variants). 2041: * 2042: * <p>If the character lacks a numeric value property, -1 is returned. 2043: * If the character has a numeric value property which is not representable 2044: * as a nonnegative integer, such as a fraction, -2 is returned. 
2045: * 2046: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 2047: * |U+FF21-U+FF3A|U+FF41-U+FF5A 2048: * 2049: * @param ch character from which the numeric value property will 2050: * be retrieved 2051: * @return the numeric value property of ch, or -1 if it does not exist, or 2052: * -2 if it is not representable as a nonnegative integer 2053: * @see #forDigit(int, int) 2054: * @see #digit(char, int) 2055: * @see #isDigit(char) 2056: * @since 1.1 2057: */ 2058: public static int getNumericValue(char ch) 2059: { 2060: // Treat numValue as signed. 2061: return (short) numValue[readChar(ch) >> 7]; 2062: } 2063: 2064: /** 2065: * Determines if a character is an ISO-LATIN-1 space. This is only the five 2066: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 2067: * <code>'\r'</code>, and <code>' '</code>. 2068: * <br> 2069: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 2070: * 2071: * @param ch character to test 2072: * @return true if ch is a space, else false 2073: * @deprecated Replaced by {@link #isWhitespace(char)} 2074: * @see #isSpaceChar(char) 2075: * @see #isWhitespace(char) 2076: */ 2077: public static boolean isSpace(char ch) 2078: { 2079: // Compare against the five characters directly. The former bit mask 2080: // indexed by (ch - 1) wrapped U+0000 around to U+FFFF, whose shift distance 2081: // (65535 & 31 == 31) aliased the bit for ' ', so U+0000 tested as a space. 2082: return ch == ' ' 2083: || ch == '\t' || ch == '\n' 2084: || ch == '\f' 2085: || ch == '\r'; 2086: } 2087: 2088: /** 2089: * Determines if a character is a Unicode space character. This includes 2090: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 
2091: * <br> 2092: * Unicode space = [Zs]|[Zp]|[Zl] 2093: * 2094: * @param ch character to test 2095: * @return true if ch is a Unicode space, else false 2096: * @see #isWhitespace(char) 2097: * @since 1.1 2098: */ 2099: public static boolean isSpaceChar(char ch) 2100: { 2101: return ((1 << getType(ch)) 2102: & ((1 << SPACE_SEPARATOR) 2103: | (1 << LINE_SEPARATOR) 2104: | (1 << PARAGRAPH_SEPARATOR))) != 0; 2105: } 2106: 2107: /** 2108: * Determines if a character is Java whitespace. This includes Unicode 2109: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 2110: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 2111: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 2112: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 2113: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 2114: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 2115: * and <code>'\u001F'</code>. 2116: * <br> 2117: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 2118: * 2119: * @param ch character to test 2120: * @return true if ch is Java whitespace, else false 2121: * @see #isSpaceChar(char) 2122: * @since 1.1 2123: */ 2124: public static boolean isWhitespace(char ch) 2125: { 2126: int attr = readChar(ch); 2127: return ((((1 << (attr & TYPE_MASK)) 2128: & ((1 << SPACE_SEPARATOR) 2129: | (1 << LINE_SEPARATOR) 2130: | (1 << PARAGRAPH_SEPARATOR))) != 0) 2131: && (attr & NO_BREAK_MASK) == 0) 2132: || (ch <= '\u001F' && ((1 << ch) 2133: & ((1 << '\t') 2134: | (1 << '\n') 2135: | (1 << '\u000B') 2136: | (1 << '\u000C') 2137: | (1 << '\r') 2138: | (1 << '\u001C') 2139: | (1 << '\u001D') 2140: | (1 << '\u001E') 2141: | (1 << '\u001F'))) != 0); 2142: } 2143: 2144: /** 2145: * Determines if a character has the ISO Control property. 
2146: * <br> 2147: * ISO Control = [Cc] 2148: * 2149: * @param ch character to test 2150: * @return true if ch is an ISO Control character, else false 2151: * @see #isSpaceChar(char) 2152: * @see #isWhitespace(char) 2153: * @since 1.1 2154: */ 2155: public static boolean isISOControl(char ch) 2156: { 2157: return getType(ch) == CONTROL; 2158: } 2159: 2160: /** 2161: * Returns the Unicode general category property of a character. 2162: * 2163: * @param ch character from which the general category property will 2164: * be retrieved 2165: * @return the character category property of ch as an integer 2166: * @see #UNASSIGNED 2167: * @see #UPPERCASE_LETTER 2168: * @see #LOWERCASE_LETTER 2169: * @see #TITLECASE_LETTER 2170: * @see #MODIFIER_LETTER 2171: * @see #OTHER_LETTER 2172: * @see #NON_SPACING_MARK 2173: * @see #ENCLOSING_MARK 2174: * @see #COMBINING_SPACING_MARK 2175: * @see #DECIMAL_DIGIT_NUMBER 2176: * @see #LETTER_NUMBER 2177: * @see #OTHER_NUMBER 2178: * @see #SPACE_SEPARATOR 2179: * @see #LINE_SEPARATOR 2180: * @see #PARAGRAPH_SEPARATOR 2181: * @see #CONTROL 2182: * @see #FORMAT 2183: * @see #PRIVATE_USE 2184: * @see #SURROGATE 2185: * @see #DASH_PUNCTUATION 2186: * @see #START_PUNCTUATION 2187: * @see #END_PUNCTUATION 2188: * @see #CONNECTOR_PUNCTUATION 2189: * @see #OTHER_PUNCTUATION 2190: * @see #MATH_SYMBOL 2191: * @see #CURRENCY_SYMBOL 2192: * @see #MODIFIER_SYMBOL 2193: * @see #INITIAL_QUOTE_PUNCTUATION 2194: * @see #FINAL_QUOTE_PUNCTUATION 2195: * @since 1.1 2196: */ 2197: public static int getType(char ch) 2198: { 2199: return readChar(ch) & TYPE_MASK; 2200: } 2201: 2202: /** 2203: * Converts a digit into a character which represents that digit 2204: * in a specified radix. If the radix is less than MIN_RADIX or greater than MAX_RADIX, 2205: * or the digit is negative or not less than the radix, then the null character <code>'\0'</code> 2206: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 
2207: * <br> 2208: * return value boundary = U+0030-U+0039|U+0061-U+007A 2209: * 2210: * @param digit digit to be converted into a character 2211: * @param radix radix of digit 2212: * @return character representing digit in radix, or '\0' 2213: * @see #MIN_RADIX 2214: * @see #MAX_RADIX 2215: * @see #digit(char, int) 2216: */ 2217: public static char forDigit(int digit, int radix) 2218: { 2219: if (radix < MIN_RADIX || radix > MAX_RADIX 2220: || digit < 0 || digit >= radix) 2221: return '\0'; 2222: return Number.digits[digit]; 2223: } 2224: 2225: /** 2226: * Returns the Unicode directionality property of the character. This 2227: * is used in the visual ordering of text. 2228: * 2229: * @param ch the character to look up 2230: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 2231: * @see #DIRECTIONALITY_UNDEFINED 2232: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 2233: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 2234: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 2235: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 2236: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 2237: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 2238: * @see #DIRECTIONALITY_ARABIC_NUMBER 2239: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 2240: * @see #DIRECTIONALITY_NONSPACING_MARK 2241: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 2242: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 2243: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 2244: * @see #DIRECTIONALITY_WHITESPACE 2245: * @see #DIRECTIONALITY_OTHER_NEUTRALS 2246: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 2247: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 2248: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 2249: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 2250: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 2251: * @since 1.4 2252: */ 2253: public static byte getDirectionality(char ch) 2254: { 2255: // The result will correctly be signed. 
2256: return (byte) (direction[readChar(ch) >> 7] >> 2); 2257: } 2258: 2259: /** 2260: * Determines whether the character is mirrored according to Unicode. For 2261: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 2262: * left-to-right text, but ')' in right-to-left text. 2263: * 2264: * @param ch the character to look up 2265: * @return true if the character is mirrored 2266: * @since 1.4 2267: */ 2268: public static boolean isMirrored(char ch) 2269: { 2270: return (readChar(ch) & MIRROR_MASK) != 0; 2271: } 2272: 2273: /** 2274: * Compares another Character to this Character, numerically. 2275: * 2276: * @param anotherCharacter Character to compare with this Character 2277: * @return a negative integer if this Character is less than 2278: * anotherCharacter, zero if this Character is equal, and 2279: * a positive integer if this Character is greater 2280: * @throws NullPointerException if anotherCharacter is null 2281: * @since 1.2 2282: */ 2283: public int compareTo(Character anotherCharacter) 2284: { 2285: return value - anotherCharacter.value; 2286: } 2287: 2288: /** 2289: * Compares an object to this Character. Assuming the object is a 2290: * Character object, this method performs the same comparison as 2291: * compareTo(Character). 2292: * 2293: * @param o object to compare 2294: * @return the comparison value 2295: * @throws ClassCastException if o is not a Character object 2296: * @throws NullPointerException if o is null 2297: * @see #compareTo(Character) 2298: * @since 1.2 2299: */ 2300: public int compareTo(Object o) 2301: { 2302: return compareTo((Character) o); 2303: } 2304: 2305: /** 2306: * Converts a unicode code point to a UTF-16 representation of that 2307: * code point. 
2308: * 2309: * @param codePoint the unicode code point 2310: * 2311: * @return the UTF-16 representation of that code point 2312: * 2313: * @throws IllegalArgumentException if the code point is not a valid 2314: * unicode code point 2315: * 2316: * @since 1.5 2317: */ 2318: public static char[] toChars(int codePoint) 2319: { 2320: char[] result = new char[charCount(codePoint)]; 2321: int ignore = toChars(codePoint, result, 0); 2322: return result; 2323: } 2324: 2325: /** 2326: * Converts a unicode code point to its UTF-16 representation. 2327: * 2328: * @param codePoint the unicode code point 2329: * @param dst the target char array 2330: * @param dstIndex the start index for the target 2331: * 2332: * @return number of characters written to <code>dst</code> 2333: * 2334: * @throws IllegalArgumentException if <code>codePoint</code> is not a 2335: * valid unicode code point 2336: * @throws NullPointerException if <code>dst</code> is <code>null</code> 2337: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 2338: * in <code>dst</code> or if the UTF-16 representation does not 2339: * fit into <code>dst</code> 2340: * 2341: * @since 1.5 2342: */ 2343: public static int toChars(int codePoint, char[] dst, int dstIndex) 2344: { 2345: if (!isValidCodePoint(codePoint)) 2346: { 2347: throw new IllegalArgumentException("not a valid code point: " 2348: + codePoint); 2349: } 2350: 2351: int result; 2352: if (isSupplementaryCodePoint(codePoint)) 2353: { 2354: // Write second char first to cause IndexOutOfBoundsException 2355: // immediately. 2356: dst[dstIndex + 1] = (char) ((codePoint & 0x3ff) 2357: + (int) MIN_LOW_SURROGATE ); 2358: dst[dstIndex] = (char) ((codePoint >> 10) + (int) MIN_HIGH_SURROGATE); 2359: result = 2; 2360: } 2361: else 2362: { 2363: dst[dstIndex] = (char) codePoint; 2364: result = 1; 2365: } 2366: return result; 2367: } 2368: 2369: /** 2370: * Return number of 16-bit characters required to represent the given 2371: * code point. 
2372: * 2373: * @param codePoint a uncode code point 2374: * 2375: * @return 2 if codePoint >= 0x10000, 1 otherwise. 2376: * 2377: * @since 1.5 2378: */ 2379: public static int charCount(int codePoint) 2380: { 2381: return 2382: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 2383: ? 2 2384: : 1; 2385: } 2386: 2387: /** 2388: * Determines whether the specified code point is 2389: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 2390: * supplementary character range. 2391: * 2392: * @param codePoint a Unicode code point 2393: * 2394: * @return <code>true</code> if code point is in supplementary range 2395: * 2396: * @since 1.5 2397: */ 2398: public static boolean isSupplementaryCodePoint(int codePoint) 2399: { 2400: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 2401: && codePoint <= MAX_CODE_POINT; 2402: } 2403: 2404: /** 2405: * Determines whether the specified code point is 2406: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 2407: * 2408: * @param codePoint a Unicode code point 2409: * 2410: * @return <code>true</code> if code point is valid 2411: * 2412: * @since 1.5 2413: */ 2414: public static boolean isValidCodePoint(int codePoint) 2415: { 2416: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 2417: } 2418: } // class Character
GNU Classpath (0.18) |