Source for gnu.regexp.RESyntax

   1: /* gnu/regexp/RESyntax.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.regexp;
  40: import java.io.Serializable;
  41: import java.util.BitSet;
  42: 
  43: /**
  44:  * An RESyntax specifies the way a regular expression will be compiled.
  45:  * This class provides a number of predefined useful constants for
  46:  * emulating popular regular expression syntaxes.  Additionally the
  47:  * user may construct his or her own syntax, using any combination of the
  48:  * syntax bit constants.  The syntax is an optional argument to any of the
  49:  * matching methods on class RE.
  50:  *
  51:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  52:  */
  53: 
  54: public final class RESyntax implements Serializable {
  55:     static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator");
  56: 
  57:     private static final String SYNTAX_IS_FINAL = RE.getLocalizedMessage("syntax.final");
  58: 
  59:     private BitSet bits;
  60: 
  61:     // true for the constant defined syntaxes
  62:     private boolean isFinal = false;
  63: 
  64:     private String lineSeparator = DEFAULT_LINE_SEPARATOR;
  65: 
  66:   // Values for constants are bit indexes
  67: 
  68:   /**
  69:    * Syntax bit. Backslash is an escape character in lists.
  70:    */
  71:   public static final int RE_BACKSLASH_ESCAPE_IN_LISTS =  0;
  72: 
  73:   /**
  74:    * Syntax bit. Use \? instead of ? and \+ instead of +.
  75:    */
  76:   public static final int RE_BK_PLUS_QM                =  1;
  77: 
  78:   /**
  79:    * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
  80:    */
  81:   public static final int RE_CHAR_CLASSES              =  2;
  82: 
  83:   /**
  84:    * Syntax bit. ^ and $ are special everywhere.
  85:    * <B>Not implemented.</B>
  86:    */
  87:   public static final int RE_CONTEXT_INDEP_ANCHORS     =  3; 
  88: 
  89:   /**
  90:    * Syntax bit. Repetition operators are only special in valid positions.
  91:    * <B>Not implemented.</B>
  92:    */
  93:   public static final int RE_CONTEXT_INDEP_OPS         =  4; 
  94: 
  95:   /**
  96:    * Syntax bit. Repetition and alternation operators are invalid
  97:    * at start and end of pattern and other places. 
  98:    * <B>Not implemented</B>.
  99:    */
 100:   public static final int RE_CONTEXT_INVALID_OPS       =  5; 
 101: 
 102:   /**
 103:    * Syntax bit. Match-any-character operator (.) matches a newline.
 104:    */
 105:   public static final int RE_DOT_NEWLINE               =  6;
 106: 
 107:   /**
 108:    * Syntax bit. Match-any-character operator (.) does not match a null.
 109:    */
 110:   public static final int RE_DOT_NOT_NULL              =  7;
 111: 
 112:   /**
 113:    * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
 114:    */
 115:   public static final int RE_INTERVALS                 =  8;
 116: 
 117:   /**
 118:    * Syntax bit. No alternation (|), match one-or-more (+), or 
 119:    * match zero-or-one (?) operators.
 120:    */
 121:   public static final int RE_LIMITED_OPS               =  9;
 122: 
 123:   /**
 124:    * Syntax bit. Newline is an alternation operator.
 125:    */
 126:   public static final int RE_NEWLINE_ALT               = 10; // impl.
 127: 
 128:   /**
 129:    * Syntax bit. Intervals use { } instead of \{ \}
 130:    */
 131:   public static final int RE_NO_BK_BRACES              = 11; 
 132: 
 133:   /**
 134:    * Syntax bit. Grouping uses ( ) instead of \( \).
 135:    */
 136:   public static final int RE_NO_BK_PARENS              = 12;
 137: 
 138:   /**
 139:    * Syntax bit. Backreferences not allowed.
 140:    */
 141:   public static final int RE_NO_BK_REFS                = 13;
 142: 
 143:   /**
 144:    * Syntax bit. Alternation uses | instead of \|
 145:    */
 146:   public static final int RE_NO_BK_VBAR                = 14;
 147: 
 148:   /**
 149:    * Syntax bit. <B>Not implemented</B>.
 150:    */
 151:   public static final int RE_NO_EMPTY_RANGES           = 15;
 152: 
 153:   /**
 154:    * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
 155:    * on RE_NO_BK_PARENS) will throw an exception when compiling.
 156:    */
 157:   public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
 158: 
 159:   /**
 160:    * Syntax bit. <B>Not implemented.</B>
 161:    */
 162:   public static final int RE_HAT_LISTS_NOT_NEWLINE     = 17;
 163: 
 164:   /**
 165:    * Syntax bit.  Stingy matching is allowed (+?, *?, ??, {x,y}?).
 166:    */
 167:   public static final int RE_STINGY_OPS                = 18;
 168: 
 169:   /**
 170:    * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
 171:    */
 172:   public static final int RE_CHAR_CLASS_ESCAPES        = 19;
 173: 
 174:   /**
 175:    * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
 176:    */
 177:   public static final int RE_PURE_GROUPING             = 20;
 178: 
 179:   /**
 180:    * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
 181:    * to the text following the current position without consuming that text.
 182:    */
 183:   public static final int RE_LOOKAHEAD                 = 21;
 184: 
 185:   /**
 186:    * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
 187:    */
 188:   public static final int RE_STRING_ANCHORS            = 22;
 189: 
 190:   /**
 191:    * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
 192:    */
 193:   public static final int RE_COMMENTS                  = 23;
 194: 
 195:   /**
 196:    * Syntax bit. Allow character class escapes within lists, as in Perl5.
 197:    */
 198:   public static final int RE_CHAR_CLASS_ESC_IN_LISTS   = 24;
 199: 
 200:   /**
 201:    * Syntax bit.  Possessive matching is allowed (++, *+, ?+, {x,y}+).
 202:    */
 203:   public static final int RE_POSSESSIVE_OPS            = 25;
 204: 
 205:   /**
 206:    * Syntax bit.  Allow embedded flags, (?is-x), as in Perl5.
 207:    */
 208:   public static final int RE_EMBEDDED_FLAGS            = 26;
 209: 
 210:   /**
 211:    * Syntax bit.  Allow octal char (\0377), as in Perl5.
 212:    */
 213:   public static final int RE_OCTAL_CHAR                = 27;
 214: 
 215:   /**
 216:    * Syntax bit.  Allow hex char (\x1b), as in Perl5.
 217:    */
 218:   public static final int RE_HEX_CHAR                  = 28;
 219: 
 220:   /**
 221:    * Syntax bit.  Allow Unicode char (\u1234), as in Java 1.4.
 222:    */
 223:   public static final int RE_UNICODE_CHAR              = 29;
 224: 
 225:   /**
 226:    * Syntax bit.  Allow named property (\p{P}, \P{p}), as in Perl5.
 227:    */
 228:   public static final int RE_NAMED_PROPERTY            = 30;
 229: 
 230:   /**
 231:    * Syntax bit.  Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4.
 232:    */
 233:   public static final int RE_NESTED_CHARCLASS          = 31;
 234: 
 235:   private static final int BIT_TOTAL                   = 32;
 236: 
 237:   /**
 238:    * Predefined syntax.
 239:    * Emulates regular expression support in the awk utility.
 240:    */
 241:   public static final RESyntax RE_SYNTAX_AWK;
 242: 
 243:   /**
 244:    * Predefined syntax.
 245:    * Emulates regular expression support in the ed utility.
 246:    */
 247:   public static final RESyntax RE_SYNTAX_ED;
 248: 
 249:   /**
 250:    * Predefined syntax.
 251:    * Emulates regular expression support in the egrep utility.
 252:    */
 253:   public static final RESyntax RE_SYNTAX_EGREP;
 254: 
 255:   /**
 256:    * Predefined syntax.
 257:    * Emulates regular expression support in the GNU Emacs editor.
 258:    */
 259:   public static final RESyntax RE_SYNTAX_EMACS;
 260: 
 261:   /**
 262:    * Predefined syntax.
 263:    * Emulates regular expression support in the grep utility.
 264:    */
 265:   public static final RESyntax RE_SYNTAX_GREP;
 266: 
 267:   /**
 268:    * Predefined syntax.
 269:    * Emulates regular expression support in the POSIX awk specification.
 270:    */
 271:   public static final RESyntax RE_SYNTAX_POSIX_AWK;
 272: 
 273:   /**
 274:    * Predefined syntax.
 275:    * Emulates POSIX basic regular expression support.
 276:    */
 277:   public static final RESyntax RE_SYNTAX_POSIX_BASIC;
 278: 
 279:   /**
 280:    * Predefined syntax.
 281:    * Emulates regular expression support in the POSIX egrep specification.
 282:    */
 283:   public static final RESyntax RE_SYNTAX_POSIX_EGREP;
 284: 
 285:   /**
 286:    * Predefined syntax.
 287:    * Emulates POSIX extended regular expression support.
 288:    */
 289:   public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
 290: 
 291:   /**
 292:    * Predefined syntax.
 293:    * Emulates POSIX basic minimal regular expressions.
 294:    */
 295:   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
 296: 
 297:   /**
 298:    * Predefined syntax.
 299:    * Emulates POSIX extended minimal regular expressions.
 300:    */
 301:   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
 302: 
 303:   /**
 304:    * Predefined syntax.
 305:    * Emulates regular expression support in the sed utility.
 306:    */
 307:   public static final RESyntax RE_SYNTAX_SED;
 308: 
 309:   /**
 310:    * Predefined syntax.
 311:    * Emulates regular expression support in Larry Wall's perl, version 4,
 312:    */
 313:   public static final RESyntax RE_SYNTAX_PERL4;
 314: 
 315:   /**
 316:    * Predefined syntax.
 317:    * Emulates regular expression support in Larry Wall's perl, version 4,
 318:    * using single line mode (/s modifier).
 319:    */
 320:   public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
 321: 
 322:   /**
 323:    * Predefined syntax.
 324:    * Emulates regular expression support in Larry Wall's perl, version 5.
 325:    */
 326:   public static final RESyntax RE_SYNTAX_PERL5;  
 327: 
 328:   /**
 329:    * Predefined syntax.
 330:    * Emulates regular expression support in Larry Wall's perl, version 5,
 331:    * using single line mode (/s modifier).
 332:    */
 333:   public static final RESyntax RE_SYNTAX_PERL5_S;
 334: 
 335:     /**
 336:      * Predefined syntax.
 337:      * Emulates regular expression support in Java 1.4's java.util.regex
 338:      * package.
 339:      */
 340:     public static final RESyntax RE_SYNTAX_JAVA_1_4;
 341: 
 342:   static {
 343:       // Define syntaxes
 344:       
 345:       RE_SYNTAX_EMACS = new RESyntax().makeFinal();
 346:       
 347:       RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax()
 348:       .set(RE_CHAR_CLASSES)
 349:       .set(RE_DOT_NEWLINE)
 350:       .set(RE_DOT_NOT_NULL)
 351:       .set(RE_INTERVALS)
 352:       .set(RE_NO_EMPTY_RANGES)
 353:       .makeFinal();
 354:       
 355:       RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 356:       .set(RE_BK_PLUS_QM)
 357:       .makeFinal();
 358:       
 359:       RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 360:       .set(RE_CONTEXT_INDEP_ANCHORS)
 361:       .set(RE_CONTEXT_INDEP_OPS)
 362:       .set(RE_NO_BK_BRACES)
 363:       .set(RE_NO_BK_PARENS)
 364:       .set(RE_NO_BK_VBAR)
 365:       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
 366:       .makeFinal();
 367: 
 368:       RE_SYNTAX_AWK = new RESyntax()
 369:       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
 370:       .set(RE_DOT_NOT_NULL)
 371:       .set(RE_NO_BK_PARENS)
 372:       .set(RE_NO_BK_REFS)
 373:       .set(RE_NO_BK_VBAR)
 374:       .set(RE_NO_EMPTY_RANGES)
 375:       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
 376:       .makeFinal();
 377:       
 378:       RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
 379:       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
 380:       .makeFinal();
 381:       
 382:       RE_SYNTAX_GREP = new RESyntax()
 383:       .set(RE_BK_PLUS_QM)
 384:       .set(RE_CHAR_CLASSES)
 385:       .set(RE_HAT_LISTS_NOT_NEWLINE)
 386:       .set(RE_INTERVALS)
 387:       .set(RE_NEWLINE_ALT)
 388:       .makeFinal();
 389:       
 390:       RE_SYNTAX_EGREP = new RESyntax()
 391:       .set(RE_CHAR_CLASSES)
 392:       .set(RE_CONTEXT_INDEP_ANCHORS)
 393:       .set(RE_CONTEXT_INDEP_OPS)
 394:       .set(RE_HAT_LISTS_NOT_NEWLINE)
 395:       .set(RE_NEWLINE_ALT)
 396:       .set(RE_NO_BK_PARENS)
 397:       .set(RE_NO_BK_VBAR)
 398:       .makeFinal();
 399:     
 400:       RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
 401:       .set(RE_INTERVALS)
 402:       .set(RE_NO_BK_BRACES)
 403:       .makeFinal();
 404:     
 405:       /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
 406:     
 407:       RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
 408:       .makeFinal();
 409:     
 410:       RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
 411:       .makeFinal();
 412:       
 413:       RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 414:       .set(RE_LIMITED_OPS)
 415:       .makeFinal();
 416:       
 417:       /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
 418:      replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
 419:       
 420:       RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 421:       .set(RE_CONTEXT_INDEP_ANCHORS)
 422:       .set(RE_CONTEXT_INVALID_OPS)
 423:       .set(RE_NO_BK_BRACES)
 424:       .set(RE_NO_BK_PARENS)
 425:       .set(RE_NO_BK_REFS)
 426:       .set(RE_NO_BK_VBAR)
 427:       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
 428:       .makeFinal();
 429:       
 430:       /* There is no official Perl spec, but here's a "best guess" */
 431:       
 432:       RE_SYNTAX_PERL4 = new RESyntax()
 433:       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
 434:       .set(RE_CONTEXT_INDEP_ANCHORS)
 435:       .set(RE_CONTEXT_INDEP_OPS)          // except for '{', apparently
 436:       .set(RE_INTERVALS)
 437:       .set(RE_NO_BK_BRACES)
 438:       .set(RE_NO_BK_PARENS)
 439:       .set(RE_NO_BK_VBAR)
 440:       .set(RE_NO_EMPTY_RANGES)
 441:       .set(RE_CHAR_CLASS_ESCAPES)    // \d,\D,\w,\W,\s,\S
 442:       .makeFinal();
 443:       
 444:       RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
 445:       .set(RE_DOT_NEWLINE)
 446:       .makeFinal();
 447:       
 448:       RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
 449:       .set(RE_PURE_GROUPING)          // (?:)
 450:       .set(RE_STINGY_OPS)             // *?,??,+?,{}?
 451:       .set(RE_LOOKAHEAD)              // (?=)(?!)
 452:       .set(RE_STRING_ANCHORS)         // \A,\Z
 453:       .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
 454:       .set(RE_COMMENTS)              // (?#)
 455:       .set(RE_EMBEDDED_FLAGS)         // (?imsx-imsx)
 456:       .set(RE_OCTAL_CHAR)             // \0377
 457:       .set(RE_HEX_CHAR)               // \x1b
 458:       .set(RE_NAMED_PROPERTY)         // \p{prop}, \P{prop}
 459:       .makeFinal();
 460:       
 461:       RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
 462:       .set(RE_DOT_NEWLINE)
 463:       .makeFinal();
 464: 
 465:       RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
 466:       // XXX
 467:       .set(RE_POSSESSIVE_OPS)         // *+,?+,++,{}+
 468:       .set(RE_UNICODE_CHAR)           // \u1234
 469:       .set(RE_NESTED_CHARCLASS)       // [a-z&&[^p-r]]
 470:       .makeFinal();
 471:   }
 472: 
 473:   /**
 474:    * Construct a new syntax object with all bits turned off.
 475:    * This is equivalent to RE_SYNTAX_EMACS.
 476:    */
 477:   public RESyntax() {
 478:     bits = new BitSet(BIT_TOTAL);
 479:   }
 480: 
 481:     /**
 482:      * Called internally when constructing predefined syntaxes
 483:      * so their interpretation cannot vary.  Conceivably useful
 484:      * for your syntaxes as well.  Causes IllegalAccessError to
 485:      * be thrown if any attempt to modify the syntax is made.
 486:      *
 487:      * @return this object for convenient chaining
 488:      */
 489:     public RESyntax makeFinal() {
 490:     isFinal = true;
 491:     return this;
 492:     }
 493: 
 494:   /**
 495:    * Construct a new syntax object with all bits set the same 
 496:    * as the other syntax.
 497:    */
 498:   public RESyntax(RESyntax other) {
 499:     bits = (BitSet) other.bits.clone();
 500:   }
 501: 
 502:   /**
 503:    * Check if a given bit is set in this syntax.
 504:    */
 505:   public boolean get(int index) {
 506:     return bits.get(index);
 507:   }
 508: 
 509:   /**
 510:    * Set a given bit in this syntax. 
 511:    *
 512:    * @param index the constant (RESyntax.RE_xxx) bit to set.
 513:    * @return a reference to this object for easy chaining.
 514:    */
 515:   public RESyntax set(int index) {
 516:       if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
 517:     bits.set(index);
 518:     return this;
 519:   }
 520: 
 521:   /**
 522:    * Clear a given bit in this syntax. 
 523:    *
 524:    * @param index the constant (RESyntax.RE_xxx) bit to clear.
 525:    * @return a reference to this object for easy chaining.
 526:    */
 527:   public RESyntax clear(int index) {
 528:       if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
 529:       bits.clear(index);
 530:       return this;
 531:   }
 532: 
 533:     /**
 534:      * Changes the line separator string for regular expressions
 535:      * created using this RESyntax.  The default separator is the
 536:      * value returned by the system property "line.separator", which
 537:      * should be correct when reading platform-specific files from a
 538:      * filesystem.  However, many programs may collect input from
 539:      * sources where the line separator is differently specified (for
 540:      * example, in the applet environment, the text box widget
 541:      * interprets line breaks as single-character newlines,
 542:      * regardless of the host platform.
 543:      *
 544:      * Note that setting the line separator to a character or
 545:      * characters that have specific meaning within the current syntax
 546:      * can cause unexpected chronosynclastic infundibula.
 547:      *
 548:      * @return this object for convenient chaining 
 549:      */
 550:     public RESyntax setLineSeparator(String aSeparator) {
 551:     if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
 552:     lineSeparator = aSeparator;
 553:     return this;
 554:     }
 555: 
 556:     /**
 557:      * Returns the currently active line separator string.  The default
 558:      * is the platform-dependent system property "line.separator".
 559:      */
 560:     public String getLineSeparator() {
 561:     return lineSeparator;
 562:     }
 563: }