Source for gnu.java.util.regex.RE

   1: /* gnu/regexp/RE.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.java.util.regex;
  39: import java.io.InputStream;
  40: import java.io.Serializable;
  41: import java.util.Locale;
  42: import java.util.PropertyResourceBundle;
  43: import java.util.ResourceBundle;
  44: import java.util.Stack;
  45: import java.util.Vector;
  46: 
  47: /**
  48:  * RE provides the user interface for compiling and matching regular
  49:  * expressions.
  50:  * <P>
  51:  * A regular expression object (class RE) is compiled by constructing it
  52:  * from a String, StringBuffer or character array, with optional 
  53:  * compilation flags (below)
  54:  * and an optional syntax specification (see RESyntax; if not specified,
  55:  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  56:  * <P>
  57:  * Once compiled, a regular expression object is reusable as well as
  58:  * threadsafe: multiple threads can use the RE instance simultaneously
  59:  * to match against different input text.
  60:  * <P>
  61:  * Various methods attempt to match input text against a compiled
  62:  * regular expression.  These methods are:
  63:  * <LI><code>isMatch</code>: returns true if the input text in its
  64:  * entirety matches the regular expression pattern.
  65:  * <LI><code>getMatch</code>: returns the first match found in the
  66:  * input text, or null if no match is found.
  67:  * <LI><code>getAllMatches</code>: returns an array of all
  68:  * non-overlapping matches found in the input text.  If no matches are
  69:  * found, the array is zero-length.
  70:  * <LI><code>substitute</code>: substitute the first occurrence of the
  71:  * pattern in the input text with a replacement string (which may
  72:  * include metacharacters $0-$9, see REMatch.substituteInto).
  73:  * <LI><code>substituteAll</code>: same as above, but repeat for each
  74:  * match before returning.
  75:  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
  76:  * object that allows iteration over the matches (see
  77:  * REMatchEnumeration for some reasons why you may want to do this
  78:  * instead of using <code>getAllMatches</code>).
  79:  * <P>
  80:  *
  81:  * These methods all have similar argument lists.  The input can be a
  82:  * CharIndexed, String, a character array, a StringBuffer, or an
  83:  * InputStream of some sort.  Note that when using an
  84:  * InputStream, the stream read position cannot be guaranteed after
  85:  * attempting a match (this is not a bug, but a consequence of the way
  86:  * regular expressions work).  Using an REMatchEnumeration can
  87:  * eliminate most positioning problems.
  88:  *
  89:  * Although the input object can be of various types, it is recommended
  90:  * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
  91:  * can show the last match found on this input, which helps the expression
  92:  * \G work as the end of the previous match.
  93:  *
  94:  * <P>
  95:  *
  96:  * The optional index argument specifies the offset from the beginning
  97:  * of the text at which the search should start (see the descriptions
  98:  * of some of the execution flags for how this can affect positional
  99:  * pattern operators).  For an InputStream, this means an
 100:  * offset from the current read position, so subsequent calls with the
 101:  * same index argument on an InputStream will not
 102:  * necessarily access the same position on the stream, whereas
 103:  * repeated searches at a given index in a fixed string will return
 104:  * consistent results.
 105:  *
 106:  * <P>
 107:  * You can optionally affect the execution environment by using a
 108:  * combination of execution flags (constants listed below).
 109:  * 
 110:  * <P>
 111:  * All operations on a regular expression are performed in a
 112:  * thread-safe manner.
 113:  *
 114:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 115:  * @version 1.1.5-dev, to be released
 116:  */
 117: 
 118: public class RE extends REToken {
 119: 
 120:   private static final class IntPair implements Serializable {
          // Mutable pair of ints.  initialize() allocates one as the holder
          // for interval-operator ({x,y}) bounds: it is handed to getMinMax(),
          // then "first" is checked to be no greater than "second" before
          // both are passed on to setRepeated().
 121:     public int first, second;
 122:   }
 123: 
 124:   private static final class CharUnit implements Serializable {
          // Mutable holder for one parsed "character unit" of the pattern;
          // initialize() passes a single instance to getCharUnit() on every
          // step of its parse loop and then inspects the two fields below.
 125:     public char ch; // the current parse character
 126:     public boolean bk; // presumably true when ch was introduced by a backslash (initialize() tests it for \Q, \E, \| ...) -- confirm in getCharUnit()
 127:   }
 128: 
 129:   // This String will be returned by version()
 130:   private static final String VERSION = "1.1.5-dev";
 131: 
 132:   // The localized strings are kept in a separate file
 133:   // Used by getLocalizedMessage().
 134:   private static ResourceBundle messages;
 135: 
 136:   // Name of the bundle that contains the localized messages.
 137:   private static final String bundle = "gnu/java/util/regex/MessagesBundle";
 138: 
 139:   // These are, respectively, the first and last tokens in our linked list
 140:   // If there is only one token, firstToken == lastToken
 141:   private REToken firstToken, lastToken;
 142: 
 143:   // This is the number of subexpressions in this regular expression,
 144:   // with a minimum value of zero.  Returned by getNumSubs()
 145:   private int numSubs;
 146: 
 147:     /** Minimum length, in characters, of any possible match. */
 148:     private int minimumLength;
 149:     private int maximumLength;
 150: 
 151:   /**
 152:    * Compilation flag. Do not differentiate case.  Subsequent
 153:    * searches using this RE will be case insensitive.
 154:    */
 155:   public static final int REG_ICASE = 0x02;
 156: 
 157:   /**
 158:    * Compilation flag. The match-any-character operator (dot)
 159:    * will match a newline character.  When set this overrides the syntax
 160:    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 161:    * the "/s" operator in Perl.
 162:    */
 163:   public static final int REG_DOT_NEWLINE = 0x04;
 164: 
 165:   /**
 166:    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 167:    * anchors will match based on newlines within the input. This is
 168:    * equivalent to the "/m" operator in Perl.
 169:    */
 170:   public static final int REG_MULTILINE = 0x08;
 171: 
 172:   /**
 173:    * Execution flag.
 174:    * The match-beginning operator (^) will not match at the beginning
 175:    * of the input string. Useful for matching on a substring when you
 176:    * know the context of the input is such that position zero of the
 177:    * input to the match test is not actually position zero of the text.
 178:    * <P>
 179:    * This example demonstrates the results of various ways of matching on
 180:    * a substring.
 181:    * <P>
 182:    * <CODE>
 183:    * String s = "food bar fool";<BR>
 184:    * RE exp = new RE("^foo.");<BR>
 185:    * REMatch m0 = exp.getMatch(s);<BR>
 186:    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 187:    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 188:    * REMatch m3 = exp.getMatch(s,8);                            <BR>
 189:    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 190:    * <P>
 191:    * // Results:<BR>
 192:    * //  m0.toString(): "food"<BR>
 193:    * //  m1.toString(): "fool"<BR>
 194:    * //  m2.toString(): null<BR>
 195:    * //  m3.toString(): null<BR>
 196:    * //  m4.toString(): "fool"<BR>
 197:    * </CODE>
 198:    */
 199:   public static final int REG_NOTBOL = 0x10;
 200: 
 201:   /**
 202:    * Execution flag.
 203:    * The match-end operator ($) does not match at the end
 204:    * of the input string. Useful for matching on substrings.
 205:    */
 206:   public static final int REG_NOTEOL = 0x20;
 207: 
 208:   /**
 209:    * Execution flag.
 210:    * When a match method is invoked that starts matching at a non-zero
 211:    * index into the input, treat the input as if it begins at the index
 212:    * given.  The effect of this flag is that the engine does not "see"
 213:    * any text in the input before the given index.  This is useful so
 214:    * that the match-beginning operator (^) matches not at position 0
 215:    * in the input string, but at the position the search started at
 216:    * (based on the index input given to the getMatch function).  See
 217:    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 218:    * and \b operators.
 219:    */
 220:   public static final int REG_ANCHORINDEX = 0x40;
 221: 
 222:   /**
 223:    * Execution flag.
 224:    * The substitute and substituteAll methods will not attempt to
 225:    * interpolate occurrences of $1-$9 in the replacement text with
 226:    * the corresponding subexpressions.  For example, you may want to
 227:    * replace all matches of "one dollar" with "$1".
 228:    */
 229:   public static final int REG_NO_INTERPOLATE = 0x80;
 230: 
 231:   /**
 232:    * Execution flag.
 233:    * Try to match the whole input string. An implicit match-end operator
 234:    * is added to this regexp.
 235:    */
 236:   public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
 237: 
 238:   /**
 239:    * Execution flag.
 240:    * The substitute and substituteAll methods will treat the
 241:    * character '\' in the replacement as an escape to a literal
 242:    * character. In this case "\n", "\$", "\\", "\x40" and "\012"
 243:    * will become "n", "$", "\", "x40" and "012" respectively.
 244:    * This flag has no effect if REG_NO_INTERPOLATE is set on.
 245:    */
 246:   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 247: 
 248:   /**
 249:    * Compilation flag. Allow whitespace and comments in pattern.
 250:    * This is equivalent to the "/x" operator in Perl.
 251:    */
 252:   public static final int REG_X_COMMENTS = 0x0400;
 253: 
 254:   /**
 255:    * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
 256:    */
 257:   public static final int REG_ICASE_USASCII = 0x0800;
 258: 
 259:   /**
 260:    * Execution flag.
 261:    * Do not move the position at which the search begins.  If not set,
 262:    * the starting position will be moved until a match is found.
 263:    */
 264:   public static final int REG_FIX_STARTING_POSITION = 0x1000;
 265: 
 266:   /** Returns a string representing the version of the gnu.regexp package.
          *
          * @return the value of the private VERSION constant.
          */
 267:   public static final String version() {
 268:     return VERSION;
 269:   }
 270: 
 271:   // Retrieves a message from the ResourceBundle.
        // The bundle named by "bundle" is loaded for the default locale on
        // first use and kept in the static "messages" field afterwards.
        // getBundle() and getString() throw MissingResourceException when the
        // bundle or the key cannot be found.
        // getBundle() is a static method declared on ResourceBundle, so it is
        // invoked through that class rather than through a subclass.
 272:   static final String getLocalizedMessage(String key) {
 273:     if (messages == null)
 274:       messages = ResourceBundle.getBundle(bundle, Locale.getDefault());
 275:     return messages.getString(key);
 276:   }
 277: 
 278:   /**
 279:    * Constructs a regular expression pattern buffer without any compilation
 280:    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 281:    *
 282:    * @param pattern A regular expression pattern, in the form of a String,
 283:    *   StringBuffer or char[].  Other input types will be converted to
 284:    *   strings using the toString() method.
         *   A char[] is used directly, without being copied first.
 285:    * @exception REException The input pattern could not be parsed.
 286:    * @exception NullPointerException The pattern was null.
 287:    */
 288:   public RE(Object pattern) throws REException {
 289:     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0); // cflags = 0, Perl 5 syntax; myIndex = 0, nextSub = 0
 290:   }
 291: 
 292:   /**
 293:    * Constructs a regular expression pattern buffer using the specified
 294:    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 295:    *
 296:    * @param pattern A regular expression pattern, in the form of a String,
 297:    *   StringBuffer, or char[].  Other input types will be converted to
 298:    *   strings using the toString() method.
 299:    * @param cflags The logical OR of any combination of the compilation flags listed above.
         *   The constants documented as compilation flags are REG_ICASE,
         *   REG_DOT_NEWLINE, REG_MULTILINE, REG_X_COMMENTS and REG_ICASE_USASCII.
 300:    * @exception REException The input pattern could not be parsed.
 301:    * @exception NullPointerException The pattern was null.
 302:    */
 303:   public RE(Object pattern, int cflags) throws REException {
 304:     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0); // Perl 5 syntax; myIndex = 0, nextSub = 0
 305:   }
 306: 
 307:   /**
 308:    * Constructs a regular expression pattern buffer using the specified
 309:    * compilation flags and regular expression syntax.
 310:    *
 311:    * @param pattern A regular expression pattern, in the form of a String,
 312:    *   StringBuffer, or char[].  Other input types will be converted to
 313:    *   strings using the toString() method.
 314:    * @param cflags The logical OR of any combination of the compilation flags listed above.
         *   The constants documented as compilation flags are REG_ICASE,
         *   REG_DOT_NEWLINE, REG_MULTILINE, REG_X_COMMENTS and REG_ICASE_USASCII.
 315:    * @param syntax The type of regular expression syntax to use.
 316:    * @exception REException The input pattern could not be parsed.
 317:    * @exception NullPointerException The pattern was null.
 318:    */
 319:   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
 320:     this(pattern,cflags,syntax,0,0); // top-level expression: myIndex = 0, nextSub = 0
 321:   }
 322: 
 323:   // internal constructor used for alternation: initialize() calls it to
        // wrap the tokens parsed so far (first..last) into a branch RE when it
        // meets an alternation operator.  No pattern text is parsed here; the
        // token list, subexpression count and length bounds are adopted as given.
 324:   private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
 325:     super(subIndex);
 326:     firstToken = first;
 327:     lastToken = last;
 328:     numSubs = subs;
 329:     minimumLength = minLength;
 330:     maximumLength = maxLength;
 331:     addToken(new RETokenEndSub(subIndex)); // presumably appended after 'last' (addToken is defined elsewhere) -- confirm
 332:   }
 333: 
 334:   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
          // Target of the public constructors (with myIndex = 0, nextSub = 0);
          // initialize() also uses it to build the RE for a parenthesized
          // subexpression.
          // NOTE(review): initialize() is protected and not final, so a
          // subclass override would be invoked from this constructor.
 335:     super(myIndex); // Subexpression index of this token.
 336:     initialize(patternObj, cflags, syntax, myIndex, nextSub); // parses the pattern into the token list
 337:   }
 338: 
 339:     // For use by subclasses: only sets subexpression index 0 and parses
          // nothing; presumably the subclass then calls initialize() itself.
 340:     protected RE() { super(0); }
 341: 
 342:     // The meat of construction
 343:   protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 344:       char[] pattern;
 345:     if (patternObj instanceof String) {
 346:       pattern = ((String) patternObj).toCharArray();
 347:     } else if (patternObj instanceof char[]) {
 348:       pattern = (char[]) patternObj;
 349:     } else if (patternObj instanceof StringBuffer) {
 350:       pattern = new char [((StringBuffer) patternObj).length()];
 351:       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 352:     } else {
 353:     pattern = patternObj.toString().toCharArray();
 354:     }
 355: 
 356:     int pLength = pattern.length;
 357: 
 358:     numSubs = 0; // Number of subexpressions in this token.
 359:     Vector branches = null;
 360: 
 361:     // linked list of tokens (sort of -- some closed loops can exist)
 362:     firstToken = lastToken = null;
 363: 
 364:     // Precalculate these so we don't pay for the math every time we
 365:     // need to access them.
 366:     boolean insens = ((cflags & REG_ICASE) > 0);
 367:     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 368: 
 369:     // Parse pattern into tokens.  Does anyone know if it's more efficient
 370:     // to use char[] than a String.charAt()?  I'm assuming so.
 371: 
 372:     // index tracks the position in the char array
 373:     int index = 0;
 374: 
 375:     // this will be the current parse character (pattern[index])
 376:     CharUnit unit = new CharUnit();
 377: 
 378:     // This is used for {x,y} calculations
 379:     IntPair minMax = new IntPair();
 380: 
 381:     // Buffer a token so we can create a TokenRepeated, etc.
 382:     REToken currentToken = null;
 383:     char ch;
 384:     boolean quot = false;
 385: 
 386:     // Saved syntax and flags.
 387:     RESyntax savedSyntax = null;
 388:     int savedCflags = 0;
 389:     boolean flagsSaved = false;
 390: 
 391:     while (index < pLength) {
 392:       // read the next character unit (including backslash escapes)
 393:       index = getCharUnit(pattern,index,unit,quot);
 394: 
 395:       if (unit.bk)
 396:         if (unit.ch == 'Q') {
 397:           quot = true;
 398:           continue;
 399:         } else if (unit.ch == 'E') {
 400:           quot = false;
 401:           continue;
 402:         }
 403:       if (quot)
 404:           unit.bk = false;
 405: 
 406:       if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) {
 407:     if (Character.isWhitespace(unit.ch)) {
 408:          continue;
 409:     }
 410:     if (unit.ch == '#') {
 411:       for (int i = index; i < pLength; i++) {
 412:         if (pattern[i] == '\n') {
 413:           index = i + 1;
 414:           continue;
 415:         }
 416:         else if (pattern[i] == '\r') {
 417:           if (i + 1 < pLength && pattern[i + 1] == '\n') {
 418:         index = i + 2;
 419:           }
 420:           else {
 421:         index = i + 1;
 422:           }
 423:           continue;
 424:         }
 425:       }
 426:       index = pLength;
 427:       continue;
 428:     }
 429:       }
 430: 
 431:       // ALTERNATION OPERATOR
 432:       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 433:       //  not available if RE_LIMITED_OPS is set
 434: 
 435:       // TODO: the '\n' literal here should be a test against REToken.newline,
 436:       // which unfortunately may be more than a single character.
 437:       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
 438:          || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
 439:        && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 440:     // make everything up to here be a branch. create vector if nec.
 441:     addToken(currentToken);
 442:     RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
 443:     minimumLength = 0;
 444:     maximumLength = 0;
 445:     if (branches == null) {
 446:         branches = new Vector();
 447:     }
 448:     branches.addElement(theBranch);
 449:     firstToken = lastToken = currentToken = null;
 450:       }
 451:       
 452:       // INTERVAL OPERATOR:
 453:       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 454:       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 455:       //
 456:       // OPEN QUESTION: 
 457:       //  what is proper interpretation of '{' at start of string?
 458:       //
 459:       // This method used to check "repeat.empty.token" to avoid such regexp
 460:       // as "(a*){2,}", but now "repeat.empty.token" is allowed.
 461: 
 462:       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
 463:     int newIndex = getMinMax(pattern,index,minMax,syntax);
 464:         if (newIndex > index) {
 465:           if (minMax.first > minMax.second)
 466:             throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
 467:           if (currentToken == null)
 468:             throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
 469:           if (currentToken instanceof RETokenRepeated) 
 470:             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
 471:           if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 472:             throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
 473:           index = newIndex;
 474:           currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); 
 475:         }
 476:         else {
 477:           addToken(currentToken);
 478:           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 479:       if (insensUSASCII) currentToken.unicodeAware = false;
 480:         } 
 481:       }
 482:       
 483:       // LIST OPERATOR:
 484:       //  [...] | [^...]
 485: 
 486:       else if ((unit.ch == '[') && !(unit.bk || quot)) {
 487:     // Create a new RETokenOneOf
 488:     ParseCharClassResult result = parseCharClass(
 489:         subIndex, pattern, index, pLength, cflags, syntax, 0);
 490:     addToken(currentToken);
 491:     currentToken = result.token;
 492:     index = result.index;
 493:       }
 494: 
 495:       // SUBEXPRESSIONS
 496:       //  (...) | \(...\) depending on RE_NO_BK_PARENS
 497: 
 498:       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
 499:     boolean pure = false;
 500:     boolean comment = false;
 501:         boolean lookAhead = false;
 502:         boolean lookBehind = false;
 503:         boolean independent = false;
 504:         boolean negativelh = false;
 505:         boolean negativelb = false;
 506:     if ((index+1 < pLength) && (pattern[index] == '?')) {
 507:       switch (pattern[index+1]) {
 508:           case '!':
 509:             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 510:               pure = true;
 511:               negativelh = true;
 512:               lookAhead = true;
 513:               index += 2;
 514:             }
 515:             break;
 516:           case '=':
 517:             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 518:               pure = true;
 519:               lookAhead = true;
 520:               index += 2;
 521:             }
 522:             break;
 523:       case '<':
 524:         // We assume that if the syntax supports look-ahead,
 525:         // it also supports look-behind.
 526:         if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 527:         index++;
 528:         switch (pattern[index +1]) {
 529:         case '!':
 530:           pure = true;
 531:           negativelb = true;
 532:           lookBehind = true;
 533:           index += 2;
 534:           break;
 535:         case '=':
 536:           pure = true;
 537:           lookBehind = true;
 538:           index += 2;
 539:         }
 540:         }
 541:         break;
 542:       case '>':
 543:         // We assume that if the syntax supports look-ahead,
 544:         // it also supports independent group.
 545:             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 546:               pure = true;
 547:               independent = true;
 548:               index += 2;
 549:             }
 550:             break;
 551:       case 'i':
 552:       case 'd':
 553:       case 'm':
 554:       case 's':
 555:       case 'u':
 556:       case 'x':
 557:       case '-':
 558:             if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
 559:         // Set or reset syntax flags.
 560:         int flagIndex = index + 1;
 561:         int endFlag = -1;
 562:         RESyntax newSyntax = new RESyntax(syntax);
 563:         int newCflags = cflags;
 564:         boolean negate = false;
 565:         while (flagIndex < pLength && endFlag < 0) {
 566:             switch(pattern[flagIndex]) {
 567:           case 'i':
 568:           if (negate)
 569:             newCflags &= ~REG_ICASE;
 570:           else
 571:             newCflags |= REG_ICASE;
 572:           flagIndex++;
 573:           break;
 574:           case 'd':
 575:           if (negate)
 576:             newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
 577:           else
 578:             newSyntax.setLineSeparator("\n");
 579:           flagIndex++;
 580:           break;
 581:           case 'm':
 582:           if (negate)
 583:             newCflags &= ~REG_MULTILINE;
 584:           else
 585:             newCflags |= REG_MULTILINE;
 586:           flagIndex++;
 587:           break;
 588:           case 's':
 589:           if (negate)
 590:             newCflags &= ~REG_DOT_NEWLINE;
 591:           else
 592:             newCflags |= REG_DOT_NEWLINE;
 593:           flagIndex++;
 594:           break;
 595:           case 'u':
 596:           if (negate)
 597:             newCflags |= REG_ICASE_USASCII;
 598:           else
 599:             newCflags &= ~REG_ICASE_USASCII;
 600:           flagIndex++;
 601:           break;
 602:           case 'x':
 603:           if (negate)
 604:             newCflags &= ~REG_X_COMMENTS;
 605:           else
 606:             newCflags |= REG_X_COMMENTS;
 607:           flagIndex++;
 608:           break;
 609:           case '-':
 610:           negate = true;
 611:           flagIndex++;
 612:           break;
 613:         case ':':
 614:         case ')':
 615:           endFlag = pattern[flagIndex];
 616:           break;
 617:         default:
 618:                   throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 619:         }
 620:         }
 621:         if (endFlag == ')') {
 622:         syntax = newSyntax;
 623:         cflags = newCflags;
 624:         insens = ((cflags & REG_ICASE) > 0);
 625:         insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 626:         // This can be treated as though it were a comment.
 627:         comment = true;
 628:         index = flagIndex - 1;
 629:         break;
 630:         }
 631:         if (endFlag == ':') {
 632:         savedSyntax = syntax;
 633:         savedCflags = cflags;
 634:         flagsSaved = true;
 635:         syntax = newSyntax;
 636:         cflags = newCflags;
 637:         insens = ((cflags & REG_ICASE) > 0);
 638:         insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 639:         index = flagIndex -1;
 640:         // Fall through to the next case.
 641:         }
 642:         else {
 643:             throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 644:         }
 645:       case ':':
 646:         if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 647:           pure = true;
 648:           index += 2;
 649:         }
 650:         break;
 651:       case '#':
 652:         if (syntax.get(RESyntax.RE_COMMENTS)) {
 653:           comment = true;
 654:         }
 655:         break;
 656:           default:
 657:             throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 658:       }
 659:     }
 660: 
 661:     if (index >= pLength) {
 662:         throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 663:     }
 664: 
 665:     // find end of subexpression
 666:     int endIndex = index;
 667:     int nextIndex = index;
 668:     int nested = 0;
 669: 
 670:     while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
 671:         && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) {
 672:       if ((endIndex = nextIndex) >= pLength)
 673:         throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 674:       else if ((unit.ch == '[') && !(unit.bk || quot)) {
 675:         // I hate to do something similar to the LIST OPERATOR matters
 676:         // above, but ...
 677:         int listIndex = nextIndex;
 678:         if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 679:         if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 680:         int listEndIndex = -1;
 681:         int listNest = 0;
 682:         while (listIndex < pLength && listEndIndex < 0) {
 683:           switch(pattern[listIndex++]) {
 684:         case '\\':
 685:           listIndex++;
 686:           break;
 687:         case '[':
 688:           // Sun's API document says that regexp like "[a-d[m-p]]"
 689:           // is legal. Even something like "[[[^]]]]" is accepted.
 690:           listNest++;
 691:           if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 692:           if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 693:           break;
 694:         case ']':
 695:           if (listNest == 0)
 696:             listEndIndex = listIndex;
 697:           listNest--;
 698:           break;
 699:           }
 700:         }
 701:         if (listEndIndex >= 0) {
 702:           nextIndex = listEndIndex;
 703:           if ((endIndex = nextIndex) >= pLength)
 704:             throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 705:           else
 706:             continue;
 707:         }
 708:         throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 709:       }
 710:       else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 711:         nested++;
 712:       else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 713:         nested--;
 714:     }
 715: 
 716:     // endIndex is now position at a ')','\)' 
 717:     // nextIndex is end of string or position after ')' or '\)'
 718: 
 719:     if (comment) index = nextIndex;
 720:     else { // not a comment
 721:       // create RE subexpression as token.
 722:       addToken(currentToken);
 723:       if (!pure) {
 724:         numSubs++;
 725:       }
 726: 
 727:       int useIndex = (pure || lookAhead || lookBehind || independent) ?
 728:              0 : nextSub + numSubs;
 729:       currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 730:       numSubs += ((RE) currentToken).getNumSubs();
 731: 
 732:           if (lookAhead) {
 733:           currentToken = new RETokenLookAhead(currentToken,negativelh);
 734:       }
 735:           else if (lookBehind) {
 736:           currentToken = new RETokenLookBehind(currentToken,negativelb);
 737:       }
 738:           else if (independent) {
 739:           currentToken = new RETokenIndependent(currentToken);
 740:       }
 741: 
 742:       index = nextIndex;
 743:       if (flagsSaved) {
 744:           syntax = savedSyntax;
 745:           cflags = savedCflags;
 746:           insens = ((cflags & REG_ICASE) > 0);
 747:           insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 748:           flagsSaved = false;
 749:       }
 750:     } // not a comment
 751:       } // subexpression
 752:     
 753:       // UNMATCHED RIGHT PAREN
 754:       // ) or \) throw exception if
 755:       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 756:       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
 757:     throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
 758:       }
 759: 
 760:       // START OF LINE OPERATOR
 761:       //  ^
 762: 
 763:       else if ((unit.ch == '^') && !(unit.bk || quot)) {
 764:     addToken(currentToken);
 765:     currentToken = null;
 766:     RETokenStart token = null;
 767:     if ((cflags & REG_MULTILINE) > 0) {
 768:         String sep = syntax.getLineSeparator();
 769:         if (sep == null) {
 770:             token = new RETokenStart(subIndex, null, true);
 771:         }
 772:         else {
 773:         token = new RETokenStart(subIndex, sep);
 774:         }
 775:     }
 776:     else {
 777:         token = new RETokenStart(subIndex, null);
 778:     }
 779:     addToken(token);
 780:       }
 781: 
 782:       // END OF LINE OPERATOR
 783:       //  $
 784: 
 785:       else if ((unit.ch == '$') && !(unit.bk || quot)) {
 786:     addToken(currentToken);
 787:     currentToken = null;
 788:     RETokenEnd token = null;
 789:     if ((cflags & REG_MULTILINE) > 0) {
 790:         String sep = syntax.getLineSeparator();
 791:         if (sep == null) {
 792:             token = new RETokenEnd(subIndex, null, true);
 793:         }
 794:         else {
 795:         token = new RETokenEnd(subIndex, sep);
 796:         }
 797:     }
 798:     else {
 799:         token = new RETokenEnd(subIndex, null);
 800:     }
 801:     addToken(token);
 802:       }
 803: 
 804:       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 805:       //  .
 806: 
 807:       else if ((unit.ch == '.') && !(unit.bk || quot)) {
 808:     addToken(currentToken);
 809:     currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 810:       }
 811: 
 812:       // ZERO-OR-MORE REPEAT OPERATOR
 813:       //  *
 814:       //
 815:       // This method used to check "repeat.empty.token" to avoid such regexp
 816:       // as "(a*)*", but now "repeat.empty.token" is allowed.
 817: 
 818:       else if ((unit.ch == '*') && !(unit.bk || quot)) {
 819:     if (currentToken == null)
 820:           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 821:     if (currentToken instanceof RETokenRepeated)
 822:           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 823:     if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 824:       throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 825:     currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 826:       }
 827: 
 828:       // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
 829:       //  + | \+ depending on RE_BK_PLUS_QM
 830:       //  not available if RE_LIMITED_OPS is set
 831:       //
 832:       // This method used to check "repeat.empty.token" to avoid such regexp
 833:       // as "(a*)+", but now "repeat.empty.token" is allowed.
 834: 
 835:       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 836:     if (currentToken == null)
 837:           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 838:     
 839:     // Check for possessive matching on RETokenRepeated
 840:     if (currentToken instanceof RETokenRepeated) {
 841:       RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 842:       if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
 843:         tokenRep.makePossessive();
 844:       else
 845:         throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 846: 
 847:     }
 848:     else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 849:       throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 850:     else
 851:       currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 852:       }
 853: 
 854:       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 855:       //  ? | \? depending on RE_BK_PLUS_QM
 856:       //  not available if RE_LIMITED_OPS is set
 857:       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 858: 
 859:       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 860:     if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 861: 
 862:     // Check for stingy matching on RETokenRepeated
 863:     if (currentToken instanceof RETokenRepeated) {
 864:       RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 865:       if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
 866:         tokenRep.makeStingy();
 867:       else
 868:         throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 869:     }
 870:     else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 871:       throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 872:     else
 873:       currentToken = setRepeated(currentToken,0,1,index);
 874:       }
 875: 
 876:       // OCTAL CHARACTER
 877:       //  \0377
 878:     
 879:       else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
 880:     CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
 881:     if (ce == null)
 882:       throw new REException("invalid octal character", REException.REG_ESCAPE, index);
 883:     index = index - 2 + ce.len;
 884:     addToken(currentToken);
 885:     currentToken = new RETokenChar(subIndex,ce.ch,insens);
 886:     if (insensUSASCII) currentToken.unicodeAware = false;
 887:       }
 888: 
 889:       // BACKREFERENCE OPERATOR
 890:       //  \1 \2 ... \9 and \10 \11 \12 ...
 891:       // not available if RE_NO_BK_REFS is set
 892:       // Perl recognizes \10, \11, and so on only if enough number of
 893:       // parentheses have opened before it, otherwise they are treated
 894:       // as aliases of \010, \011, ... (octal characters).  In case of
 895:       // Sun's JDK, octal character expression must always begin with \0.
 896:       // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
 897:       // JDK treats \2 as a back reference to the 2nd group because
 898:       // there are only two groups. But in our poor implementation,
 899:       // we cannot help but treat \29 as a back reference to the 29th group.
 900: 
 901:       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 902:     addToken(currentToken);
 903:     int numBegin = index - 1;
 904:     int numEnd = pLength;
 905:     for (int i = index; i < pLength; i++) {
 906:         if (! Character.isDigit(pattern[i])) {
 907:         numEnd = i;
 908:         break;
 909:         }
 910:     }
 911:     int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
 912: 
 913:     currentToken = new RETokenBackRef(subIndex,num,insens);
 914:     if (insensUSASCII) currentToken.unicodeAware = false;
 915:     index = numEnd;
 916:       }
 917: 
 918:       // START OF STRING OPERATOR
 919:       //  \A if RE_STRING_ANCHORS is set
 920:       
 921:       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 922:     addToken(currentToken);
 923:     currentToken = new RETokenStart(subIndex,null);
 924:       }
 925: 
 926:       // WORD BREAK OPERATOR
 927:       //  \b if ????
 928: 
 929:       else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 930:       addToken(currentToken);
 931:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 932:       } 
 933: 
 934:       // WORD BEGIN OPERATOR 
 935:       //  \< if ????
 936:       else if (unit.bk && (unit.ch == '<')) {
 937:       addToken(currentToken);
 938:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 939:       } 
 940: 
 941:       // WORD END OPERATOR 
 942:       //  \> if ????
 943:       else if (unit.bk && (unit.ch == '>')) {
 944:       addToken(currentToken);
 945:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 946:       } 
 947: 
 948:       // NON-WORD BREAK OPERATOR
 949:       // \B if ????
 950: 
 951:       else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 952:       addToken(currentToken);
 953:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 954:       } 
 955: 
 956:       
 957:       // DIGIT OPERATOR
 958:       //  \d if RE_CHAR_CLASS_ESCAPES is set
 959:       
 960:       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 961:     addToken(currentToken);
 962:     currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 963:     if (insensUSASCII) currentToken.unicodeAware = false;
 964:       }
 965: 
 966:       // NON-DIGIT OPERATOR
 967:       //  \D
 968: 
 969:     else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 970:       addToken(currentToken);
 971:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 972:       if (insensUSASCII) currentToken.unicodeAware = false;
 973:     }
 974: 
 975:     // NEWLINE ESCAPE
 976:         //  \n
 977: 
 978:     else if (unit.bk && (unit.ch == 'n')) {
 979:       addToken(currentToken);
 980:       currentToken = new RETokenChar(subIndex,'\n',false);
 981:     }
 982: 
 983:     // RETURN ESCAPE
 984:         //  \r
 985: 
 986:     else if (unit.bk && (unit.ch == 'r')) {
 987:       addToken(currentToken);
 988:       currentToken = new RETokenChar(subIndex,'\r',false);
 989:     }
 990: 
 991:     // WHITESPACE OPERATOR
 992:         //  \s if RE_CHAR_CLASS_ESCAPES is set
 993: 
 994:     else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 995:       addToken(currentToken);
 996:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 997:       if (insensUSASCII) currentToken.unicodeAware = false;
 998:     }
 999: 
1000:     // NON-WHITESPACE OPERATOR
1001:         //  \S
1002: 
1003:     else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1004:       addToken(currentToken);
1005:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
1006:       if (insensUSASCII) currentToken.unicodeAware = false;
1007:     }
1008: 
1009:     // TAB ESCAPE
1010:         //  \t
1011: 
1012:     else if (unit.bk && (unit.ch == 't')) {
1013:       addToken(currentToken);
1014:       currentToken = new RETokenChar(subIndex,'\t',false);
1015:     }
1016: 
1017:     // ALPHANUMERIC OPERATOR
1018:         //  \w
1019: 
1020:     else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1021:       addToken(currentToken);
1022:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
1023:       if (insensUSASCII) currentToken.unicodeAware = false;
1024:     }
1025: 
1026:     // NON-ALPHANUMERIC OPERATOR
1027:         //  \W
1028: 
1029:     else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1030:       addToken(currentToken);
1031:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
1032:       if (insensUSASCII) currentToken.unicodeAware = false;
1033:     }
1034: 
1035:     // END OF STRING OPERATOR
1036:         //  \Z, \z
1037: 
1038:     // FIXME: \Z and \z are different in that if the input string
1039:     // ends with a line terminator, \Z matches the position before
1040:     // the final terminator.  This special behavior of \Z is yet
1041:     // to be implemented.
1042: 
1043:     else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1044:          syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1045:       addToken(currentToken);
1046:       currentToken = new RETokenEnd(subIndex,null);
1047:     }
1048: 
1049:         // HEX CHARACTER, UNICODE CHARACTER
1050:         //  \x1B, \u1234
1051:     
1052:     else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1053:          (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1054:       CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
1055:       if (ce == null)
1056:         throw new REException("invalid hex character", REException.REG_ESCAPE, index);
1057:       index = index - 2 + ce.len;
1058:       addToken(currentToken);
1059:       currentToken = new RETokenChar(subIndex,ce.ch,insens);
1060:       if (insensUSASCII) currentToken.unicodeAware = false;
1061:     }
1062: 
1063:     // NAMED PROPERTY
1064:     // \p{prop}, \P{prop}
1065: 
1066:     else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
1067:              (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
1068:       NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
1069:       if (np == null)
1070:           throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1071:       index = index - 2 + np.len;
1072:       addToken(currentToken);
1073:       currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
1074:       if (insensUSASCII) currentToken.unicodeAware = false;
1075:     }
1076: 
1077:     // END OF PREVIOUS MATCH
1078:         //  \G
1079: 
1080:     else if (unit.bk && (unit.ch == 'G') &&
1081:          syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1082:       addToken(currentToken);
1083:       currentToken = new RETokenEndOfPreviousMatch(subIndex);
1084:     }
1085: 
1086:     // NON-SPECIAL CHARACTER (or escape to make literal)
1087:         //  c | \* for example
1088: 
1089:     else {  // not a special character
1090:       addToken(currentToken);
1091:       currentToken = new RETokenChar(subIndex,unit.ch,insens);
1092:       if (insensUSASCII) currentToken.unicodeAware = false;
1093:     } 
1094:       } // end while
1095: 
1096:     // Add final buffered token and an EndSub marker
1097:     addToken(currentToken);
1098:       
1099:     if (branches != null) {
1100:     branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
1101:     branches.trimToSize(); // compact the Vector
1102:     minimumLength = 0;
1103:     maximumLength = 0;
1104:     firstToken = lastToken = null;
1105:     addToken(new RETokenOneOf(subIndex,branches,false));
1106:     } 
1107:     else addToken(new RETokenEndSub(subIndex));
1108: 
1109:   }
1110: 
1111:   private static class ParseCharClassResult {
1112:       RETokenOneOf token;
1113:       int index;
1114:       boolean returnAtAndOperator = false;
1115:   }
1116: 
 1117:   /**
 1118:    * Parse [...] or [^...] and make an RETokenOneOf instance.
 1119:    * @param subIndex subIndex to be given to the created RETokenOneOf instance.
 1120:    * @param pattern Input array of characters to be parsed.
 1121:    * @param index Index pointing to the character next to the beginning '['.
 1122:    * @param pLength Limit of the input array.
 1123:    * @param cflags Compilation flags used to parse the pattern.
 1124:    * @param pflags Flags that affect the behavior of this method.
          *        Only bit 0x01 (RETURN_AT_AND) is examined: when it is set,
          *        parsing stops and returns as soon as an "&&" operator is
          *        recognized.
 1125:    * @param syntax Syntax used to parse the pattern.
          * @return A ParseCharClassResult whose token is the RETokenOneOf built
          *         from the list and whose index is one past the closing ']'.
          *         When parsing stopped at "&&" (see pflags), index is the
          *         position of the first '&' and returnAtAndOperator is true.
          * @throws REException if the list is not terminated by ']' or contains
          *         an invalid escape sequence.
 1126:    */
 1127:   private static ParseCharClassResult parseCharClass(int subIndex,
 1128:         char[] pattern, int index,
 1129:         int pLength, int cflags, RESyntax syntax, int pflags)
 1130:         throws REException {
 1131: 
 1132:     boolean insens = ((cflags & REG_ICASE) > 0);
 1133:     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
           // options: tokens for the items listed directly in this class.
           // addition: nested classes and "&&" operands, interleaved with
           // the operator markers "|", "&" and Boolean.FALSE.
 1134:     Vector options = new Vector();
 1135:     Vector addition = new Vector();
 1136:     boolean additionAndAppeared = false;
 1137:     final int RETURN_AT_AND = 0x01;
 1138:     boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
 1139:     boolean negative = false;
 1140:     char ch;
 1141: 
           // lastChar is a literal that has been read but not yet emitted,
           // because a following '-' may make it the low end of a range.
 1142:     char lastChar = 0;
 1143:     boolean lastCharIsSet = false;
 1144:     if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
 1145:     
 1146:     // Check for initial caret, negation
 1147:     if ((ch = pattern[index]) == '^') {
 1148:       negative = true;
 1149:       if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 1150:       ch = pattern[index];
 1151:     }
 1152: 
 1153:     // Check for leading right bracket literal
 1154:     if (ch == ']') {
 1155:       lastChar = ch; lastCharIsSet = true;
 1156:       if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 1157:     }
 1158: 
 1159:     while ((ch = pattern[index++]) != ']') {
 1160:       if ((ch == '-') && (lastCharIsSet)) {
 1161:         if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 1162:         if ((ch = pattern[index]) == ']') {
                 // "x-]": the '-' is a literal; emit x now and keep '-' pending.
 1163:           RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
 1164:           if (insensUSASCII) t.unicodeAware = false;
 1165:           options.addElement(t);
 1166:           lastChar = '-';
 1167:         } else {
                 // "x-y": a range; y may itself be written as an escape.
 1168:           if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
 1169:             CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
 1170:             if (ce == null)
 1171:           throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
 1172:         ch = ce.ch;
 1173:         index = index + ce.len - 1;
 1174:           }
 1175:           RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
 1176:           if (insensUSASCII) t.unicodeAware = false;
 1177:           options.addElement(t);
 1178:           lastChar = 0; lastCharIsSet = false;
 1179:           index++;
 1180:         }
 1181:           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
 1182:             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 1183:         int posixID = -1;
 1184:         boolean negate = false;
 1185:             char asciiEsc = 0;
 1186:         boolean asciiEscIsSet = false;
 1187:         NamedProperty np = null;
 1188:         if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
               // Each upper-case case sets negate and deliberately falls
               // through to its lower-case counterpart.
 1189:           switch (pattern[index]) {
 1190:           case 'D':
 1191:         negate = true;
 1192:           case 'd':
 1193:         posixID = RETokenPOSIX.DIGIT;
 1194:         break;
 1195:           case 'S':
 1196:         negate = true;
 1197:           case 's':
 1198:         posixID = RETokenPOSIX.SPACE;
 1199:         break;
 1200:           case 'W':
 1201:         negate = true;
 1202:           case 'w':
 1203:         posixID = RETokenPOSIX.ALNUM;
 1204:         break;
 1205:           }
 1206:         }
 1207:         if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
 1208:           np = getNamedProperty(pattern, index - 1, pLength);
 1209:           if (np == null)
 1210:         throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
 1211:           index = index - 1 + np.len - 1;
 1212:         }
 1213:         else {
               // This also runs for the class escapes recognized above; in
               // that case its result only advances index, because posixID
               // is tested first below.
 1214:           CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
 1215:           if (ce == null)
 1216:         throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
 1217:           asciiEsc = ce.ch; asciiEscIsSet = true;
 1218:           index = index - 1 + ce.len - 1;
 1219:         }
               // Flush the pending literal before handling the escape.
 1220:         if (lastCharIsSet) {
 1221:           RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
 1222:           if (insensUSASCII) t.unicodeAware = false;
 1223:           options.addElement(t);
 1224:         }
 1225:         
 1226:         if (posixID != -1) {
 1227:           RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
 1228:           if (insensUSASCII) t.unicodeAware = false;
 1229:           options.addElement(t);
 1230:         } else if (np != null) {
 1231:           RETokenNamedProperty t = getRETokenNamedProperty(subIndex,np,insens,index);
 1232:           if (insensUSASCII) t.unicodeAware = false;
 1233:           options.addElement(t);
 1234:         } else if (asciiEscIsSet) {
 1235:           lastChar = asciiEsc; lastCharIsSet = true;
 1236:         } else {
                 // NOTE(review): this branch looks unreachable, since
                 // asciiEscIsSet is always set above when np is null. Confirm.
 1237:           lastChar = pattern[index]; lastCharIsSet = true;
 1238:         }
 1239:         ++index;
 1240:       } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
               // POSIX class such as [:alpha:]; a name for which
               // RETokenPOSIX.intValue returns -1 adds nothing to options.
 1241:         StringBuffer posixSet = new StringBuffer();
 1242:         index = getPosixSet(pattern,index+1,posixSet);
 1243:         int posixId = RETokenPOSIX.intValue(posixSet.toString());
 1244:         if (posixId != -1) {
 1245:           RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
 1246:           if (insensUSASCII) t.unicodeAware = false;
 1247:           options.addElement(t);
 1248:         }
 1249:       } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
               // Nested class: parse it recursively and record it as a union.
 1250:         ParseCharClassResult result = parseCharClass(
 1251:             subIndex, pattern, index, pLength, cflags, syntax, 0);
 1252:         addition.addElement(result.token);
 1253:         addition.addElement("|");
 1254:         index = result.index;
 1255:       } else if ((ch == '&') &&
 1256:              (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
 1257:              (index < pLength) && (pattern[index] == '&')) {
 1258:         if (returnAtAndOperator) {
                   // We are parsing an unbracketed "&&" operand: stop here and
                   // report the position of the first '&' to the caller.
 1259:             ParseCharClassResult result = new ParseCharClassResult(); 
 1260:             options.trimToSize();
 1261:             if (additionAndAppeared) addition.addElement("&");
 1262:             if (addition.size() == 0) addition = null;
 1263:             result.token = new RETokenOneOf(subIndex,
 1264:             options, addition, negative);
 1265:             result.index = index - 1;
 1266:             result.returnAtAndOperator = true;
 1267:             return result;
 1268:         }
 1269:         // The precedence of the operator "&&" is the lowest.
 1270:         // So we postpone adding "&" until other elements
 1271:         // are added. And we insert Boolean.FALSE at the
 1272:         // beginning of the list of tokens following "&&".
 1273:         // So, "&&[a-b][k-m]" will be stored in the Vector
 1274:         // addition in this order:
 1275:         //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
 1276:         if (additionAndAppeared) addition.addElement("&");
 1277:         addition.addElement(Boolean.FALSE);
 1278:         additionAndAppeared = true;
 1279: 
 1280:         // The part on which "&&" operates may be either
 1281:         //   (1) explicitly enclosed by []
 1282:         //   or
 1283:         //   (2) not enclosed by [] and terminated by the
 1284:         //       next "&&" or the end of the character list.
 1285:             //  Let the preceding else if block do the case (1).
 1286:         //  We must do something in case of (2).
               // NOTE(review): in case (1) index is left on the second '&' of
               // "&&", so the next loop iteration appears to take the
               // plain-character branch for that '&'. Confirm this is intended.
 1287:         if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
 1288:             ParseCharClassResult result = parseCharClass(
 1289:             subIndex, pattern, index+1, pLength, cflags, syntax,
 1290:             RETURN_AT_AND);
 1291:             addition.addElement(result.token);
 1292:             addition.addElement("|");
 1293:             // If the method returned at the next "&&", it is OK.
 1294:             // Otherwise we have eaten the mark of the end of this
 1295:             // character list "]".  In this case we must give back
 1296:             // the end mark.
 1297:             index = (result.returnAtAndOperator ?
 1298:             result.index: result.index - 1);
 1299:         }
 1300:       } else {
               // Plain character: emit the previously pending literal and
               // make this one the pending literal.
 1301:         if (lastCharIsSet) {
 1302:           RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
 1303:           if (insensUSASCII) t.unicodeAware = false;
 1304:           options.addElement(t);
 1305:         }
 1306:         lastChar = ch; lastCharIsSet = true;
 1307:       }
 1308:       if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 1309:     } // while in list
 1310:     // Out of list, index is one past ']'
 1311:         
           // Emit the literal that was still pending when ']' was reached.
 1312:     if (lastCharIsSet) {
 1313:       RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
 1314:       if (insensUSASCII) t.unicodeAware = false;
 1315:       options.addElement(t);
 1316:     }
 1317:        
 1318:     ParseCharClassResult result = new ParseCharClassResult(); 
 1319:     // Create a new RETokenOneOf
 1320:     options.trimToSize();
 1321:     if (additionAndAppeared) addition.addElement("&");
 1322:     if (addition.size() == 0) addition = null;
 1323:     result.token = new RETokenOneOf(subIndex,options, addition, negative);
 1324:     result.index = index;
 1325:     return result;
 1326:   }
1327: 
1328:   private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
1329:     unit.ch = input[index++];
1330:     unit.bk = (unit.ch == '\\'
1331:            && (!quot || index >= input.length || input[index] == 'E'));
1332:     if (unit.bk)
1333:       if (index < input.length)
1334:     unit.ch = input[index++];
1335:       else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
1336:     return index;
1337:   }
1338: 
1339:   private static int parseInt(char[] input, int pos, int len, int radix) {
1340:     int ret = 0;
1341:     for (int i = pos; i < pos + len; i++) {
1342:     ret = ret * radix + Character.digit(input[i], radix);
1343:     }
1344:     return ret;
1345:   }
1346: 
1347:   /**
1348:    * This class represents various expressions for a character.
1349:    * "a"      : 'a' itself.
1350:    * "\0123"  : Octal char 0123
1351:    * "\x1b"   : Hex char 0x1b
1352:    * "\u1234" : Unicode char \u1234
1353:    */
1354:   private static class CharExpression {
1355:     /** character represented by this expression */
1356:     char ch;
1357:     /** String expression */
1358:     String expr;
1359:     /** length of this expression */
1360:     int len;
1361:     public String toString() { return expr; }
1362:   }
1363: 
1364:   private static CharExpression getCharExpression(char[] input, int pos, int lim,
1365:         RESyntax syntax) {
1366:     CharExpression ce = new CharExpression();
1367:     char c = input[pos];
1368:     if (c == '\\') {
1369:       if (pos + 1 >= lim) return null;
1370:       c = input[pos + 1];
1371:       switch(c) {
1372:       case 't':
1373:         ce.ch = '\t';
1374:         ce.len = 2;
1375:         break;
1376:       case 'n':
1377:         ce.ch = '\n';
1378:         ce.len = 2;
1379:         break;
1380:       case 'r':
1381:         ce.ch = '\r';
1382:         ce.len = 2;
1383:         break;
1384:       case 'x':
1385:       case 'u':
1386:         if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1387:             (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1388:           int l = 0;
1389:           int expectedLength = (c == 'x' ? 2 : 4);
1390:           for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
1391:             if (i >= lim) break;
1392:             if (!((input[i] >= '0' && input[i] <= '9') ||
1393:                   (input[i] >= 'A' && input[i] <= 'F') ||
1394:                   (input[i] >= 'a' && input[i] <= 'f')))
1395:                 break;
1396:         l++;
1397:           }
1398:           if (l != expectedLength) return null;
1399:           ce.ch = (char)(parseInt(input, pos + 2, l, 16));
1400:       ce.len = l + 2;
1401:         }
1402:         else {
1403:           ce.ch = c;
1404:           ce.len = 2;
1405:         }
1406:         break;
1407:       case '0':
1408:         if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
1409:           int l = 0;
1410:           for (int i = pos + 2; i < pos + 2 + 3; i++) {
1411:             if (i >= lim) break;
1412:         if (input[i] < '0' || input[i] > '7') break;
1413:             l++;
1414:           }
1415:           if (l == 3 && input[pos + 2] > '3') l--;
1416:           if (l <= 0) return null;
1417:           ce.ch = (char)(parseInt(input, pos + 2, l, 8));
1418:           ce.len = l + 2;
1419:         }
1420:         else {
1421:           ce.ch = c;
1422:           ce.len = 2;
1423:         }
1424:         break;
1425:       default:
1426:         ce.ch = c;
1427:         ce.len = 2;
1428:         break;
1429:       }
1430:     }
1431:     else {
1432:       ce.ch = input[pos];
1433:       ce.len = 1;
1434:     }
1435:     ce.expr = new String(input, pos, ce.len);
1436:     return ce;
1437:   }
1438: 
1439:   /**
1440:    * This class represents a substring in a pattern string expressing
1441:    * a named property.
1442:    * "\pA"      : Property named "A"
1443:    * "\p{prop}" : Property named "prop"
1444:    * "\PA"      : Property named "A" (Negated)
1445:    * "\P{prop}" : Property named "prop" (Negated)
1446:    */
1447:   private static class NamedProperty {
1448:     /** Property name */
1449:     String name;
1450:     /** Negated or not */
1451:     boolean negate;
1452:     /** length of this expression */
1453:     int len;
1454:   }
1455: 
1456:   private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
1457:     NamedProperty np = new NamedProperty();
1458:     char c = input[pos];
1459:     if (c == '\\') {
1460:       if (++pos >= lim) return null;
1461:       c = input[pos++];
1462:       switch(c) {
1463:       case 'p':
1464:         np.negate = false;
1465:         break;
1466:       case 'P':
1467:         np.negate = true;
1468:         break;
1469:       default:
1470:     return null;
1471:       }
1472:       c = input[pos++];
1473:       if (c == '{') {
1474:           int p = -1;
1475:       for (int i = pos; i < lim; i++) {
1476:           if (input[i] == '}') {
1477:           p = i;
1478:           break;
1479:           }
1480:       }
1481:       if (p < 0) return null;
1482:       int len = p - pos;
1483:           np.name = new String(input, pos, len);
1484:       np.len = len + 4;
1485:       }
1486:       else {
1487:           np.name = new String(input, pos - 1, 1);
1488:       np.len = 3;
1489:       }
1490:       return np;
1491:     }
1492:     else return null;
1493:   }
1494: 
1495:   private static RETokenNamedProperty getRETokenNamedProperty(
1496:       int subIndex, NamedProperty np, boolean insens, int index)
1497:       throws REException {
1498:     try {
1499:     return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
1500:     }
1501:     catch (REException e) {
1502:     REException ree;
1503:     ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
1504:     ree.initCause(e);
1505:     throw ree;
1506:     }
1507:   }
1508: 
1509:   /**
1510:    * Checks if the regular expression matches the input in its entirety.
1511:    *
1512:    * @param input The input text.
1513:    */
1514:   public boolean isMatch(Object input) {
1515:     return isMatch(input,0,0);
1516:   }
1517:   
1518:   /**
1519:    * Checks if the input string, starting from index, is an exact match of
1520:    * this regular expression.
1521:    *
1522:    * @param input The input text.
1523:    * @param index The offset index at which the search should be begin.
1524:    */
1525:   public boolean isMatch(Object input,int index) {
1526:     return isMatch(input,index,0);
1527:   }
1528:   
1529: 
1530:   /**
1531:    * Checks if the input, starting from index and using the specified
1532:    * execution flags, is an exact match of this regular expression.
1533:    *
1534:    * @param input The input text.
1535:    * @param index The offset index at which the search should be begin.
1536:    * @param eflags The logical OR of any execution flags above.
1537:    */
1538:   public boolean isMatch(Object input,int index,int eflags) {
1539:     return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1540:   }
1541: 
1542:   private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1543:     if (firstToken == null)  // Trivial case
1544:       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1545:     REMatch m = new REMatch(numSubs, index, eflags);
1546:     if (firstToken.match(input, m)) {
1547:     if (m != null) {
1548:         if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1549:         return true;
1550:         }
1551:     }
1552:     }
1553:     return false;
1554:   }
1555:     
1556:   /**
1557:    * Returns the maximum number of subexpressions in this regular expression.
1558:    * If the expression contains branches, the value returned will be the
1559:    * maximum subexpressions in any of the branches.
1560:    */
1561:   public int getNumSubs() {
1562:     return numSubs;
1563:   }
1564: 
1565:   // Overrides REToken.setUncle
1566:   void setUncle(REToken uncle) {
1567:       if (lastToken != null) {
1568:       lastToken.setUncle(uncle);
1569:       } else super.setUncle(uncle); // to deal with empty subexpressions
1570:   }
1571: 
1572:   // Overrides REToken.chain
1573: 
1574:   boolean chain(REToken next) {
1575:     super.chain(next);
1576:     setUncle(next);
1577:     return true;
1578:   }
1579: 
1580:   /**
1581:    * Returns the minimum number of characters that could possibly
1582:    * constitute a match of this regular expression.
1583:    */
1584:   public int getMinimumLength() {
1585:       return minimumLength;
1586:   }
1587: 
1588:   public int getMaximumLength() {
1589:       return maximumLength;
1590:   }
1591: 
1592:   /**
1593:    * Returns an array of all matches found in the input.
1594:    *
1595:    * If the regular expression allows the empty string to match, it will
1596:    * substitute matches at all positions except the end of the input.
1597:    *
1598:    * @param input The input text.
1599:    * @return a non-null (but possibly zero-length) array of matches
1600:    */
1601:   public REMatch[] getAllMatches(Object input) {
1602:     return getAllMatches(input,0,0);
1603:   }
1604: 
1605:   /**
1606:    * Returns an array of all matches found in the input,
1607:    * beginning at the specified index position.
1608:    *
1609:    * If the regular expression allows the empty string to match, it will
1610:    * substitute matches at all positions except the end of the input.
1611:    *
1612:    * @param input The input text.
1613:    * @param index The offset index at which the search should be begin.
1614:    * @return a non-null (but possibly zero-length) array of matches
1615:    */
1616:   public REMatch[] getAllMatches(Object input, int index) {
1617:     return getAllMatches(input,index,0);
1618:   }
1619: 
1620:   /**
1621:    * Returns an array of all matches found in the input string,
1622:    * beginning at the specified index position and using the specified
1623:    * execution flags.
1624:    *
1625:    * If the regular expression allows the empty string to match, it will
1626:    * substitute matches at all positions except the end of the input.
1627:    *
1628:    * @param input The input text.
1629:    * @param index The offset index at which the search should be begin.
1630:    * @param eflags The logical OR of any execution flags above.
1631:    * @return a non-null (but possibly zero-length) array of matches
1632:    */
1633:   public REMatch[] getAllMatches(Object input, int index, int eflags) {
1634:     return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
1635:   }
1636: 
1637:   // this has been changed since 1.03 to be non-overlapping matches
1638:   private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
1639:     Vector all = new Vector();
1640:     REMatch m = null;
1641:     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
1642:       all.addElement(m);
1643:       index = m.getEndIndex();
1644:       if (m.end[0] == 0) {   // handle pathological case of zero-length match
1645:     index++;
1646:     input.move(1);
1647:       } else {
1648:     input.move(m.end[0]);
1649:       }
1650:       if (!input.isValid()) break;
1651:     }
1652:     REMatch[] mset = new REMatch[all.size()];
1653:     all.copyInto(mset);
1654:     return mset;
1655:   }
1656:   
1657:     /* Implements abstract method REToken.match() */
1658:     boolean match(CharIndexed input, REMatch mymatch) {
1659:         input.setHitEnd(mymatch);
1660:     if (firstToken == null) {
1661:         return next(input, mymatch);
1662:     }
1663: 
1664:     // Note the start of this subexpression
1665:     mymatch.start1[subIndex] = mymatch.index;
1666: 
1667:     return firstToken.match(input, mymatch);
1668:     }
1669: 
1670:     REMatch findMatch(CharIndexed input, REMatch mymatch) {
1671:         if (mymatch.backtrackStack == null)
1672:       mymatch.backtrackStack = new BacktrackStack();
1673:     boolean b = match(input, mymatch);
1674:     if (b) {
1675:         return mymatch;
1676:     }
1677:     return null;
1678:     }
1679: 
1680:   /**
1681:    * Returns the first match found in the input.  If no match is found,
1682:    * null is returned.
1683:    *
1684:    * @param input The input text.
1685:    * @return An REMatch instance referencing the match, or null if none.
1686:    */
1687:   public REMatch getMatch(Object input) {
1688:     return getMatch(input,0,0);
1689:   }
1690:   
1691:   /**
1692:    * Returns the first match found in the input, beginning
1693:    * the search at the specified index.  If no match is found,
1694:    * returns null.
1695:    *
1696:    * @param input The input text.
1697:    * @param index The offset within the text to begin looking for a match.
1698:    * @return An REMatch instance referencing the match, or null if none.
1699:    */
1700:   public REMatch getMatch(Object input, int index) {
1701:     return getMatch(input,index,0);
1702:   }
1703:   
1704:   /**
1705:    * Returns the first match found in the input, beginning
1706:    * the search at the specified index, and using the specified
1707:    * execution flags.  If no match is found, returns null.
1708:    *
1709:    * @param input The input text.
1710:    * @param index The offset index at which the search should be begin.
1711:    * @param eflags The logical OR of any execution flags above.
1712:    * @return An REMatch instance referencing the match, or null if none.
1713:    */
1714:   public REMatch getMatch(Object input, int index, int eflags) {
1715:     return getMatch(input,index,eflags,null);
1716:   }
1717: 
1718:   /**
1719:    * Returns the first match found in the input, beginning the search
1720:    * at the specified index, and using the specified execution flags.
1721:    * If no match is found, returns null.  If a StringBuffer is
1722:    * provided and is non-null, the contents of the input text from the
1723:    * index to the beginning of the match (or to the end of the input,
1724:    * if there is no match) are appended to the StringBuffer.
1725:    *
1726:    * @param input The input text.
1727:    * @param index The offset index at which the search should be begin.
1728:    * @param eflags The logical OR of any execution flags above.
1729:    * @param buffer The StringBuffer to save pre-match text in.
1730:    * @return An REMatch instance referencing the match, or null if none.  */
1731:   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1732:     return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1733:   }
1734: 
1735:   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1736:       boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
1737:       boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
1738:       RE re = (tryEntireMatch ? (RE) this.clone() : this);
1739:       if (tryEntireMatch) {
1740:       RETokenEnd reEnd = new RETokenEnd(0, null);
1741:       reEnd.setFake(true);
1742:       re.chain(reEnd);
1743:       }
1744:       // Create a new REMatch to hold results
1745:       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1746:       do {
1747:           /* The following potimization is commented out because
1748:              the matching should be tried even if the length of
1749:              input is obviously too short in order that
1750:              java.util.regex.Matcher#hitEnd() may work correctly.
1751:       // Optimization: check if anchor + minimumLength > length
1752:       if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1753:           */
1754:           if (re.match(input, mymatch)) {
1755:           REMatch best = mymatch;
1756:           // We assume that the match that coms first is the best.
1757:           // And the following "The longer, the better" rule has
1758:           // been commented out. The longest is not neccesarily
1759:           // the best. For example, "a" out of "aaa" is the best
1760:           // match for /a+?/.
1761:           /*
1762:           // Find best match of them all to observe leftmost longest
1763:           while ((mymatch = mymatch.next) != null) {
1764:               if (mymatch.index > best.index) {
1765:                best = mymatch;
1766:               }
1767:           }
1768:           */
1769:           best.end[0] = best.index;
1770:           best.finish(input);
1771:           input.setLastMatch(best);
1772:           return best;
1773:           }
1774:       /* End of the optimization commented out
1775:           }
1776:       */
1777:       mymatch.clear(++anchor);
1778:       // Append character to buffer if needed
1779:       if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1780:           buffer.append(input.charAt(0));
1781:       }
1782:       // java.util.regex.Matcher#hitEnd() requires that the search should
1783:       // be tried at the end of input, so we use move1(1) instead of move(1) 
1784:       } while (doMove && input.move1(1));
1785:       
1786:       // Special handling at end of input for e.g. "$"
1787:       if (minimumLength == 0) {
1788:       if (match(input, mymatch)) {
1789:           mymatch.finish(input);
1790:           return mymatch;
1791:       }
1792:       }
1793: 
1794:       return null;
1795:   }
1796: 
1797:   /**
1798:    * Returns an REMatchEnumeration that can be used to iterate over the
1799:    * matches found in the input text.
1800:    *
1801:    * @param input The input text.
1802:    * @return A non-null REMatchEnumeration instance.
1803:    */
1804:   public REMatchEnumeration getMatchEnumeration(Object input) {
1805:     return getMatchEnumeration(input,0,0);
1806:   }
1807: 
1808: 
1809:   /**
1810:    * Returns an REMatchEnumeration that can be used to iterate over the
1811:    * matches found in the input text.
1812:    *
1813:    * @param input The input text.
1814:    * @param index The offset index at which the search should be begin.
1815:    * @return A non-null REMatchEnumeration instance, with its input cursor
1816:    *  set to the index position specified.
1817:    */
1818:   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1819:     return getMatchEnumeration(input,index,0);
1820:   }
1821: 
1822:   /**
1823:    * Returns an REMatchEnumeration that can be used to iterate over the
1824:    * matches found in the input text.
1825:    *
1826:    * @param input The input text.
1827:    * @param index The offset index at which the search should be begin.
1828:    * @param eflags The logical OR of any execution flags above.
1829:    * @return A non-null REMatchEnumeration instance, with its input cursor
1830:    *  set to the index position specified.
1831:    */
1832:   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1833:     return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1834:   }
1835: 
1836: 
1837:   /**
1838:    * Substitutes the replacement text for the first match found in the input.
1839:    *
1840:    * @param input The input text.
1841:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1842:    * @return A String interpolating the substituted text.
1843:    * @see REMatch#substituteInto
1844:    */
1845:   public String substitute(Object input,String replace) {
1846:     return substitute(input,replace,0,0);
1847:   }
1848: 
1849:   /**
1850:    * Substitutes the replacement text for the first match found in the input
1851:    * beginning at the specified index position.  Specifying an index
1852:    * effectively causes the regular expression engine to throw away the
1853:    * specified number of characters. 
1854:    *
1855:    * @param input The input text.
1856:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1857:    * @param index The offset index at which the search should be begin.
1858:    * @return A String containing the substring of the input, starting
1859:    *   at the index position, and interpolating the substituted text.
1860:    * @see REMatch#substituteInto
1861:    */
1862:   public String substitute(Object input,String replace,int index) {
1863:     return substitute(input,replace,index,0);
1864:   }
1865: 
1866:   /**
1867:    * Substitutes the replacement text for the first match found in the input
1868:    * string, beginning at the specified index position and using the
1869:    * specified execution flags.
1870:    *
1871:    * @param input The input text.
1872:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1873:    * @param index The offset index at which the search should be begin.
1874:    * @param eflags The logical OR of any execution flags above.
1875:    * @return A String containing the substring of the input, starting
1876:    *   at the index position, and interpolating the substituted text.
1877:    * @see REMatch#substituteInto
1878:    */
1879:   public String substitute(Object input,String replace,int index,int eflags) {
1880:     return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1881:   }
1882: 
1883:   private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1884:     StringBuffer buffer = new StringBuffer();
1885:     REMatch m = getMatchImpl(input,index,eflags,buffer);
1886:     if (m==null) return buffer.toString();
1887:     buffer.append(getReplacement(replace, m, eflags));
1888:     if (input.move(m.end[0])) {
1889:       do {
1890:     buffer.append(input.charAt(0));
1891:       } while (input.move(1));
1892:     }
1893:     return buffer.toString();
1894:   }
1895:   
1896:   /**
1897:    * Substitutes the replacement text for each non-overlapping match found 
1898:    * in the input text.
1899:    *
1900:    * @param input The input text.
1901:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1902:    * @return A String interpolating the substituted text.
1903:    * @see REMatch#substituteInto
1904:    */
1905:   public String substituteAll(Object input,String replace) {
1906:     return substituteAll(input,replace,0,0);
1907:   }
1908: 
1909:   /**
1910:    * Substitutes the replacement text for each non-overlapping match found 
1911:    * in the input text, starting at the specified index.
1912:    *
1913:    * If the regular expression allows the empty string to match, it will
1914:    * substitute matches at all positions except the end of the input.
1915:    *
1916:    * @param input The input text.
1917:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1918:    * @param index The offset index at which the search should be begin.
1919:    * @return A String containing the substring of the input, starting
1920:    *   at the index position, and interpolating the substituted text.
1921:    * @see REMatch#substituteInto
1922:    */
1923:   public String substituteAll(Object input,String replace,int index) {
1924:     return substituteAll(input,replace,index,0);
1925:   }
1926:  
1927:   /**
1928:    * Substitutes the replacement text for each non-overlapping match found 
1929:    * in the input text, starting at the specified index and using the
1930:    * specified execution flags.
1931:    *
1932:    * @param input The input text.
1933:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1934:    * @param index The offset index at which the search should be begin.
1935:    * @param eflags The logical OR of any execution flags above.
1936:    * @return A String containing the substring of the input, starting
1937:    *   at the index position, and interpolating the substituted text.
1938:    * @see REMatch#substituteInto
1939:    */
1940:   public String substituteAll(Object input,String replace,int index,int eflags) {
1941:     return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1942:   }
1943: 
1944:   private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1945:     StringBuffer buffer = new StringBuffer();
1946:     REMatch m;
1947:     while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1948:       buffer.append(getReplacement(replace, m, eflags));
1949:       index = m.getEndIndex();
1950:       if (m.end[0] == 0) {
1951:     char ch = input.charAt(0);
1952:     if (ch != CharIndexed.OUT_OF_BOUNDS) 
1953:         buffer.append(ch);
1954:     input.move(1);
1955:       } else {
1956:       input.move(m.end[0]);
1957:       }
1958: 
1959:       if (!input.isValid()) break;
1960:     }
1961:     return buffer.toString();
1962:   }
1963: 
1964:   public static String getReplacement(String replace, REMatch m, int eflags) {
1965:     if ((eflags & REG_NO_INTERPOLATE) > 0)
1966:       return replace;
1967:     else {
1968:       if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
1969:         StringBuffer sb = new StringBuffer();
1970:         int l = replace.length();
1971:         for (int i = 0; i < l; i++) {
1972:         char c = replace.charAt(i);
1973:             switch(c) {
1974:             case '\\':
1975:               i++;
1976:               // Let StringIndexOutOfBoundsException be thrown.
1977:               sb.append(replace.charAt(i));
1978:               break;
1979:             case '$':
1980:           int i1 = i + 1;
1981:           while (i1 < replace.length() &&
1982:         Character.isDigit(replace.charAt(i1))) i1++;
1983:               sb.append(m.substituteInto(replace.substring(i, i1)));
1984:               i = i1 - 1;
1985:               break;
1986:             default:
1987:               sb.append(c);
1988:             }
1989:         }
1990:         return sb.toString();
1991:       }
1992:       else
1993:         return m.substituteInto(replace);
1994:     }
1995:   }    
1996:   
1997:   /* Helper function for constructor */
1998:   private void addToken(REToken next) {
1999:     if (next == null) return;
2000:     minimumLength += next.getMinimumLength();
2001:     int nmax = next.getMaximumLength();
2002:     if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2003:     maximumLength += nmax;
2004:     else 
2005:     maximumLength = Integer.MAX_VALUE;
2006: 
2007:     if (firstToken == null) {
2008:     lastToken = firstToken = next;
2009:     } else {
2010:       // if chain returns false, it "rejected" the token due to
2011:       // an optimization, and next was combined with lastToken
2012:       if (lastToken.chain(next)) {
2013:       lastToken = next;
2014:       }
2015:     }
2016:   }
2017: 
2018:   private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
2019:     if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
2020:     return new RETokenRepeated(current.subIndex,current,min,max);
2021:   }
2022: 
2023:   private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
2024:     // Precondition: pattern[index-1] == ':'
2025:     // we will return pos of closing ']'.
2026:     int i;
2027:     for (i=index; i<(pattern.length-1); i++) {
2028:       if ((pattern[i] == ':') && (pattern[i+1] == ']'))
2029:     return i+2;
2030:       buf.append(pattern[i]);
2031:     }
2032:     return index; // didn't match up
2033:   }
2034: 
2035:   private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
2036:     // Precondition: input[index-1] == '{', minMax != null
2037: 
2038:     boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
2039:     int startIndex = index;
2040:     if (index == input.length) {
2041:       if (mustMatch)
2042:         throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
2043:       else
2044:         return startIndex;
2045:     }
2046:     
2047:     int min,max=0;
2048:     CharUnit unit = new CharUnit();
2049:     StringBuffer buf = new StringBuffer();
2050:     
2051:     // Read string of digits
2052:     do {
2053:       index = getCharUnit(input,index,unit,false);
2054:       if (Character.isDigit(unit.ch))
2055:         buf.append(unit.ch);
2056:     } while ((index != input.length) && Character.isDigit(unit.ch));
2057: 
2058:     // Check for {} tomfoolery
2059:     if (buf.length() == 0) {
2060:       if (mustMatch)
2061:         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2062:       else
2063:         return startIndex;
2064:     }
2065: 
2066:     min = Integer.parseInt(buf.toString());
2067:     
2068:     if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2069:       max = min;
2070:     else if (index == input.length)
2071:       if (mustMatch)
2072:         throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
2073:       else
2074:         return startIndex;
2075:     else if ((unit.ch == ',') && !unit.bk) {
2076:       buf = new StringBuffer();
2077:       // Read string of digits
2078:       while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch))
2079:     buf.append(unit.ch);
2080: 
2081:       if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2082:         if (mustMatch)
2083:           throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2084:         else
2085:           return startIndex;
2086: 
2087:       // This is the case of {x,}
2088:       if (buf.length() == 0) max = Integer.MAX_VALUE;
2089:       else max = Integer.parseInt(buf.toString());
2090:     } else
2091:       if (mustMatch)
2092:         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2093:       else
2094:         return startIndex;
2095: 
2096:     // We know min and max now, and they are valid.
2097: 
2098:     minMax.first = min;
2099:     minMax.second = max;
2100: 
2101:     // return the index following the '}'
2102:     return index;
2103:   }
2104: 
2105:    /**
2106:     * Return a human readable form of the compiled regular expression,
2107:     * useful for debugging.
2108:     */
2109:    public String toString() {
2110:      StringBuffer sb = new StringBuffer();
2111:      dump(sb);
2112:      return sb.toString();
2113:    }
2114: 
2115:   void dump(StringBuffer os) {
2116:     os.append("(?#startRE subIndex=" + subIndex + ")");
2117:     if (subIndex == 0)
2118:       os.append("?:");
2119:     if (firstToken != null)
2120:       firstToken.dumpAll(os);
2121:     if (subIndex == 0)
2122:       os.append(")");
2123:     os.append("(?#endRE subIndex=" + subIndex + ")");
2124:   }
2125: 
2126:   // Cast input appropriately or throw exception
2127:   // This method was originally a private method, but has been made
2128:   // public because java.util.regex.Matcher uses this.
2129:   public static CharIndexed makeCharIndexed(Object input, int index) {
2130:       // The case where input is already a CharIndexed is supposed
2131:       // be the most likely because this is the case with
2132:       // java.util.regex.Matcher.
2133:       // We could let a String or a CharSequence fall through
2134:       // to final input, but since it'a very likely input type, 
2135:       // we check it first.
2136:     if (input instanceof CharIndexed) {
2137:     CharIndexed ci = (CharIndexed) input;
2138:     ci.setAnchor(index);
2139:     return ci;
2140:     }
2141:     else if (input instanceof CharSequence)
2142:       return new CharIndexedCharSequence((CharSequence) input,index);
2143:     else if (input instanceof String)
2144:       return new CharIndexedString((String) input,index);
2145:     else if (input instanceof char[])
2146:       return new CharIndexedCharArray((char[]) input,index);
2147:     else if (input instanceof StringBuffer)
2148:       return new CharIndexedStringBuffer((StringBuffer) input,index);
2149:     else if (input instanceof InputStream)
2150:       return new CharIndexedInputStream((InputStream) input,index);
2151:     else 
2152:     return new CharIndexedString(input.toString(), index);
2153:   }
2154: }