Source for gnu.regexp.RE

   1: /* gnu/regexp/RE.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.regexp;
  39: import java.io.InputStream;
  40: import java.io.Serializable;
  41: import java.util.Locale;
  42: import java.util.PropertyResourceBundle;
  43: import java.util.ResourceBundle;
  44: import java.util.Vector;
  45: 
  46: /**
  47:  * RE provides the user interface for compiling and matching regular
  48:  * expressions.
  49:  * <P>
  50:  * A regular expression object (class RE) is compiled by constructing it
  51:  * from a String, StringBuffer or character array, with optional 
  52:  * compilation flags (below)
  53:  * and an optional syntax specification (see RESyntax; if not specified,
  54:  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  55:  * <P>
  56:  * Once compiled, a regular expression object is reusable as well as
  57:  * threadsafe: multiple threads can use the RE instance simultaneously
  58:  * to match against different input text.
  59:  * <P>
  60:  * Various methods attempt to match input text against a compiled
  61:  * regular expression.  These methods are:
  62:  * <LI><code>isMatch</code>: returns true if the input text in its
  63:  * entirety matches the regular expression pattern.
  64:  * <LI><code>getMatch</code>: returns the first match found in the
  65:  * input text, or null if no match is found.
  66:  * <LI><code>getAllMatches</code>: returns an array of all
  67:  * non-overlapping matches found in the input text.  If no matches are
  68:  * found, the array is zero-length.
   69:  * <LI><code>substitute</code>: substitute the first occurrence of the
  70:  * pattern in the input text with a replacement string (which may
  71:  * include metacharacters $0-$9, see REMatch.substituteInto).
  72:  * <LI><code>substituteAll</code>: same as above, but repeat for each
  73:  * match before returning.
  74:  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
  75:  * object that allows iteration over the matches (see
  76:  * REMatchEnumeration for some reasons why you may want to do this
   77:  * instead of using <code>getAllMatches</code>).
  78:  * <P>
  79:  *
  80:  * These methods all have similar argument lists.  The input can be a
  81:  * String, a character array, a StringBuffer, or an
  82:  * InputStream of some sort.  Note that when using an
  83:  * InputStream, the stream read position cannot be guaranteed after
  84:  * attempting a match (this is not a bug, but a consequence of the way
  85:  * regular expressions work).  Using an REMatchEnumeration can
  86:  * eliminate most positioning problems.
  87:  *
  88:  * <P>
  89:  *
  90:  * The optional index argument specifies the offset from the beginning
  91:  * of the text at which the search should start (see the descriptions
  92:  * of some of the execution flags for how this can affect positional
  93:  * pattern operators).  For an InputStream, this means an
  94:  * offset from the current read position, so subsequent calls with the
  95:  * same index argument on an InputStream will not
  96:  * necessarily access the same position on the stream, whereas
  97:  * repeated searches at a given index in a fixed string will return
  98:  * consistent results.
  99:  *
 100:  * <P>
 101:  * You can optionally affect the execution environment by using a
 102:  * combination of execution flags (constants listed below).
 103:  * 
 104:  * <P>
 105:  * All operations on a regular expression are performed in a
 106:  * thread-safe manner.
 107:  *
 108:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 109:  * @version 1.1.5-dev, to be released
 110:  */
 111: 
 112: public class RE extends REToken {
 113: 
  114:   private static final class IntPair implements Serializable { // mutable pair of ints; initialize() hands one to getMinMax() when parsing {x,y} intervals
  115:     public int first, second; // interval bounds: initialize() rejects first > second ("interval.order") and passes (first, second) to setRepeated()
  116:   }
 117: 
  118:   private static final class CharUnit implements Serializable { // one parsed pattern character unit, filled in by getCharUnit()
  119:     public char ch; // the character itself
  120:     public boolean bk; // NOTE(review): appears to mark a backslash-escaped unit (initialize() tests bk together with ch 'Q'/'E' to toggle quoting) -- confirm in getCharUnit()
  121:   }
 122: 
  123:   // This String will be returned by version()
  124:   private static final String VERSION = "1.1.5-dev";
 125: 
  126:   // The localized strings are kept in a separate resource bundle (gnu/regexp/MessagesBundle), loaded for the default locale and read via getLocalizedMessage()
  127:   private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
 128: 
  129:   // These are, respectively, the first and last tokens in our linked list of parsed tokens.
  130:   // If there is only one token, firstToken == lastToken; initialize() resets both to null before it starts parsing.
  131:   private REToken firstToken, lastToken;
 132: 
  133:   // This is the number of subexpressions in this regular expression (nested ones included; groups parsed
  134:   // as "pure" are not counted), with a minimum value of zero.  Returned by getNumSubs()
  135:   private int numSubs;
 136: 
  137:     /** Minimum length, in characters, of any possible match. */
  138:     private int minimumLength;
  139:     private int maximumLength; // NOTE(review): undocumented; presumably the maximum length of any possible match -- confirm. initialize() resets it to 0 together with minimumLength when it splits off an alternation branch.
 140: 
  141:   /**
  142:    * Compilation flag. Do not differentiate case.  Subsequent searches using this RE will be case insensitive.
  143:    * Inside a pattern, the embedded flag (?i) sets it and (?-i) clears it when the syntax enables RE_EMBEDDED_FLAGS.
  144:    */
  145:   public static final int REG_ICASE = 0x02;
 146: 
  147:   /**
  148:    * Compilation flag. The match-any-character operator (dot)
  149:    * will match a newline character.  When set, this overrides the syntax
  150:    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
  151:    * the "/s" operator in Perl; the embedded flag (?s) sets it when the syntax enables RE_EMBEDDED_FLAGS.
  152:    */
  153:   public static final int REG_DOT_NEWLINE = 0x04;
 154: 
  155:   /**
  156:    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
  157:    * anchors will match based on newlines within the input (the anchor tokens are built with the syntax's line
  158:    * separator). This is equivalent to the "/m" operator in Perl; the embedded flag (?m) sets it when RE_EMBEDDED_FLAGS is enabled.
  159:    */
  160:   public static final int REG_MULTILINE = 0x08;
 161: 
  162:   /**
  163:    * Execution flag.
  164:    * The match-beginning operator (^) will not match at the beginning
  165:    * of the input string. Useful for matching on a substring when you
  166:    * know the context of the input is such that position zero of the
  167:    * input to the match test is not actually position zero of the text.
  168:    * <P>
  169:    * This example demonstrates the results of various ways of matching on
  170:    * a substring (index 9 is where "fool" starts in the sample text).
  171:    * <P>
  172:    * <CODE>
  173:    * String s = "food bar fool";<BR>
  174:    * RE exp = new RE("^foo.");<BR>
  175:    * REMatch m0 = exp.getMatch(s);<BR>
  176:    * REMatch m1 = exp.getMatch(s.substring(9));<BR>
  177:    * REMatch m2 = exp.getMatch(s.substring(9),0,RE.REG_NOTBOL); <BR>
  178:    * REMatch m3 = exp.getMatch(s,9);                            <BR>
  179:    * REMatch m4 = exp.getMatch(s,9,RE.REG_ANCHORINDEX);         <BR>
  180:    * <P>
  181:    * // Results:<BR>
  182:    * //  m0.toString(): "food"<BR>
  183:    * //  m1.toString(): "fool"<BR>
  184:    * //  m2: null (no match)<BR>
  185:    * //  m3: null (no match)<BR>
  186:    * //  m4.toString(): "fool"<BR>
  187:    * </CODE>
  188:    */
  189:   public static final int REG_NOTBOL = 0x10;
 190: 
  191:   /**
  192:    * Execution flag.
  193:    * The match-end operator ($) does not match at the end
  194:    * of the input string. Useful for matching on substrings whose end is not the real end of the text.
  195:    */
  196:   public static final int REG_NOTEOL = 0x20;
 197: 
  198:   /**
  199:    * Execution flag.
  200:    * When a match method is invoked that starts matching at a non-zero
  201:    * index into the input, treat the input as if it begins at the index
  202:    * given.  The effect of this flag is that the engine does not "see"
  203:    * any text in the input before the given index.  This is useful so
  204:    * that the match-beginning operator (^) matches not at position 0
  205:    * in the input string, but at the position the search started at
  206:    * (based on the index argument given to the match method, e.g. getMatch).  See
  207:    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
  208:    * and \b (word boundary) operators.
  209:    */
  210:   public static final int REG_ANCHORINDEX = 0x40;
 211: 
  212:   /**
  213:    * Execution flag.
  214:    * The substitute and substituteAll methods will not attempt to
  215:    * interpolate occurrences of $1-$9 in the replacement text with
  216:    * the corresponding subexpressions.  For example, you may want to
  217:    * replace all matches of "one dollar" with "$1".  While this flag is set, REG_REPLACE_USE_BACKSLASHESCAPE has no effect.
  218:    */
  219:   public static final int REG_NO_INTERPOLATE = 0x80;
 220: 
  221:   /**
  222:    * Execution flag.
  223:    * Try to match the whole input string, as if an implicit match-end operator ($)
  224:    * were added to the end of this regexp.
  225:    */
  226:   public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
 227: 
  228:   /**
  229:    * Execution flag.
  230:    * The substitute and substituteAll methods will treat the
  231:    * character '\' in the replacement as an escape to a literal
  232:    * character. In this case "\n", "\$", "\\", "\x40" and "\012"
  233:    * will become "n", "$", "\", "x40" and "012" respectively.
  234:    * This flag has no effect if REG_NO_INTERPOLATE is set.
  235:    */
  236:   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 237: 
  238:   /** Returns a string representing the version of the gnu.regexp package (the value of the private VERSION constant). */
  239:   public static final String version() {
  240:     return VERSION;
  241:   }
 242: 
  243:   // Retrieves the localized message stored under the given key in the shared ResourceBundle (getString throws MissingResourceException for an unknown key)
  244:   static final String getLocalizedMessage(String key) {
  245:     return messages.getString(key);
  246:   }
 247: 
  248:   /**
  249:    * Constructs a regular expression pattern buffer without any compilation
  250:    * flags set (cflags of 0), and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
  251:    *
  252:    * @param pattern A regular expression pattern, in the form of a String,
  253:    *   StringBuffer or char[].  Any other object is converted to a pattern
  254:    *   string by calling its toString() method.
  255:    * @exception REException The input pattern could not be parsed.
  256:    * @exception NullPointerException The pattern was null.
  257:    */
  258:   public RE(Object pattern) throws REException {
  259:     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0); // cflags = 0, default syntax, myIndex = 0, nextSub = 0
  260:   }
 261: 
  262:   /**
  263:    * Constructs a regular expression pattern buffer using the specified
  264:    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
  265:    *
  266:    * @param pattern A regular expression pattern, in the form of a String,
  267:    *   StringBuffer, or char[].  Any other object is converted to a pattern
  268:    *   string by calling its toString() method.
  269:    * @param cflags The logical OR of any combination of the compilation flags (REG_ICASE, REG_DOT_NEWLINE, REG_MULTILINE), or 0 for none.
  270:    * @exception REException The input pattern could not be parsed.
  271:    * @exception NullPointerException The pattern was null.
  272:    */
  273:   public RE(Object pattern, int cflags) throws REException {
  274:     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0); // default syntax, myIndex = 0, nextSub = 0
  275:   }
 276: 
  277:   /**
  278:    * Constructs a regular expression pattern buffer using the specified
  279:    * compilation flags and regular expression syntax.
  280:    *
  281:    * @param pattern A regular expression pattern, in the form of a String,
  282:    *   StringBuffer, or char[].  Any other object is converted to a pattern
  283:    *   string by calling its toString() method.
  284:    * @param cflags The logical OR of any combination of the compilation flags (REG_ICASE, REG_DOT_NEWLINE, REG_MULTILINE), or 0 for none.
  285:    * @param syntax The regular expression syntax to use (an RESyntax, e.g. RESyntax.RE_SYNTAX_PERL5).
  286:    * @exception REException The input pattern could not be parsed.
  287:    * @exception NullPointerException The pattern was null.
  288:    */
  289:   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
  290:     this(pattern,cflags,syntax,0,0); // myIndex = 0, nextSub = 0
  291:   }
 292: 
  293:   // internal constructor used for alternation: initialize() calls it to wrap the tokens parsed so far as one branch
  294:   private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
  295:     super(subIndex);
  296:     firstToken = first; // adopt the caller's already-built token list as-is
  297:     lastToken = last;
  298:     numSubs = subs;
  299:     minimumLength = minLength;
  300:     maximumLength = maxLength;
  301:     addToken(new RETokenEndSub(subIndex)); // must run after firstToken/lastToken are set: adds an end-of-subexpression token for subIndex to the list
  302:   }
 303: 
  304:   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { // target of the public constructors; initialize() also uses it for nested subexpressions
  305:     super(myIndex); // Subexpression index of this token.
  306:     initialize(patternObj, cflags, syntax, myIndex, nextSub); // all pattern parsing and token-list construction happens here
  307:   }
 308: 
  309:     // For use by subclasses: only sets subexpression index 0 and parses nothing -- presumably the subclass then calls the protected initialize() itself (confirm against subclasses)
  310:     protected RE() { super(0); }
 311: 
 312:     // The meat of construction
 313:   protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 314:       char[] pattern;
 315:     if (patternObj instanceof String) {
 316:       pattern = ((String) patternObj).toCharArray();
 317:     } else if (patternObj instanceof char[]) {
 318:       pattern = (char[]) patternObj;
 319:     } else if (patternObj instanceof StringBuffer) {
 320:       pattern = new char [((StringBuffer) patternObj).length()];
 321:       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 322:     } else {
 323:     pattern = patternObj.toString().toCharArray();
 324:     }
 325: 
 326:     int pLength = pattern.length;
 327: 
 328:     numSubs = 0; // Number of subexpressions in this token.
 329:     Vector branches = null;
 330: 
 331:     // linked list of tokens (sort of -- some closed loops can exist)
 332:     firstToken = lastToken = null;
 333: 
 334:     // Precalculate these so we don't pay for the math every time we
 335:     // need to access them.
 336:     boolean insens = ((cflags & REG_ICASE) > 0);
 337: 
 338:     // Parse pattern into tokens.  Does anyone know if it's more efficient
 339:     // to use char[] than a String.charAt()?  I'm assuming so.
 340: 
 341:     // index tracks the position in the char array
 342:     int index = 0;
 343: 
 344:     // this will be the current parse character (pattern[index])
 345:     CharUnit unit = new CharUnit();
 346: 
 347:     // This is used for {x,y} calculations
 348:     IntPair minMax = new IntPair();
 349: 
 350:     // Buffer a token so we can create a TokenRepeated, etc.
 351:     REToken currentToken = null;
 352:     char ch;
 353:     boolean quot = false;
 354: 
 355:     // Saved syntax and flags.
 356:     RESyntax savedSyntax = null;
 357:     int savedCflags = 0;
 358:     boolean flagsSaved = false;
 359: 
 360:     while (index < pLength) {
 361:       // read the next character unit (including backslash escapes)
 362:       index = getCharUnit(pattern,index,unit,quot);
 363: 
 364:       if (unit.bk)
 365:         if (unit.ch == 'Q') {
 366:           quot = true;
 367:           continue;
 368:         } else if (unit.ch == 'E') {
 369:           quot = false;
 370:           continue;
 371:         }
 372:       if (quot)
 373:           unit.bk = false;
 374: 
 375:       // ALTERNATION OPERATOR
 376:       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 377:       //  not available if RE_LIMITED_OPS is set
 378: 
 379:       // TODO: the '\n' literal here should be a test against REToken.newline,
 380:       // which unfortunately may be more than a single character.
 381:       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
 382:          || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
 383:        && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 384:     // make everything up to here be a branch. create vector if nec.
 385:     addToken(currentToken);
 386:     RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
 387:     minimumLength = 0;
 388:     maximumLength = 0;
 389:     if (branches == null) {
 390:         branches = new Vector();
 391:     }
 392:     branches.addElement(theBranch);
 393:     firstToken = lastToken = currentToken = null;
 394:       }
 395:       
 396:       // INTERVAL OPERATOR:
 397:       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 398:       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 399:       //
 400:       // OPEN QUESTION: 
 401:       //  what is proper interpretation of '{' at start of string?
 402:       //
 403:       // This method used to check "repeat.empty.token" to avoid such regexp
 404:       // as "(a*){2,}", but now "repeat.empty.token" is allowed.
 405: 
 406:       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
 407:     int newIndex = getMinMax(pattern,index,minMax,syntax);
 408:         if (newIndex > index) {
 409:           if (minMax.first > minMax.second)
 410:             throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
 411:           if (currentToken == null)
 412:             throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
 413:           if (currentToken instanceof RETokenRepeated) 
 414:             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
 415:           if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 416:             throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
 417:           index = newIndex;
 418:           currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); 
 419:         }
 420:         else {
 421:           addToken(currentToken);
 422:           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 423:         } 
 424:       }
 425:       
 426:       // LIST OPERATOR:
 427:       //  [...] | [^...]
 428: 
 429:       else if ((unit.ch == '[') && !(unit.bk || quot)) {
 430:     // Create a new RETokenOneOf
 431:     ParseCharClassResult result = parseCharClass(
 432:         subIndex, pattern, index, pLength, cflags, syntax, 0);
 433:     addToken(currentToken);
 434:     currentToken = result.token;
 435:     index = result.index;
 436:       }
 437: 
 438:       // SUBEXPRESSIONS
 439:       //  (...) | \(...\) depending on RE_NO_BK_PARENS
 440: 
 441:       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
 442:     boolean pure = false;
 443:     boolean comment = false;
 444:         boolean lookAhead = false;
 445:         boolean lookBehind = false;
 446:         boolean independent = false;
 447:         boolean negativelh = false;
 448:         boolean negativelb = false;
 449:     if ((index+1 < pLength) && (pattern[index] == '?')) {
 450:       switch (pattern[index+1]) {
 451:           case '!':
 452:             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 453:               pure = true;
 454:               negativelh = true;
 455:               lookAhead = true;
 456:               index += 2;
 457:             }
 458:             break;
 459:           case '=':
 460:             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 461:               pure = true;
 462:               lookAhead = true;
 463:               index += 2;
 464:             }
 465:             break;
 466:       case '<':
 467:         // We assume that if the syntax supports look-ahead,
 468:         // it also supports look-behind.
 469:         if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 470:         index++;
 471:         switch (pattern[index +1]) {
 472:         case '!':
 473:           pure = true;
 474:           negativelb = true;
 475:           lookBehind = true;
 476:           index += 2;
 477:           break;
 478:         case '=':
 479:           pure = true;
 480:           lookBehind = true;
 481:           index += 2;
 482:         }
 483:         }
 484:         break;
 485:       case '>':
 486:         // We assume that if the syntax supports look-ahead,
 487:         // it also supports independent group.
 488:             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 489:               pure = true;
 490:               independent = true;
 491:               index += 2;
 492:             }
 493:             break;
 494:       case 'i':
 495:       case 'd':
 496:       case 'm':
 497:       case 's':
 498:       // case 'u':  not supported
 499:       // case 'x':  not supported
 500:       case '-':
 501:             if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
 502:         // Set or reset syntax flags.
 503:         int flagIndex = index + 1;
 504:         int endFlag = -1;
 505:         RESyntax newSyntax = new RESyntax(syntax);
 506:         int newCflags = cflags;
 507:         boolean negate = false;
 508:         while (flagIndex < pLength && endFlag < 0) {
 509:             switch(pattern[flagIndex]) {
 510:           case 'i':
 511:           if (negate)
 512:             newCflags &= ~REG_ICASE;
 513:           else
 514:             newCflags |= REG_ICASE;
 515:           flagIndex++;
 516:           break;
 517:           case 'd':
 518:           if (negate)
 519:             newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
 520:           else
 521:             newSyntax.setLineSeparator("\n");
 522:           flagIndex++;
 523:           break;
 524:           case 'm':
 525:           if (negate)
 526:             newCflags &= ~REG_MULTILINE;
 527:           else
 528:             newCflags |= REG_MULTILINE;
 529:           flagIndex++;
 530:           break;
 531:           case 's':
 532:           if (negate)
 533:             newCflags &= ~REG_DOT_NEWLINE;
 534:           else
 535:             newCflags |= REG_DOT_NEWLINE;
 536:           flagIndex++;
 537:           break;
 538:           // case 'u': not supported
 539:           // case 'x': not supported
 540:           case '-':
 541:           negate = true;
 542:           flagIndex++;
 543:           break;
 544:         case ':':
 545:         case ')':
 546:           endFlag = pattern[flagIndex];
 547:           break;
 548:         default:
 549:                   throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 550:         }
 551:         }
 552:         if (endFlag == ')') {
 553:         syntax = newSyntax;
 554:         cflags = newCflags;
 555:         insens = ((cflags & REG_ICASE) > 0);
 556:         // This can be treated as though it were a comment.
 557:         comment = true;
 558:         index = flagIndex - 1;
 559:         break;
 560:         }
 561:         if (endFlag == ':') {
 562:         savedSyntax = syntax;
 563:         savedCflags = cflags;
 564:         flagsSaved = true;
 565:         syntax = newSyntax;
 566:         cflags = newCflags;
 567:         insens = ((cflags & REG_ICASE) > 0);
 568:         index = flagIndex -1;
 569:         // Fall through to the next case.
 570:         }
 571:         else {
 572:             throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 573:         }
 574:       case ':':
 575:         if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 576:           pure = true;
 577:           index += 2;
 578:         }
 579:         break;
 580:       case '#':
 581:         if (syntax.get(RESyntax.RE_COMMENTS)) {
 582:           comment = true;
 583:         }
 584:         break;
 585:           default:
 586:             throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 587:       }
 588:     }
 589: 
 590:     if (index >= pLength) {
 591:         throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 592:     }
 593: 
 594:     // find end of subexpression
 595:     int endIndex = index;
 596:     int nextIndex = index;
 597:     int nested = 0;
 598: 
 599:     while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
 600:         && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) {
 601:       if ((endIndex = nextIndex) >= pLength)
 602:         throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 603:       else if ((unit.ch == '[') && !(unit.bk || quot)) {
 604:         // I hate to do something similar to the LIST OPERATOR matters
 605:         // above, but ...
 606:         int listIndex = nextIndex;
 607:         if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 608:         if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 609:         int listEndIndex = -1;
 610:         int listNest = 0;
 611:         while (listIndex < pLength && listEndIndex < 0) {
 612:           switch(pattern[listIndex++]) {
 613:         case '\\':
 614:           listIndex++;
 615:           break;
 616:         case '[':
 617:           // Sun's API document says that regexp like "[a-d[m-p]]"
 618:           // is legal. Even something like "[[[^]]]]" is accepted.
 619:           listNest++;
 620:           if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 621:           if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 622:           break;
 623:         case ']':
 624:           if (listNest == 0)
 625:             listEndIndex = listIndex;
 626:           listNest--;
 627:           break;
 628:           }
 629:         }
 630:         if (listEndIndex >= 0) {
 631:           nextIndex = listEndIndex;
 632:           if ((endIndex = nextIndex) >= pLength)
 633:             throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 634:           else
 635:             continue;
 636:         }
 637:         throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 638:       }
 639:       else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 640:         nested++;
 641:       else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 642:         nested--;
 643:     }
 644: 
 645:     // endIndex is now position at a ')','\)' 
 646:     // nextIndex is end of string or position after ')' or '\)'
 647: 
 648:     if (comment) index = nextIndex;
 649:     else { // not a comment
 650:       // create RE subexpression as token.
 651:       addToken(currentToken);
 652:       if (!pure) {
 653:         numSubs++;
 654:       }
 655: 
 656:       int useIndex = (pure || lookAhead || lookBehind || independent) ?
 657:              0 : nextSub + numSubs;
 658:       currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 659:       numSubs += ((RE) currentToken).getNumSubs();
 660: 
 661:           if (lookAhead) {
 662:           currentToken = new RETokenLookAhead(currentToken,negativelh);
 663:       }
 664:           else if (lookBehind) {
 665:           currentToken = new RETokenLookBehind(currentToken,negativelb);
 666:       }
 667:           else if (independent) {
 668:           currentToken = new RETokenIndependent(currentToken);
 669:       }
 670: 
 671:       index = nextIndex;
 672:       if (flagsSaved) {
 673:           syntax = savedSyntax;
 674:           cflags = savedCflags;
 675:           insens = ((cflags & REG_ICASE) > 0);
 676:           flagsSaved = false;
 677:       }
 678:     } // not a comment
 679:       } // subexpression
 680:     
 681:       // UNMATCHED RIGHT PAREN
 682:       // ) or \) throw exception if
 683:       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 684:       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
 685:     throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
 686:       }
 687: 
 688:       // START OF LINE OPERATOR
 689:       //  ^
 690: 
 691:       else if ((unit.ch == '^') && !(unit.bk || quot)) {
 692:     addToken(currentToken);
 693:     currentToken = null;
 694:     addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 695:       }
 696: 
 697:       // END OF LINE OPERATOR
 698:       //  $
 699: 
 700:       else if ((unit.ch == '$') && !(unit.bk || quot)) {
 701:     addToken(currentToken);
 702:     currentToken = null;
 703:     addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 704:       }
 705: 
 706:       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 707:       //  .
 708: 
 709:       else if ((unit.ch == '.') && !(unit.bk || quot)) {
 710:     addToken(currentToken);
 711:     currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 712:       }
 713: 
 714:       // ZERO-OR-MORE REPEAT OPERATOR
 715:       //  *
 716:       //
 717:       // This method used to check "repeat.empty.token" to avoid such regexp
 718:       // as "(a*)*", but now "repeat.empty.token" is allowed.
 719: 
 720:       else if ((unit.ch == '*') && !(unit.bk || quot)) {
 721:     if (currentToken == null)
 722:           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 723:     if (currentToken instanceof RETokenRepeated)
 724:           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 725:     if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 726:       throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 727:     currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 728:       }
 729: 
 730:       // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
 731:       //  + | \+ depending on RE_BK_PLUS_QM
 732:       //  not available if RE_LIMITED_OPS is set
 733:       //
 734:       // This method used to check "repeat.empty.token" to avoid such regexp
 735:       // as "(a*)+", but now "repeat.empty.token" is allowed.
 736: 
 737:       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 738:     if (currentToken == null)
 739:           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 740:     
 741:     // Check for possessive matching on RETokenRepeated
 742:     if (currentToken instanceof RETokenRepeated) {
 743:       RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 744:       if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
 745:         tokenRep.makePossessive();
 746:       else
 747:         throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 748: 
 749:     }
 750:     else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 751:       throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 752:     else
 753:       currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 754:       }
 755: 
 756:       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 757:       //  ? | \? depending on RE_BK_PLUS_QM
 758:       //  not available if RE_LIMITED_OPS is set
 759:       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 760: 
 761:       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 762:     if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 763: 
 764:     // Check for stingy matching on RETokenRepeated
 765:     if (currentToken instanceof RETokenRepeated) {
 766:       RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 767:       if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
 768:         tokenRep.makeStingy();
 769:       else
 770:         throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 771:     }
 772:     else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 773:       throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 774:     else
 775:       currentToken = setRepeated(currentToken,0,1,index);
 776:       }
 777: 
 778:       // OCTAL CHARACTER
 779:       //  \0377
 780:     
 781:       else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
 782:     CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
 783:     if (ce == null)
 784:       throw new REException("invalid octal character", REException.REG_ESCAPE, index);
 785:     index = index - 2 + ce.len;
 786:     addToken(currentToken);
 787:     currentToken = new RETokenChar(subIndex,ce.ch,insens);
 788:       }
 789: 
 790:       // BACKREFERENCE OPERATOR
 791:       //  \1 \2 ... \9 and \10 \11 \12 ...
 792:       // not available if RE_NO_BK_REFS is set
 793:       // Perl recognizes \10, \11, and so on only if enough number of
 794:       // parentheses have opened before it, otherwise they are treated
 795:       // as aliases of \010, \011, ... (octal characters).  In case of
 796:       // Sun's JDK, octal character expression must always begin with \0.
 797:       // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
 798:       // JDK treats \2 as a back reference to the 2nd group because
 799:       // there are only two groups. But in our poor implementation,
 800:       // we cannot help but treat \29 as a back reference to the 29th group.
 801: 
 802:       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 803:     addToken(currentToken);
 804:     int numBegin = index - 1;
 805:     int numEnd = pLength;
 806:     for (int i = index; i < pLength; i++) {
 807:         if (! Character.isDigit(pattern[i])) {
 808:         numEnd = i;
 809:         break;
 810:         }
 811:     }
 812:     int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
 813: 
 814:     currentToken = new RETokenBackRef(subIndex,num,insens);
 815:     index = numEnd;
 816:       }
 817: 
 818:       // START OF STRING OPERATOR
 819:       //  \A if RE_STRING_ANCHORS is set
 820:       
 821:       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 822:     addToken(currentToken);
 823:     currentToken = new RETokenStart(subIndex,null);
 824:       }
 825: 
 826:       // WORD BREAK OPERATOR
 827:       //  \b if RE_STRING_ANCHORS is set
 828: 
 829:       else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 830:       addToken(currentToken);
 831:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 832:       } 
 833: 
 834:       // WORD BEGIN OPERATOR 
 835:       //  \< (no syntax bit is checked)
 836:       else if (unit.bk && (unit.ch == '<')) {
 837:       addToken(currentToken);
 838:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 839:       } 
 840: 
 841:       // WORD END OPERATOR 
 842:       //  \> (no syntax bit is checked)
 843:       else if (unit.bk && (unit.ch == '>')) {
 844:       addToken(currentToken);
 845:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 846:       } 
 847: 
 848:       // NON-WORD BREAK OPERATOR
 849:       //  \B if RE_STRING_ANCHORS is set
 850: 
 851:       else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 852:       addToken(currentToken);
 853:       currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 854:       } 
 855: 
 856:       
 857:       // DIGIT OPERATOR
 858:       //  \d if RE_CHAR_CLASS_ESCAPES is set
 859:       
 860:       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 861:     addToken(currentToken);
 862:     currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 863:       }
 864: 
 865:       // NON-DIGIT OPERATOR
 866:       //  \D
 867: 
 868:     else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 869:       addToken(currentToken);
 870:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 871:     }
 872: 
 873:     // NEWLINE ESCAPE
 874:         //  \n
 875: 
 876:     else if (unit.bk && (unit.ch == 'n')) {
 877:       addToken(currentToken);
 878:       currentToken = new RETokenChar(subIndex,'\n',false);
 879:     }
 880: 
 881:     // RETURN ESCAPE
 882:         //  \r
 883: 
 884:     else if (unit.bk && (unit.ch == 'r')) {
 885:       addToken(currentToken);
 886:       currentToken = new RETokenChar(subIndex,'\r',false);
 887:     }
 888: 
 889:     // WHITESPACE OPERATOR
 890:         //  \s if RE_CHAR_CLASS_ESCAPES is set
 891: 
 892:     else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 893:       addToken(currentToken);
 894:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 895:     }
 896: 
 897:     // NON-WHITESPACE OPERATOR
 898:         //  \S
 899: 
 900:     else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 901:       addToken(currentToken);
 902:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
 903:     }
 904: 
 905:     // TAB ESCAPE
 906:         //  \t
 907: 
 908:     else if (unit.bk && (unit.ch == 't')) {
 909:       addToken(currentToken);
 910:       currentToken = new RETokenChar(subIndex,'\t',false);
 911:     }
 912: 
 913:     // ALPHANUMERIC OPERATOR
 914:         //  \w
 915: 
 916:     else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 917:       addToken(currentToken);
 918:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
 919:     }
 920: 
 921:     // NON-ALPHANUMERIC OPERATOR
 922:         //  \W
 923: 
 924:     else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 925:       addToken(currentToken);
 926:       currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
 927:     }
 928: 
 929:     // END OF STRING OPERATOR
 930:         //  \Z
 931: 
 932:     else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 933:       addToken(currentToken);
 934:       currentToken = new RETokenEnd(subIndex,null);
 935:     }
 936: 
 937:         // HEX CHARACTER, UNICODE CHARACTER
 938:         //  \x1B, \u1234
 939:     
 940:     else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
 941:          (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
 942:       CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
 943:       if (ce == null)
 944:         throw new REException("invalid hex character", REException.REG_ESCAPE, index);
 945:       index = index - 2 + ce.len;
 946:       addToken(currentToken);
 947:       currentToken = new RETokenChar(subIndex,ce.ch,insens);
 948:     }
 949: 
 950:     // NAMED PROPERTY
 951:     // \p{prop}, \P{prop}
 952: 
 953:     else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
 954:              (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
 955:       NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
 956:       if (np == null)
 957:           throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
 958:       index = index - 2 + np.len;
 959:       addToken(currentToken);
 960:       currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
 961:     }
 962: 
 963:     // NON-SPECIAL CHARACTER (or escape to make literal)
 964:         //  c | \* for example
 965: 
 966:     else {  // not a special character
 967:       addToken(currentToken);
 968:       currentToken = new RETokenChar(subIndex,unit.ch,insens);
 969:     } 
 970:       } // end while
 971: 
 972:     // Add final buffered token and an EndSub marker
 973:     addToken(currentToken);
 974:       
 975:     if (branches != null) {
 976:     branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
 977:     branches.trimToSize(); // compact the Vector
 978:     minimumLength = 0;
 979:     maximumLength = 0;
 980:     firstToken = lastToken = null;
 981:     addToken(new RETokenOneOf(subIndex,branches,false));
 982:     } 
 983:     else addToken(new RETokenEndSub(subIndex));
 984: 
 985:   }
 986: 
 987:   private static class ParseCharClassResult {
 988:       RETokenOneOf token;
 989:       int index;
 990:       boolean returnAtAndOperator = false;
 991:   }
 992: 
 993:   /**
 994:    * Parse [...] or [^...] and make an RETokenOneOf instance.
 995:    * @param subIndex subIndex to be given to the created RETokenOneOf instance.
 996:    * @param pattern Input array of characters to be parsed.
 997:    * @param index Index pointing to the character next to the beginning '['.
 998:    * @param pLength Limit of the input array.
 999:    * @param cflags Compilation flags used to parse the pattern.
1000:    * @param pflags Flags that affect the behavior of this method.
1001:    * @param syntax Syntax used to parse the pattern.
1002:    */
1003:   private static ParseCharClassResult parseCharClass(int subIndex,
1004:         char[] pattern, int index,
1005:         int pLength, int cflags, RESyntax syntax, int pflags)
1006:         throws REException {
1007: 
1008:     boolean insens = ((cflags & REG_ICASE) > 0);
1009:     Vector options = new Vector();
1010:     Vector addition = new Vector();
1011:     boolean additionAndAppeared = false;
1012:     final int RETURN_AT_AND = 0x01;
1013:     boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1014:     boolean negative = false;
1015:     char ch;
1016: 
1017:     char lastChar = 0;
1018:     boolean lastCharIsSet = false;
1019:     if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
1020:     
1021:     // Check for initial caret, negation
1022:     if ((ch = pattern[index]) == '^') {
1023:       negative = true;
1024:       if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1025:       ch = pattern[index];
1026:     }
1027: 
1028:     // Check for leading right bracket literal
1029:     if (ch == ']') {
1030:       lastChar = ch; lastCharIsSet = true;
1031:       if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1032:     }
1033: 
1034:     while ((ch = pattern[index++]) != ']') {
1035:       if ((ch == '-') && (lastCharIsSet)) {
1036:         if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1037:         if ((ch = pattern[index]) == ']') {
1038:           options.addElement(new RETokenChar(subIndex,lastChar,insens));
1039:           lastChar = '-';
1040:         } else {
1041:           if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1042:             CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
1043:             if (ce == null)
1044:           throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1045:         ch = ce.ch;
1046:         index = index + ce.len - 1;
1047:           }
1048:           options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
1049:           lastChar = 0; lastCharIsSet = false;
1050:           index++;
1051:         }
1052:           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1053:             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1054:         int posixID = -1;
1055:         boolean negate = false;
1056:             char asciiEsc = 0;
1057:         boolean asciiEscIsSet = false;
1058:         NamedProperty np = null;
1059:         if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
1060:           switch (pattern[index]) {
1061:           case 'D':
1062:         negate = true;
1063:           case 'd':
1064:         posixID = RETokenPOSIX.DIGIT;
1065:         break;
1066:           case 'S':
1067:         negate = true;
1068:           case 's':
1069:         posixID = RETokenPOSIX.SPACE;
1070:         break;
1071:           case 'W':
1072:         negate = true;
1073:           case 'w':
1074:         posixID = RETokenPOSIX.ALNUM;
1075:         break;
1076:           }
1077:         }
1078:         if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
1079:           np = getNamedProperty(pattern, index - 1, pLength);
1080:           if (np == null)
1081:         throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1082:           index = index - 1 + np.len - 1;
1083:         }
1084:         else {
1085:           CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
1086:           if (ce == null)
1087:         throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1088:           asciiEsc = ce.ch; asciiEscIsSet = true;
1089:           index = index - 1 + ce.len - 1;
1090:         }
1091:         if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1092:         
1093:         if (posixID != -1) {
1094:           options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
1095:         } else if (np != null) {
1096:           options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
1097:         } else if (asciiEscIsSet) {
1098:           lastChar = asciiEsc; lastCharIsSet = true;
1099:         } else {
1100:           lastChar = pattern[index]; lastCharIsSet = true;
1101:         }
1102:         ++index;
1103:       } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
1104:         StringBuffer posixSet = new StringBuffer();
1105:         index = getPosixSet(pattern,index+1,posixSet);
1106:         int posixId = RETokenPOSIX.intValue(posixSet.toString());
1107:         if (posixId != -1)
1108:           options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
1109:       } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
1110:         ParseCharClassResult result = parseCharClass(
1111:             subIndex, pattern, index, pLength, cflags, syntax, 0);
1112:         addition.addElement(result.token);
1113:         addition.addElement("|");
1114:         index = result.index;
1115:       } else if ((ch == '&') &&
1116:              (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
1117:              (index < pLength) && (pattern[index] == '&')) {
1118:         if (returnAtAndOperator) {
1119:             ParseCharClassResult result = new ParseCharClassResult(); 
1120:             options.trimToSize();
1121:             if (additionAndAppeared) addition.addElement("&");
1122:             if (addition.size() == 0) addition = null;
1123:             result.token = new RETokenOneOf(subIndex,
1124:             options, addition, negative);
1125:             result.index = index - 1;
1126:             result.returnAtAndOperator = true;
1127:             return result;
1128:         }
1129:         // The precedence of the operator "&&" is the lowest.
1130:         // So we postpone adding "&" until other elements
1131:         // are added. And we insert Boolean.FALSE at the
1132:         // beginning of the list of tokens following "&&".
1133:         // So, "&&[a-b][k-m]" will be stored in the Vecter
1134:         // addition in this order:
1135:         //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1136:         if (additionAndAppeared) addition.addElement("&");
1137:         addition.addElement(Boolean.FALSE);
1138:         additionAndAppeared = true;
1139: 
1140:         // The part on which "&&" operates may be either
1141:         //   (1) explicitly enclosed by []
1142:         //   or
1143:         //   (2) not enclosed by [] and terminated by the
1144:         //       next "&&" or the end of the character list.
1145:             //  Let the preceding else if block do the case (1).
1146:         //  We must do something in case of (2).
1147:         if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
1148:             ParseCharClassResult result = parseCharClass(
1149:             subIndex, pattern, index+1, pLength, cflags, syntax,
1150:             RETURN_AT_AND);
1151:             addition.addElement(result.token);
1152:             addition.addElement("|");
1153:             // If the method returned at the next "&&", it is OK.
1154:             // Otherwise we have eaten the mark of the end of this
1155:             // character list "]".  In this case we must give back
1156:             // the end mark.
1157:             index = (result.returnAtAndOperator ?
1158:             result.index: result.index - 1);
1159:         }
1160:       } else {
1161:         if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1162:         lastChar = ch; lastCharIsSet = true;
1163:       }
1164:       if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1165:     } // while in list
1166:     // Out of list, index is one past ']'
1167:         
1168:     if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1169:        
1170:     ParseCharClassResult result = new ParseCharClassResult(); 
1171:     // Create a new RETokenOneOf
1172:     options.trimToSize();
1173:     if (additionAndAppeared) addition.addElement("&");
1174:     if (addition.size() == 0) addition = null;
1175:     result.token = new RETokenOneOf(subIndex,options, addition, negative);
1176:     result.index = index;
1177:     return result;
1178:   }
1179: 
1180:   private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
1181:     unit.ch = input[index++];
1182:     unit.bk = (unit.ch == '\\'
1183:            && (!quot || index >= input.length || input[index] == 'E'));
1184:     if (unit.bk)
1185:       if (index < input.length)
1186:     unit.ch = input[index++];
1187:       else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
1188:     return index;
1189:   }
1190: 
1191:   private static int parseInt(char[] input, int pos, int len, int radix) {
1192:     int ret = 0;
1193:     for (int i = pos; i < pos + len; i++) {
1194:     ret = ret * radix + Character.digit(input[i], radix);
1195:     }
1196:     return ret;
1197:   }
1198: 
1199:   /**
1200:    * This class represents various expressions for a character.
1201:    * "a"      : 'a' itself.
1202:    * "\0123"  : Octal char 0123
1203:    * "\x1b"   : Hex char 0x1b
1204:    * "\u1234" : Unicode char \u1234
1205:    */
1206:   private static class CharExpression {
1207:     /** character represented by this expression */
1208:     char ch;
1209:     /** String expression */
1210:     String expr;
1211:     /** length of this expression */
1212:     int len;
1213:     public String toString() { return expr; }
1214:   }
1215: 
1216:   private static CharExpression getCharExpression(char[] input, int pos, int lim,
1217:         RESyntax syntax) {
1218:     CharExpression ce = new CharExpression();
1219:     char c = input[pos];
1220:     if (c == '\\') {
1221:       if (pos + 1 >= lim) return null;
1222:       c = input[pos + 1];
1223:       switch(c) {
1224:       case 't':
1225:         ce.ch = '\t';
1226:         ce.len = 2;
1227:         break;
1228:       case 'n':
1229:         ce.ch = '\n';
1230:         ce.len = 2;
1231:         break;
1232:       case 'r':
1233:         ce.ch = '\r';
1234:         ce.len = 2;
1235:         break;
1236:       case 'x':
1237:       case 'u':
1238:         if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1239:             (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1240:           int l = 0;
1241:           int expectedLength = (c == 'x' ? 2 : 4);
1242:           for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
1243:             if (i >= lim) break;
1244:             if (!((input[i] >= '0' && input[i] <= '9') ||
1245:                   (input[i] >= 'A' && input[i] <= 'F') ||
1246:                   (input[i] >= 'a' && input[i] <= 'f')))
1247:                 break;
1248:         l++;
1249:           }
1250:           if (l != expectedLength) return null;
1251:           ce.ch = (char)(parseInt(input, pos + 2, l, 16));
1252:       ce.len = l + 2;
1253:         }
1254:         else {
1255:           ce.ch = c;
1256:           ce.len = 2;
1257:         }
1258:         break;
1259:       case '0':
1260:         if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
1261:           int l = 0;
1262:           for (int i = pos + 2; i < pos + 2 + 3; i++) {
1263:             if (i >= lim) break;
1264:         if (input[i] < '0' || input[i] > '7') break;
1265:             l++;
1266:           }
1267:           if (l == 3 && input[pos + 2] > '3') l--;
1268:           if (l <= 0) return null;
1269:           ce.ch = (char)(parseInt(input, pos + 2, l, 8));
1270:           ce.len = l + 2;
1271:         }
1272:         else {
1273:           ce.ch = c;
1274:           ce.len = 2;
1275:         }
1276:         break;
1277:       default:
1278:         ce.ch = c;
1279:         ce.len = 2;
1280:         break;
1281:       }
1282:     }
1283:     else {
1284:       ce.ch = input[pos];
1285:       ce.len = 1;
1286:     }
1287:     ce.expr = new String(input, pos, ce.len);
1288:     return ce;
1289:   }
1290: 
1291:   /**
1292:    * This class represents a substring in a pattern string expressing
1293:    * a named property.
1294:    * "\pA"      : Property named "A"
1295:    * "\p{prop}" : Property named "prop"
1296:    * "\PA"      : Property named "A" (Negated)
1297:    * "\P{prop}" : Property named "prop" (Negated)
1298:    */
1299:   private static class NamedProperty {
1300:     /** Property name */
1301:     String name;
1302:     /** Negated or not */
1303:     boolean negate;
1304:     /** length of this expression */
1305:     int len;
1306:   }
1307: 
1308:   private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
1309:     NamedProperty np = new NamedProperty();
1310:     char c = input[pos];
1311:     if (c == '\\') {
1312:       if (++pos >= lim) return null;
1313:       c = input[pos++];
1314:       switch(c) {
1315:       case 'p':
1316:         np.negate = false;
1317:         break;
1318:       case 'P':
1319:         np.negate = true;
1320:         break;
1321:       default:
1322:     return null;
1323:       }
1324:       c = input[pos++];
1325:       if (c == '{') {
1326:           int p = -1;
1327:       for (int i = pos; i < lim; i++) {
1328:           if (input[i] == '}') {
1329:           p = i;
1330:           break;
1331:           }
1332:       }
1333:       if (p < 0) return null;
1334:       int len = p - pos;
1335:           np.name = new String(input, pos, len);
1336:       np.len = len + 4;
1337:       }
1338:       else {
1339:           np.name = new String(input, pos - 1, 1);
1340:       np.len = 3;
1341:       }
1342:       return np;
1343:     }
1344:     else return null;
1345:   }
1346: 
1347:   private static RETokenNamedProperty getRETokenNamedProperty(
1348:       int subIndex, NamedProperty np, boolean insens, int index)
1349:       throws REException {
1350:     try {
1351:     return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
1352:     }
1353:     catch (REException e) {
1354:     REException ree;
1355:     ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
1356:     ree.initCause(e);
1357:     throw ree;
1358:     }
1359:   }
1360: 
1361:   /**
1362:    * Checks if the regular expression matches the input in its entirety.
1363:    *
1364:    * @param input The input text.
1365:    */
1366:   public boolean isMatch(Object input) {
1367:     return isMatch(input,0,0);
1368:   }
1369:   
1370:   /**
1371:    * Checks if the input string, starting from index, is an exact match of
1372:    * this regular expression.
1373:    *
1374:    * @param input The input text.
1375:    * @param index The offset index at which the search should be begin.
1376:    */
1377:   public boolean isMatch(Object input,int index) {
1378:     return isMatch(input,index,0);
1379:   }
1380:   
1381: 
1382:   /**
1383:    * Checks if the input, starting from index and using the specified
1384:    * execution flags, is an exact match of this regular expression.
1385:    *
1386:    * @param input The input text.
1387:    * @param index The offset index at which the search should be begin.
1388:    * @param eflags The logical OR of any execution flags above.
1389:    */
1390:   public boolean isMatch(Object input,int index,int eflags) {
1391:     return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1392:   }
1393: 
1394:   private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1395:     if (firstToken == null)  // Trivial case
1396:       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1397:     REMatch m = new REMatch(numSubs, index, eflags);
1398:     if (firstToken.match(input, m)) {
1399:     while (m != null) {
1400:         if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1401:         return true;
1402:         }
1403:         m = m.next;
1404:     }
1405:     }
1406:     return false;
1407:   }
1408:     
1409:   /**
1410:    * Returns the maximum number of subexpressions in this regular expression.
1411:    * If the expression contains branches, the value returned will be the
1412:    * maximum subexpressions in any of the branches.
1413:    */
1414:   public int getNumSubs() {
1415:     return numSubs;
1416:   }
1417: 
1418:   // Overrides REToken.setUncle
1419:   void setUncle(REToken uncle) {
1420:       if (lastToken != null) {
1421:       lastToken.setUncle(uncle);
1422:       } else super.setUncle(uncle); // to deal with empty subexpressions
1423:   }
1424: 
1425:   // Overrides REToken.chain
1426: 
1427:   boolean chain(REToken next) {
1428:     super.chain(next);
1429:     setUncle(next);
1430:     return true;
1431:   }
1432: 
1433:   /**
1434:    * Returns the minimum number of characters that could possibly
1435:    * constitute a match of this regular expression.
1436:    */
1437:   public int getMinimumLength() {
1438:       return minimumLength;
1439:   }
1440: 
1441:   public int getMaximumLength() {
1442:       return maximumLength;
1443:   }
1444: 
1445:   /**
1446:    * Returns an array of all matches found in the input.
1447:    *
1448:    * If the regular expression allows the empty string to match, it will
1449:    * substitute matches at all positions except the end of the input.
1450:    *
1451:    * @param input The input text.
1452:    * @return a non-null (but possibly zero-length) array of matches
1453:    */
1454:   public REMatch[] getAllMatches(Object input) {
1455:     return getAllMatches(input,0,0);
1456:   }
1457: 
1458:   /**
1459:    * Returns an array of all matches found in the input,
1460:    * beginning at the specified index position.
1461:    *
1462:    * If the regular expression allows the empty string to match, it will
1463:    * substitute matches at all positions except the end of the input.
1464:    *
1465:    * @param input The input text.
1466:    * @param index The offset index at which the search should be begin.
1467:    * @return a non-null (but possibly zero-length) array of matches
1468:    */
1469:   public REMatch[] getAllMatches(Object input, int index) {
1470:     return getAllMatches(input,index,0);
1471:   }
1472: 
1473:   /**
1474:    * Returns an array of all matches found in the input string,
1475:    * beginning at the specified index position and using the specified
1476:    * execution flags.
1477:    *
1478:    * If the regular expression allows the empty string to match, it will
1479:    * substitute matches at all positions except the end of the input.
1480:    *
1481:    * @param input The input text.
1482:    * @param index The offset index at which the search should be begin.
1483:    * @param eflags The logical OR of any execution flags above.
1484:    * @return a non-null (but possibly zero-length) array of matches
1485:    */
1486:   public REMatch[] getAllMatches(Object input, int index, int eflags) {
1487:     return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
1488:   }
1489: 
1490:   // this has been changed since 1.03 to be non-overlapping matches
1491:   private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
1492:     Vector all = new Vector();
1493:     REMatch m = null;
1494:     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
1495:       all.addElement(m);
1496:       index = m.getEndIndex();
1497:       if (m.end[0] == 0) {   // handle pathological case of zero-length match
1498:     index++;
1499:     input.move(1);
1500:       } else {
1501:     input.move(m.end[0]);
1502:       }
1503:       if (!input.isValid()) break;
1504:     }
1505:     REMatch[] mset = new REMatch[all.size()];
1506:     all.copyInto(mset);
1507:     return mset;
1508:   }
1509:   
1510:     /* Implements abstract method REToken.match() */
1511:     boolean match(CharIndexed input, REMatch mymatch) { 
1512:     if (firstToken == null) {
1513:         return next(input, mymatch);
1514:     }
1515: 
1516:     // Note the start of this subexpression
1517:     mymatch.start[subIndex] = mymatch.index;
1518: 
1519:     return firstToken.match(input, mymatch);
1520:     }
1521:   
1522:   /**
1523:    * Returns the first match found in the input.  If no match is found,
1524:    * null is returned.
1525:    *
1526:    * @param input The input text.
1527:    * @return An REMatch instance referencing the match, or null if none.
1528:    */
1529:   public REMatch getMatch(Object input) {
1530:     return getMatch(input,0,0);
1531:   }
1532:   
1533:   /**
1534:    * Returns the first match found in the input, beginning
1535:    * the search at the specified index.  If no match is found,
1536:    * returns null.
1537:    *
1538:    * @param input The input text.
1539:    * @param index The offset within the text to begin looking for a match.
1540:    * @return An REMatch instance referencing the match, or null if none.
1541:    */
1542:   public REMatch getMatch(Object input, int index) {
1543:     return getMatch(input,index,0);
1544:   }
1545:   
1546:   /**
1547:    * Returns the first match found in the input, beginning
1548:    * the search at the specified index, and using the specified
1549:    * execution flags.  If no match is found, returns null.
1550:    *
1551:    * @param input The input text.
1552:    * @param index The offset index at which the search should be begin.
1553:    * @param eflags The logical OR of any execution flags above.
1554:    * @return An REMatch instance referencing the match, or null if none.
1555:    */
1556:   public REMatch getMatch(Object input, int index, int eflags) {
1557:     return getMatch(input,index,eflags,null);
1558:   }
1559: 
1560:   /**
1561:    * Returns the first match found in the input, beginning the search
1562:    * at the specified index, and using the specified execution flags.
1563:    * If no match is found, returns null.  If a StringBuffer is
1564:    * provided and is non-null, the contents of the input text from the
1565:    * index to the beginning of the match (or to the end of the input,
1566:    * if there is no match) are appended to the StringBuffer.
1567:    *
1568:    * @param input The input text.
1569:    * @param index The offset index at which the search should be begin.
1570:    * @param eflags The logical OR of any execution flags above.
1571:    * @param buffer The StringBuffer to save pre-match text in.
1572:    * @return An REMatch instance referencing the match, or null if none.  */
1573:   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1574:     return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1575:   }
1576: 
1577:   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1578:       boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
1579:       RE re = (tryEntireMatch ? (RE) this.clone() : this);
1580:       if (tryEntireMatch) {
1581:       re.chain(new RETokenEnd(0, null));
1582:       }
1583:       // Create a new REMatch to hold results
1584:       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1585:       do {
1586:       // Optimization: check if anchor + minimumLength > length
1587:       if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1588:           if (re.match(input, mymatch)) {
1589:           REMatch best = mymatch;
1590:           // We assume that the match that coms first is the best.
1591:           // And the following "The longer, the better" rule has
1592:           // been commented out. The longest is not neccesarily
1593:           // the best. For example, "a" out of "aaa" is the best
1594:           // match for /a+?/.
1595:           /*
1596:           // Find best match of them all to observe leftmost longest
1597:           while ((mymatch = mymatch.next) != null) {
1598:               if (mymatch.index > best.index) {
1599:                best = mymatch;
1600:               }
1601:           }
1602:           */
1603:           best.end[0] = best.index;
1604:           best.finish(input);
1605:           return best;
1606:           }
1607:       }
1608:       mymatch.clear(++anchor);
1609:       // Append character to buffer if needed
1610:       if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1611:           buffer.append(input.charAt(0));
1612:       }
1613:       } while (input.move(1));
1614:       
1615:       // Special handling at end of input for e.g. "$"
1616:       if (minimumLength == 0) {
1617:       if (match(input, mymatch)) {
1618:           mymatch.finish(input);
1619:           return mymatch;
1620:       }
1621:       }
1622: 
1623:       return null;
1624:   }
1625: 
1626:   /**
1627:    * Returns an REMatchEnumeration that can be used to iterate over the
1628:    * matches found in the input text.
1629:    *
1630:    * @param input The input text.
1631:    * @return A non-null REMatchEnumeration instance.
1632:    */
1633:   public REMatchEnumeration getMatchEnumeration(Object input) {
1634:     return getMatchEnumeration(input,0,0);
1635:   }
1636: 
1637: 
1638:   /**
1639:    * Returns an REMatchEnumeration that can be used to iterate over the
1640:    * matches found in the input text.
1641:    *
1642:    * @param input The input text.
1643:    * @param index The offset index at which the search should be begin.
1644:    * @return A non-null REMatchEnumeration instance, with its input cursor
1645:    *  set to the index position specified.
1646:    */
1647:   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1648:     return getMatchEnumeration(input,index,0);
1649:   }
1650: 
1651:   /**
1652:    * Returns an REMatchEnumeration that can be used to iterate over the
1653:    * matches found in the input text.
1654:    *
1655:    * @param input The input text.
1656:    * @param index The offset index at which the search should be begin.
1657:    * @param eflags The logical OR of any execution flags above.
1658:    * @return A non-null REMatchEnumeration instance, with its input cursor
1659:    *  set to the index position specified.
1660:    */
1661:   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1662:     return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1663:   }
1664: 
1665: 
1666:   /**
1667:    * Substitutes the replacement text for the first match found in the input.
1668:    *
1669:    * @param input The input text.
1670:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1671:    * @return A String interpolating the substituted text.
1672:    * @see REMatch#substituteInto
1673:    */
1674:   public String substitute(Object input,String replace) {
1675:     return substitute(input,replace,0,0);
1676:   }
1677: 
1678:   /**
1679:    * Substitutes the replacement text for the first match found in the input
1680:    * beginning at the specified index position.  Specifying an index
1681:    * effectively causes the regular expression engine to throw away the
1682:    * specified number of characters. 
1683:    *
1684:    * @param input The input text.
1685:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1686:    * @param index The offset index at which the search should be begin.
1687:    * @return A String containing the substring of the input, starting
1688:    *   at the index position, and interpolating the substituted text.
1689:    * @see REMatch#substituteInto
1690:    */
1691:   public String substitute(Object input,String replace,int index) {
1692:     return substitute(input,replace,index,0);
1693:   }
1694: 
1695:   /**
1696:    * Substitutes the replacement text for the first match found in the input
1697:    * string, beginning at the specified index position and using the
1698:    * specified execution flags.
1699:    *
1700:    * @param input The input text.
1701:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1702:    * @param index The offset index at which the search should be begin.
1703:    * @param eflags The logical OR of any execution flags above.
1704:    * @return A String containing the substring of the input, starting
1705:    *   at the index position, and interpolating the substituted text.
1706:    * @see REMatch#substituteInto
1707:    */
1708:   public String substitute(Object input,String replace,int index,int eflags) {
1709:     return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1710:   }
1711: 
1712:   private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1713:     StringBuffer buffer = new StringBuffer();
1714:     REMatch m = getMatchImpl(input,index,eflags,buffer);
1715:     if (m==null) return buffer.toString();
1716:     buffer.append(getReplacement(replace, m, eflags));
1717:     if (input.move(m.end[0])) {
1718:       do {
1719:     buffer.append(input.charAt(0));
1720:       } while (input.move(1));
1721:     }
1722:     return buffer.toString();
1723:   }
1724:   
1725:   /**
1726:    * Substitutes the replacement text for each non-overlapping match found 
1727:    * in the input text.
1728:    *
1729:    * @param input The input text.
1730:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1731:    * @return A String interpolating the substituted text.
1732:    * @see REMatch#substituteInto
1733:    */
1734:   public String substituteAll(Object input,String replace) {
1735:     return substituteAll(input,replace,0,0);
1736:   }
1737: 
1738:   /**
1739:    * Substitutes the replacement text for each non-overlapping match found 
1740:    * in the input text, starting at the specified index.
1741:    *
1742:    * If the regular expression allows the empty string to match, it will
1743:    * substitute matches at all positions except the end of the input.
1744:    *
1745:    * @param input The input text.
1746:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1747:    * @param index The offset index at which the search should be begin.
1748:    * @return A String containing the substring of the input, starting
1749:    *   at the index position, and interpolating the substituted text.
1750:    * @see REMatch#substituteInto
1751:    */
1752:   public String substituteAll(Object input,String replace,int index) {
1753:     return substituteAll(input,replace,index,0);
1754:   }
1755:  
1756:   /**
1757:    * Substitutes the replacement text for each non-overlapping match found 
1758:    * in the input text, starting at the specified index and using the
1759:    * specified execution flags.
1760:    *
1761:    * @param input The input text.
1762:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1763:    * @param index The offset index at which the search should be begin.
1764:    * @param eflags The logical OR of any execution flags above.
1765:    * @return A String containing the substring of the input, starting
1766:    *   at the index position, and interpolating the substituted text.
1767:    * @see REMatch#substituteInto
1768:    */
1769:   public String substituteAll(Object input,String replace,int index,int eflags) {
1770:     return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1771:   }
1772: 
1773:   private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1774:     StringBuffer buffer = new StringBuffer();
1775:     REMatch m;
1776:     while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1777:       buffer.append(getReplacement(replace, m, eflags));
1778:       index = m.getEndIndex();
1779:       if (m.end[0] == 0) {
1780:     char ch = input.charAt(0);
1781:     if (ch != CharIndexed.OUT_OF_BOUNDS) 
1782:         buffer.append(ch);
1783:     input.move(1);
1784:       } else {
1785:       input.move(m.end[0]);
1786:       }
1787: 
1788:       if (!input.isValid()) break;
1789:     }
1790:     return buffer.toString();
1791:   }
1792: 
1793:   public static String getReplacement(String replace, REMatch m, int eflags) {
1794:     if ((eflags & REG_NO_INTERPOLATE) > 0)
1795:       return replace;
1796:     else {
1797:       if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
1798:         StringBuffer sb = new StringBuffer();
1799:         int l = replace.length();
1800:         for (int i = 0; i < l; i++) {
1801:         char c = replace.charAt(i);
1802:             switch(c) {
1803:             case '\\':
1804:               i++;
1805:               // Let StringIndexOutOfBoundsException be thrown.
1806:               sb.append(replace.charAt(i));
1807:               break;
1808:             case '$':
1809:           int i1 = i + 1;
1810:           while (i1 < replace.length() &&
1811:         Character.isDigit(replace.charAt(i1))) i1++;
1812:               sb.append(m.substituteInto(replace.substring(i, i1)));
1813:               i = i1 - 1;
1814:               break;
1815:             default:
1816:               sb.append(c);
1817:             }
1818:         }
1819:         return sb.toString();
1820:       }
1821:       else
1822:         return m.substituteInto(replace);
1823:     }
1824:   }    
1825:   
1826:   /* Helper function for constructor */
1827:   private void addToken(REToken next) {
1828:     if (next == null) return;
1829:     minimumLength += next.getMinimumLength();
1830:     int nmax = next.getMaximumLength();
1831:     if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
1832:     maximumLength += nmax;
1833:     else 
1834:     maximumLength = Integer.MAX_VALUE;
1835: 
1836:     if (firstToken == null) {
1837:     lastToken = firstToken = next;
1838:     } else {
1839:       // if chain returns false, it "rejected" the token due to
1840:       // an optimization, and next was combined with lastToken
1841:       if (lastToken.chain(next)) {
1842:       lastToken = next;
1843:       }
1844:     }
1845:   }
1846: 
1847:   private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1848:     if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1849:     return new RETokenRepeated(current.subIndex,current,min,max);
1850:   }
1851: 
1852:   private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1853:     // Precondition: pattern[index-1] == ':'
1854:     // we will return pos of closing ']'.
1855:     int i;
1856:     for (i=index; i<(pattern.length-1); i++) {
1857:       if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1858:     return i+2;
1859:       buf.append(pattern[i]);
1860:     }
1861:     return index; // didn't match up
1862:   }
1863: 
1864:   private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1865:     // Precondition: input[index-1] == '{', minMax != null
1866: 
1867:     boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1868:     int startIndex = index;
1869:     if (index == input.length) {
1870:       if (mustMatch)
1871:         throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1872:       else
1873:         return startIndex;
1874:     }
1875:     
1876:     int min,max=0;
1877:     CharUnit unit = new CharUnit();
1878:     StringBuffer buf = new StringBuffer();
1879:     
1880:     // Read string of digits
1881:     do {
1882:       index = getCharUnit(input,index,unit,false);
1883:       if (Character.isDigit(unit.ch))
1884:         buf.append(unit.ch);
1885:     } while ((index != input.length) && Character.isDigit(unit.ch));
1886: 
1887:     // Check for {} tomfoolery
1888:     if (buf.length() == 0) {
1889:       if (mustMatch)
1890:         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1891:       else
1892:         return startIndex;
1893:     }
1894: 
1895:     min = Integer.parseInt(buf.toString());
1896:     
1897:     if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1898:       max = min;
1899:     else if (index == input.length)
1900:       if (mustMatch)
1901:         throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1902:       else
1903:         return startIndex;
1904:     else if ((unit.ch == ',') && !unit.bk) {
1905:       buf = new StringBuffer();
1906:       // Read string of digits
1907:       while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch))
1908:     buf.append(unit.ch);
1909: 
1910:       if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1911:         if (mustMatch)
1912:           throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1913:         else
1914:           return startIndex;
1915: 
1916:       // This is the case of {x,}
1917:       if (buf.length() == 0) max = Integer.MAX_VALUE;
1918:       else max = Integer.parseInt(buf.toString());
1919:     } else
1920:       if (mustMatch)
1921:         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1922:       else
1923:         return startIndex;
1924: 
1925:     // We know min and max now, and they are valid.
1926: 
1927:     minMax.first = min;
1928:     minMax.second = max;
1929: 
1930:     // return the index following the '}'
1931:     return index;
1932:   }
1933: 
1934:    /**
1935:     * Return a human readable form of the compiled regular expression,
1936:     * useful for debugging.
1937:     */
1938:    public String toString() {
1939:      StringBuffer sb = new StringBuffer();
1940:      dump(sb);
1941:      return sb.toString();
1942:    }
1943: 
1944:   void dump(StringBuffer os) {
1945:     os.append('(');
1946:     if (subIndex == 0)
1947:       os.append("?:");
1948:     if (firstToken != null)
1949:       firstToken.dumpAll(os);
1950:     os.append(')');
1951:   }
1952: 
1953:   // Cast input appropriately or throw exception
1954:   private static CharIndexed makeCharIndexed(Object input, int index) {
1955:       // We could let a String fall through to final input, but since
1956:       // it's the most likely input type, we check it first.
1957:     if (input instanceof String)
1958:       return new CharIndexedString((String) input,index);
1959:     else if (input instanceof char[])
1960:       return new CharIndexedCharArray((char[]) input,index);
1961:     else if (input instanceof StringBuffer)
1962:       return new CharIndexedStringBuffer((StringBuffer) input,index);
1963:     else if (input instanceof InputStream)
1964:       return new CharIndexedInputStream((InputStream) input,index);
1965:     else if (input instanceof CharIndexed)
1966:     return (CharIndexed) input; // do we lose index info?
1967:     else 
1968:     return new CharIndexedString(input.toString(), index);
1969:   }
1970: }