Source for gnu.java.util.regex.RE

   1: /* gnu/regexp/RE.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.java.util.regex;
  39: 
  40: import gnu.java.lang.CPStringBuilder;
  41: 
  42: import java.io.InputStream;
  43: import java.io.Serializable;
  44: 
  45: import java.util.ArrayList;
  46: import java.util.List;
  47: import java.util.Locale;
  48: import java.util.PropertyResourceBundle;
  49: import java.util.ResourceBundle;
  50: 
  51: /**
  52:  * RE provides the user interface for compiling and matching regular
  53:  * expressions.
  54:  * <P>
  55:  * A regular expression object (class RE) is compiled by constructing it
  56:  * from a String, StringBuffer or character array, with optional 
  57:  * compilation flags (below)
  58:  * and an optional syntax specification (see RESyntax; if not specified,
  59:  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  60:  * <P>
  61:  * Once compiled, a regular expression object is reusable as well as
  62:  * threadsafe: multiple threads can use the RE instance simultaneously
  63:  * to match against different input text.
  64:  * <P>
  65:  * Various methods attempt to match input text against a compiled
  66:  * regular expression.  These methods are:
  67:  * <LI><code>isMatch</code>: returns true if the input text in its
  68:  * entirety matches the regular expression pattern.
  69:  * <LI><code>getMatch</code>: returns the first match found in the
  70:  * input text, or null if no match is found.
  71:  * <LI><code>getAllMatches</code>: returns an array of all
  72:  * non-overlapping matches found in the input text.  If no matches are
  73:  * found, the array is zero-length.
   74:  * <LI><code>substitute</code>: substitute the first occurrence of the
  75:  * pattern in the input text with a replacement string (which may
  76:  * include metacharacters $0-$9, see REMatch.substituteInto).
  77:  * <LI><code>substituteAll</code>: same as above, but repeat for each
  78:  * match before returning.
  79:  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
  80:  * object that allows iteration over the matches (see
  81:  * REMatchEnumeration for some reasons why you may want to do this
   82:  * instead of using <code>getAllMatches</code>).
  83:  * <P>
  84:  *
  85:  * These methods all have similar argument lists.  The input can be a
  86:  * CharIndexed, String, a character array, a StringBuffer, or an
  87:  * InputStream of some sort.  Note that when using an
  88:  * InputStream, the stream read position cannot be guaranteed after
  89:  * attempting a match (this is not a bug, but a consequence of the way
  90:  * regular expressions work).  Using an REMatchEnumeration can
  91:  * eliminate most positioning problems.
  92:  *
  93:  * Although the input object can be of various types, it is recommended
  94:  * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
  95:  * can show the last match found on this input, which helps the expression
  96:  * \G work as the end of the previous match.
  97:  *
  98:  * <P>
  99:  *
 100:  * The optional index argument specifies the offset from the beginning
 101:  * of the text at which the search should start (see the descriptions
 102:  * of some of the execution flags for how this can affect positional
 103:  * pattern operators).  For an InputStream, this means an
 104:  * offset from the current read position, so subsequent calls with the
 105:  * same index argument on an InputStream will not
 106:  * necessarily access the same position on the stream, whereas
 107:  * repeated searches at a given index in a fixed string will return
 108:  * consistent results.
 109:  *
 110:  * <P>
 111:  * You can optionally affect the execution environment by using a
 112:  * combination of execution flags (constants listed below).
 113:  * 
 114:  * <P>
 115:  * All operations on a regular expression are performed in a
 116:  * thread-safe manner.
 117:  *
 118:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 119:  * @version 1.1.5-dev, to be released
 120:  */
 121: 
 122: public class RE extends REToken
 123: {
 124: 
 125:   private static final class IntPair implements Serializable
 126:   {
 127:     public int first, second;
 128:   }
 129: 
 130:   private static final class CharUnit implements Serializable
 131:   {
 132:     public char ch;
 133:     public boolean bk;
 134:   }
 135: 
 136:   // This String will be returned by getVersion()
 137:   private static final String VERSION = "1.1.5-dev";
 138: 
 139:   // The localized strings are kept in a separate file
 140:   // Used by getLocalizedMessage().
 141:   private static ResourceBundle messages;
 142: 
 143:   // Name of the bundle that contains the localized messages.
 144:   private static final String bundle = "gnu/java/util/regex/MessagesBundle";
 145: 
 146:   // These are, respectively, the first and last tokens in our linked list
 147:   // If there is only one token, firstToken == lastToken
 148:   private REToken firstToken, lastToken;
 149: 
 150:   // This is the number of subexpressions in this regular expression,
 151:   // with a minimum value of zero.  Returned by getNumSubs()
 152:   private int numSubs;
 153: 
 154:     /** Minimum length, in characters, of any possible match. */
 155:   private int minimumLength;
 156:   private int maximumLength;
 157: 
 158:   /**
 159:    * Compilation flag. Do  not  differentiate  case.   Subsequent
 160:    * searches  using  this  RE will be case insensitive.
 161:    */
 162:   public static final int REG_ICASE = 0x02;
 163: 
 164:   /**
 165:    * Compilation flag. The match-any-character operator (dot)
 166:    * will match a newline character.  When set this overrides the syntax
 167:    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 168:    * the "/s" operator in Perl.
 169:    */
 170:   public static final int REG_DOT_NEWLINE = 0x04;
 171: 
 172:   /**
 173:    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 174:    * anchors will match based on newlines within the input. This is
 175:    * equivalent to the "/m" operator in Perl.
 176:    */
 177:   public static final int REG_MULTILINE = 0x08;
 178: 
 179:   /**
 180:    * Execution flag.
 181:    * The match-beginning operator (^) will not match at the beginning
 182:    * of the input string. Useful for matching on a substring when you
 183:    * know the context of the input is such that position zero of the
 184:    * input to the match test is not actually position zero of the text.
 185:    * <P>
 186:    * This example demonstrates the results of various ways of matching on
 187:    * a substring.
 188:    * <P>
 189:    * <CODE>
 190:    * String s = "food bar fool";<BR>
 191:    * RE exp = new RE("^foo.");<BR>
 192:    * REMatch m0 = exp.getMatch(s);<BR>
 193:    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 194:    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 195:    * REMatch m3 = exp.getMatch(s,8);                            <BR>
 196:    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 197:    * <P>
 198:    * // Results:<BR>
 199:    * //  m0.toString(): "food"<BR>
 200:    * //  m1.toString(): "fool"<BR>
 201:    * //  m2.toString(): null<BR>
 202:    * //  m3.toString(): null<BR>
 203:    * //  m4.toString(): "fool"<BR>
 204:    * </CODE>
 205:    */
 206:   public static final int REG_NOTBOL = 0x10;
 207: 
 208:   /**
 209:    * Execution flag.
 210:    * The match-end operator ($) does not match at the end
 211:    * of the input string. Useful for matching on substrings.
 212:    */
 213:   public static final int REG_NOTEOL = 0x20;
 214: 
 215:   /**
 216:    * Execution flag.
 217:    * When a match method is invoked that starts matching at a non-zero
 218:    * index into the input, treat the input as if it begins at the index
 219:    * given.  The effect of this flag is that the engine does not "see"
 220:    * any text in the input before the given index.  This is useful so
 221:    * that the match-beginning operator (^) matches not at position 0
 222:    * in the input string, but at the position the search started at
 223:    * (based on the index input given to the getMatch function).  See
 224:    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 225:    * and \b operators.
 226:    */
 227:   public static final int REG_ANCHORINDEX = 0x40;
 228: 
 229:   /**
 230:    * Execution flag.
 231:    * The substitute and substituteAll methods will not attempt to
 232:    * interpolate occurrences of $1-$9 in the replacement text with
 233:    * the corresponding subexpressions.  For example, you may want to
 234:    * replace all matches of "one dollar" with "$1".
 235:    */
 236:   public static final int REG_NO_INTERPOLATE = 0x80;
 237: 
 238:   /**
 239:    * Execution flag.
 240:    * Try to match the whole input string. An implicit match-end operator
 241:    * is added to this regexp.
 242:    */
 243:   public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
 244: 
 245:   /**
 246:    * Execution flag.
 247:    * The substitute and substituteAll methods will treat the
 248:    * character '\' in the replacement as an escape to a literal
 249:    * character. In this case "\n", "\$", "\\", "\x40" and "\012"
 250:    * will become "n", "$", "\", "x40" and "012" respectively.
 251:    * This flag has no effect if REG_NO_INTERPOLATE is set on.
 252:    */
 253:   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 254: 
 255:   /**
 256:    * Compilation flag. Allow whitespace and comments in pattern.
 257:    * This is equivalent to the "/x" operator in Perl.
 258:    */
 259:   public static final int REG_X_COMMENTS = 0x0400;
 260: 
 261:   /**
 262:    * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
 263:    */
 264:   public static final int REG_ICASE_USASCII = 0x0800;
 265: 
 266:   /**
 267:    * Execution flag.
 268:    * Do not move the position at which the search begins.  If not set,
 269:    * the starting position will be moved until a match is found.
 270:    */
 271:   public static final int REG_FIX_STARTING_POSITION = 0x1000;
 272: 
 273:   /** Returns a string representing the version of the gnu.regexp package. */
 274:   public static final String version ()
 275:   {
 276:     return VERSION;
 277:   }
 278: 
 279:   // Retrieves a message from the ResourceBundle
 280:   static final String getLocalizedMessage (String key)
 281:   {
 282:     if (messages == null)
 283:       messages =
 284:     PropertyResourceBundle.getBundle (bundle, Locale.getDefault ());
 285:     return messages.getString (key);
 286:   }
 287: 
 288:   /**
 289:    * Constructs a regular expression pattern buffer without any compilation
 290:    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 291:    *
 292:    * @param pattern A regular expression pattern, in the form of a String,
 293:    *   StringBuffer or char[].  Other input types will be converted to
 294:    *   strings using the toString() method.
 295:    * @exception REException The input pattern could not be parsed.
 296:    * @exception NullPointerException The pattern was null.
 297:    */
 298:   public RE (Object pattern) throws REException
 299:   {
 300:     this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
 301:   }
 302: 
 303:   /**
 304:    * Constructs a regular expression pattern buffer using the specified
 305:    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 306:    *
 307:    * @param pattern A regular expression pattern, in the form of a String,
 308:    *   StringBuffer, or char[].  Other input types will be converted to
 309:    *   strings using the toString() method.
 310:    * @param cflags The logical OR of any combination of the compilation flags listed above.
 311:    * @exception REException The input pattern could not be parsed.
 312:    * @exception NullPointerException The pattern was null.
 313:    */
 314:   public RE (Object pattern, int cflags) throws REException
 315:   {
 316:     this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
 317:   }
 318: 
 319:   /**
 320:    * Constructs a regular expression pattern buffer using the specified
 321:    * compilation flags and regular expression syntax.
 322:    *
 323:    * @param pattern A regular expression pattern, in the form of a String,
 324:    *   StringBuffer, or char[].  Other input types will be converted to
 325:    *   strings using the toString() method.
 326:    * @param cflags The logical OR of any combination of the compilation flags listed above.
 327:    * @param syntax The type of regular expression syntax to use.
 328:    * @exception REException The input pattern could not be parsed.
 329:    * @exception NullPointerException The pattern was null.
 330:    */
 331:   public RE (Object pattern, int cflags, RESyntax syntax) throws REException
 332:   {
 333:     this (pattern, cflags, syntax, 0, 0);
 334:   }
 335: 
 336:   // internal constructor used for alternation
 337:   private RE (REToken first, REToken last, int subs, int subIndex,
 338:           int minLength, int maxLength)
 339:   {
 340:     super (subIndex);
 341:     firstToken = first;
 342:     lastToken = last;
 343:     numSubs = subs;
 344:     minimumLength = minLength;
 345:     maximumLength = maxLength;
 346:     addToken (new RETokenEndSub (subIndex));
 347:   }
 348: 
 349:   private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
 350:           int nextSub) throws REException
 351:   {
 352:     super (myIndex);        // Subexpression index of this token.
 353:     initialize (patternObj, cflags, syntax, myIndex, nextSub);
 354:   }
 355: 
 356:   // For use by subclasses
 357:   protected RE ()
 358:   {
 359:     super (0);
 360:   }
 361: 
 362:   // The meat of construction
 363:   protected void initialize (Object patternObj, int cflags, RESyntax syntax,
 364:                  int myIndex, int nextSub) throws REException
 365:   {
 366:     char[] pattern;
 367:     if (patternObj instanceof String)
 368:       {
 369:     pattern = ((String) patternObj).toCharArray ();
 370:       }
 371:     else if (patternObj instanceof char[])
 372:       {
 373:     pattern = (char[]) patternObj;
 374:       }
 375:     else if (patternObj instanceof StringBuffer)
 376:       {
 377:     pattern = new char[((StringBuffer) patternObj).length ()];
 378:     ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0);
 379:       }
 380:     else if (patternObj instanceof StringBuilder)
 381:       {
 382:     pattern = new char[((StringBuilder) patternObj).length ()];
 383:     ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0);
 384:       }
 385:     else if (patternObj instanceof CPStringBuilder)
 386:       {
 387:     pattern = new char[((CPStringBuilder) patternObj).length ()];
 388:     ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern,
 389:                          0);
 390:       }
 391:     else
 392:       {
 393:     pattern = patternObj.toString ().toCharArray ();
 394:       }
 395: 
 396:     int pLength = pattern.length;
 397: 
 398:     numSubs = 0;        // Number of subexpressions in this token.
 399:     ArrayList < REToken > branches = null;
 400: 
 401:     // linked list of tokens (sort of -- some closed loops can exist)
 402:     firstToken = lastToken = null;
 403: 
 404:     // Precalculate these so we don't pay for the math every time we
 405:     // need to access them.
 406:     boolean insens = ((cflags & REG_ICASE) > 0);
 407:     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 408: 
 409:     // Parse pattern into tokens.  Does anyone know if it's more efficient
 410:     // to use char[] than a String.charAt()?  I'm assuming so.
 411: 
 412:     // index tracks the position in the char array
 413:     int index = 0;
 414: 
 415:     // this will be the current parse character (pattern[index])
 416:     CharUnit unit = new CharUnit ();
 417: 
 418:     // This is used for {x,y} calculations
 419:     IntPair minMax = new IntPair ();
 420: 
 421:     // Buffer a token so we can create a TokenRepeated, etc.
 422:     REToken currentToken = null;
 423:     boolean quot = false;
 424: 
 425:     // Saved syntax and flags.
 426:     RESyntax savedSyntax = null;
 427:     int savedCflags = 0;
 428:     boolean flagsSaved = false;
 429: 
 430:     while (index < pLength)
 431:       {
 432:     // read the next character unit (including backslash escapes)
 433:     index = getCharUnit (pattern, index, unit, quot);
 434: 
 435:     if (unit.bk)
 436:       if (unit.ch == 'Q')
 437:         {
 438:           quot = true;
 439:           continue;
 440:         }
 441:       else if (unit.ch == 'E')
 442:         {
 443:           quot = false;
 444:           continue;
 445:         }
 446:     if (quot)
 447:       unit.bk = false;
 448: 
 449:     if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot))
 450:       {
 451:         if (Character.isWhitespace (unit.ch))
 452:           {
 453:         continue;
 454:           }
 455:         if (unit.ch == '#')
 456:           {
 457:         for (int i = index; i < pLength; i++)
 458:           {
 459:             if (pattern[i] == '\n')
 460:               {
 461:             index = i + 1;
 462:             continue;
 463:               }
 464:             else if (pattern[i] == '\r')
 465:               {
 466:             if (i + 1 < pLength && pattern[i + 1] == '\n')
 467:               {
 468:                 index = i + 2;
 469:               }
 470:             else
 471:               {
 472:                 index = i + 1;
 473:               }
 474:             continue;
 475:               }
 476:           }
 477:         index = pLength;
 478:         continue;
 479:           }
 480:       }
 481: 
 482:     // ALTERNATION OPERATOR
 483:     //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 484:     //  not available if RE_LIMITED_OPS is set
 485: 
 486:     // TODO: the '\n' literal here should be a test against REToken.newline,
 487:     // which unfortunately may be more than a single character.
 488:     if (((unit.ch == '|'
 489:           && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
 490:          || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n')
 491:          && !(unit.bk || quot)))
 492:         && !syntax.get (RESyntax.RE_LIMITED_OPS))
 493:       {
 494:         // make everything up to here be a branch. create vector if nec.
 495:         addToken (currentToken);
 496:         RE theBranch =
 497:           new RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
 498:               maximumLength);
 499:         minimumLength = 0;
 500:         maximumLength = 0;
 501:         if (branches == null)
 502:           {
 503:         branches = new ArrayList < REToken > ();
 504:           }
 505:         branches.add (theBranch);
 506:         firstToken = lastToken = currentToken = null;
 507:       }
 508: 
 509:     // INTERVAL OPERATOR:
 510:     //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 511:     //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 512:     //
 513:     // OPEN QUESTION: 
 514:     //  what is proper interpretation of '{' at start of string?
 515:     //
 516:     // This method used to check "repeat.empty.token" to avoid such regexp
 517:     // as "(a*){2,}", but now "repeat.empty.token" is allowed.
 518: 
 519:     else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS)
 520:          && (syntax.
 521:              get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot)))
 522:       {
 523:         int newIndex = getMinMax (pattern, index, minMax, syntax);
 524:         if (newIndex > index)
 525:           {
 526:         if (minMax.first > minMax.second)
 527:           throw new
 528:             REException (getLocalizedMessage ("interval.order"),
 529:                  REException.REG_BADRPT, newIndex);
 530:         if (currentToken == null)
 531:           throw new
 532:             REException (getLocalizedMessage ("repeat.no.token"),
 533:                  REException.REG_BADRPT, newIndex);
 534:         if (currentToken instanceof RETokenRepeated)
 535:           throw new
 536:             REException (getLocalizedMessage ("repeat.chained"),
 537:                  REException.REG_BADRPT, newIndex);
 538:         if (currentToken instanceof RETokenWordBoundary
 539:             || currentToken instanceof RETokenWordBoundary)
 540:           throw new
 541:             REException (getLocalizedMessage ("repeat.assertion"),
 542:                  REException.REG_BADRPT, newIndex);
 543:         index = newIndex;
 544:         currentToken =
 545:           setRepeated (currentToken, minMax.first, minMax.second,
 546:                    index);
 547:           }
 548:         else
 549:           {
 550:         addToken (currentToken);
 551:         currentToken = new RETokenChar (subIndex, unit.ch, insens);
 552:         if (insensUSASCII)
 553:           currentToken.unicodeAware = false;
 554:           }
 555:       }
 556: 
 557:     // LIST OPERATOR:
 558:     //  [...] | [^...]
 559: 
 560:     else if ((unit.ch == '[') && !(unit.bk || quot))
 561:       {
 562:         // Create a new RETokenOneOf
 563:         ParseCharClassResult result =
 564:           parseCharClass (subIndex, pattern, index, pLength, cflags,
 565:                   syntax, 0);
 566:         addToken (currentToken);
 567:         currentToken = result.token;
 568:         index = result.index;
 569:       }
 570: 
 571:     // SUBEXPRESSIONS
 572:     //  (...) | \(...\) depending on RE_NO_BK_PARENS
 573: 
 574:     else if ((unit.ch == '(')
 575:          && (syntax.
 576:              get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 577:       {
 578:         boolean pure = false;
 579:         boolean comment = false;
 580:         boolean lookAhead = false;
 581:         boolean lookBehind = false;
 582:         boolean independent = false;
 583:         boolean negativelh = false;
 584:         boolean negativelb = false;
 585:         if ((index + 1 < pLength) && (pattern[index] == '?'))
 586:           {
 587:         switch (pattern[index + 1])
 588:           {
 589:           case '!':
 590:             if (syntax.get (RESyntax.RE_LOOKAHEAD))
 591:               {
 592:             pure = true;
 593:             negativelh = true;
 594:             lookAhead = true;
 595:             index += 2;
 596:               }
 597:             break;
 598:           case '=':
 599:             if (syntax.get (RESyntax.RE_LOOKAHEAD))
 600:               {
 601:             pure = true;
 602:             lookAhead = true;
 603:             index += 2;
 604:               }
 605:             break;
 606:           case '<':
 607:             // We assume that if the syntax supports look-ahead,
 608:             // it also supports look-behind.
 609:             if (syntax.get (RESyntax.RE_LOOKAHEAD))
 610:               {
 611:             index++;
 612:             switch (pattern[index + 1])
 613:               {
 614:               case '!':
 615:                 pure = true;
 616:                 negativelb = true;
 617:                 lookBehind = true;
 618:                 index += 2;
 619:                 break;
 620:               case '=':
 621:                 pure = true;
 622:                 lookBehind = true;
 623:                 index += 2;
 624:               }
 625:               }
 626:             break;
 627:           case '>':
 628:             // We assume that if the syntax supports look-ahead,
 629:             // it also supports independent group.
 630:             if (syntax.get (RESyntax.RE_LOOKAHEAD))
 631:               {
 632:             pure = true;
 633:             independent = true;
 634:             index += 2;
 635:               }
 636:             break;
 637:           case 'i':
 638:           case 'd':
 639:           case 'm':
 640:           case 's':
 641:           case 'u':
 642:           case 'x':
 643:           case '-':
 644:             if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS))
 645:               break;
 646:             // Set or reset syntax flags.
 647:             int flagIndex = index + 1;
 648:             int endFlag = -1;
 649:             RESyntax newSyntax = new RESyntax (syntax);
 650:             int newCflags = cflags;
 651:             boolean negate = false;
 652:             while (flagIndex < pLength && endFlag < 0)
 653:               {
 654:             switch (pattern[flagIndex])
 655:               {
 656:               case 'i':
 657:                 if (negate)
 658:                   newCflags &= ~REG_ICASE;
 659:                 else
 660:                   newCflags |= REG_ICASE;
 661:                 flagIndex++;
 662:                 break;
 663:               case 'd':
 664:                 if (negate)
 665:                   newSyntax.setLineSeparator (RESyntax.
 666:                               DEFAULT_LINE_SEPARATOR);
 667:                 else
 668:                   newSyntax.setLineSeparator ("\n");
 669:                 flagIndex++;
 670:                 break;
 671:               case 'm':
 672:                 if (negate)
 673:                   newCflags &= ~REG_MULTILINE;
 674:                 else
 675:                   newCflags |= REG_MULTILINE;
 676:                 flagIndex++;
 677:                 break;
 678:               case 's':
 679:                 if (negate)
 680:                   newCflags &= ~REG_DOT_NEWLINE;
 681:                 else
 682:                   newCflags |= REG_DOT_NEWLINE;
 683:                 flagIndex++;
 684:                 break;
 685:               case 'u':
 686:                 if (negate)
 687:                   newCflags |= REG_ICASE_USASCII;
 688:                 else
 689:                   newCflags &= ~REG_ICASE_USASCII;
 690:                 flagIndex++;
 691:                 break;
 692:               case 'x':
 693:                 if (negate)
 694:                   newCflags &= ~REG_X_COMMENTS;
 695:                 else
 696:                   newCflags |= REG_X_COMMENTS;
 697:                 flagIndex++;
 698:                 break;
 699:               case '-':
 700:                 negate = true;
 701:                 flagIndex++;
 702:                 break;
 703:               case ':':
 704:               case ')':
 705:                 endFlag = pattern[flagIndex];
 706:                 break;
 707:               default:
 708:                 throw new
 709:                   REException (getLocalizedMessage
 710:                        ("repeat.no.token"),
 711:                        REException.REG_BADRPT, index);
 712:               }
 713:               }
 714:             if (endFlag == ')')
 715:               {
 716:             syntax = newSyntax;
 717:             cflags = newCflags;
 718:             insens = ((cflags & REG_ICASE) > 0);
 719:             insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 720:             // This can be treated as though it were a comment.
 721:             comment = true;
 722:             index = flagIndex - 1;
 723:             break;
 724:               }
 725:             if (endFlag == ':')
 726:               {
 727:             savedSyntax = syntax;
 728:             savedCflags = cflags;
 729:             flagsSaved = true;
 730:             syntax = newSyntax;
 731:             cflags = newCflags;
 732:             insens = ((cflags & REG_ICASE) > 0);
 733:             insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 734:             index = flagIndex - 1;
 735:             // Fall through to the next case.
 736:               }
 737:             else
 738:               {
 739:             throw new
 740:               REException (getLocalizedMessage
 741:                        ("unmatched.paren"),
 742:                        REException.REG_ESUBREG, index);
 743:               }
 744:           case ':':
 745:             if (syntax.get (RESyntax.RE_PURE_GROUPING))
 746:               {
 747:             pure = true;
 748:             index += 2;
 749:               }
 750:             break;
 751:           case '#':
 752:             if (syntax.get (RESyntax.RE_COMMENTS))
 753:               {
 754:             comment = true;
 755:               }
 756:             break;
 757:           default:
 758:             throw new
 759:               REException (getLocalizedMessage ("repeat.no.token"),
 760:                    REException.REG_BADRPT, index);
 761:           }
 762:           }
 763: 
 764:         if (index >= pLength)
 765:           {
 766:         throw new
 767:           REException (getLocalizedMessage ("unmatched.paren"),
 768:                    REException.REG_ESUBREG, index);
 769:           }
 770: 
 771:         // find end of subexpression
 772:         int endIndex = index;
 773:         int nextIndex = index;
 774:         int nested = 0;
 775: 
 776:         while (((nextIndex =
 777:              getCharUnit (pattern, endIndex, unit, false)) > 0)
 778:            && !(nested == 0 && (unit.ch == ')')
 779:             && (syntax.
 780:                 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
 781:                                   || quot))))
 782:           {
 783:         if ((endIndex = nextIndex) >= pLength)
 784:           throw new
 785:             REException (getLocalizedMessage ("subexpr.no.end"),
 786:                  REException.REG_ESUBREG, nextIndex);
 787:         else
 788:           if ((unit.ch == '[') && !(unit.bk || quot))
 789:         {
 790:           // I hate to do something similar to the LIST OPERATOR matters
 791:           // above, but ...
 792:           int listIndex = nextIndex;
 793:           if (listIndex < pLength && pattern[listIndex] == '^')
 794:             listIndex++;
 795:           if (listIndex < pLength && pattern[listIndex] == ']')
 796:             listIndex++;
 797:           int listEndIndex = -1;
 798:           int listNest = 0;
 799:           while (listIndex < pLength && listEndIndex < 0)
 800:             {
 801:               switch (pattern[listIndex++])
 802:             {
 803:             case '\\':
 804:               listIndex++;
 805:               break;
 806:             case '[':
 807:               // Sun's API document says that regexp like "[a-d[m-p]]"
 808:               // is legal. Even something like "[[[^]]]]" is accepted.
 809:               listNest++;
 810:               if (listIndex < pLength
 811:                   && pattern[listIndex] == '^')
 812:                 listIndex++;
 813:               if (listIndex < pLength
 814:                   && pattern[listIndex] == ']')
 815:                 listIndex++;
 816:               break;
 817:             case ']':
 818:               if (listNest == 0)
 819:                 listEndIndex = listIndex;
 820:               listNest--;
 821:               break;
 822:             }
 823:             }
 824:           if (listEndIndex >= 0)
 825:             {
 826:               nextIndex = listEndIndex;
 827:               if ((endIndex = nextIndex) >= pLength)
 828:             throw new
 829:               REException (getLocalizedMessage ("subexpr.no.end"),
 830:                        REException.REG_ESUBREG, nextIndex);
 831:               else
 832:               continue;
 833:             }
 834:           throw new
 835:             REException (getLocalizedMessage ("subexpr.no.end"),
 836:                  REException.REG_ESUBREG, nextIndex);
 837:         }
 838:           else if (unit.ch == '('
 839:                && (syntax.
 840:                get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
 841:                                  || quot)))
 842:         nested++;
 843:           else if (unit.ch == ')'
 844:                && (syntax.
 845:                get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
 846:                                  || quot)))
 847:         nested--;
 848:           }
 849: 
 850:         // endIndex is now position at a ')','\)' 
 851:         // nextIndex is end of string or position after ')' or '\)'
 852: 
 853:         if (comment)
 854:           index = nextIndex;
 855:         else
 856:           {            // not a comment
 857:         // create RE subexpression as token.
 858:         addToken (currentToken);
 859:         if (!pure)
 860:           {
 861:             numSubs++;
 862:           }
 863: 
 864:         int useIndex = (pure || lookAhead || lookBehind
 865:                 || independent) ? 0 : nextSub + numSubs;
 866:         currentToken =
 867:           new RE (String.valueOf (pattern, index, endIndex - index).
 868:               toCharArray (), cflags, syntax, useIndex,
 869:               nextSub + numSubs);
 870:         numSubs += ((RE) currentToken).getNumSubs ();
 871: 
 872:         if (lookAhead)
 873:           {
 874:             currentToken =
 875:               new RETokenLookAhead (currentToken, negativelh);
 876:           }
 877:         else if (lookBehind)
 878:           {
 879:             currentToken =
 880:               new RETokenLookBehind (currentToken, negativelb);
 881:           }
 882:         else if (independent)
 883:           {
 884:             currentToken = new RETokenIndependent (currentToken);
 885:           }
 886: 
 887:         index = nextIndex;
 888:         if (flagsSaved)
 889:           {
 890:             syntax = savedSyntax;
 891:             cflags = savedCflags;
 892:             insens = ((cflags & REG_ICASE) > 0);
 893:             insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 894:             flagsSaved = false;
 895:           }
 896:           }            // not a comment
 897:       }            // subexpression
 898: 
 899:     // UNMATCHED RIGHT PAREN
 900:     // ) or \) throw exception if
 901:     // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 902:     else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 903:          && ((unit.ch == ')')
 904:              && (syntax.
 905:              get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))))
 906:       {
 907:         throw new REException (getLocalizedMessage ("unmatched.paren"),
 908:                    REException.REG_EPAREN, index);
 909:       }
 910: 
 911:     // START OF LINE OPERATOR
 912:     //  ^
 913: 
 914:     else if ((unit.ch == '^') && !(unit.bk || quot))
 915:       {
 916:         addToken (currentToken);
 917:         currentToken = null;
 918:         RETokenStart token = null;
 919:         if ((cflags & REG_MULTILINE) > 0)
 920:           {
 921:         String sep = syntax.getLineSeparator ();
 922:         if (sep == null)
 923:           {
 924:             token = new RETokenStart (subIndex, null, true);
 925:           }
 926:         else
 927:           {
 928:             token = new RETokenStart (subIndex, sep);
 929:           }
 930:           }
 931:         else
 932:           {
 933:         token = new RETokenStart (subIndex, null);
 934:           }
 935:         addToken (token);
 936:       }
 937: 
 938:     // END OF LINE OPERATOR
 939:     //  $
 940: 
 941:     else if ((unit.ch == '$') && !(unit.bk || quot))
 942:       {
 943:         addToken (currentToken);
 944:         currentToken = null;
 945:         RETokenEnd token = null;
 946:         if ((cflags & REG_MULTILINE) > 0)
 947:           {
 948:         String sep = syntax.getLineSeparator ();
 949:         if (sep == null)
 950:           {
 951:             token = new RETokenEnd (subIndex, null, true);
 952:           }
 953:         else
 954:           {
 955:             token = new RETokenEnd (subIndex, sep);
 956:           }
 957:           }
 958:         else
 959:           {
 960:         token = new RETokenEnd (subIndex, null);
 961:           }
 962:         addToken (token);
 963:       }
 964: 
 965:     // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 966:     //  .
 967: 
 968:     else if ((unit.ch == '.') && !(unit.bk || quot))
 969:       {
 970:         addToken (currentToken);
 971:         currentToken =
 972:           new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE)
 973:                   || ((cflags & REG_DOT_NEWLINE) > 0),
 974:                   syntax.get (RESyntax.RE_DOT_NOT_NULL));
 975:       }
 976: 
 977:     // ZERO-OR-MORE REPEAT OPERATOR
 978:     //  *
 979:     //
 980:     // This method used to check "repeat.empty.token" to avoid such regexp
 981:     // as "(a*)*", but now "repeat.empty.token" is allowed.
 982: 
 983:     else if ((unit.ch == '*') && !(unit.bk || quot))
 984:       {
 985:         if (currentToken == null)
 986:           throw new REException (getLocalizedMessage ("repeat.no.token"),
 987:                      REException.REG_BADRPT, index);
 988:         if (currentToken instanceof RETokenRepeated)
 989:           throw new REException (getLocalizedMessage ("repeat.chained"),
 990:                      REException.REG_BADRPT, index);
 991:         if (currentToken instanceof RETokenWordBoundary
 992:         || currentToken instanceof RETokenWordBoundary)
 993:           throw new REException (getLocalizedMessage ("repeat.assertion"),
 994:                      REException.REG_BADRPT, index);
 995:         currentToken =
 996:           setRepeated (currentToken, 0, Integer.MAX_VALUE, index);
 997:       }
 998: 
 999:     // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
1000:     //  + | \+ depending on RE_BK_PLUS_QM
1001:     //  not available if RE_LIMITED_OPS is set
1002:     //
1003:     // This method used to check "repeat.empty.token" to avoid such regexp
1004:     // as "(a*)+", but now "repeat.empty.token" is allowed.
1005: 
1006:     else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1007:          && (!syntax.
1008:              get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1009:       {
1010:         if (currentToken == null)
1011:           throw new REException (getLocalizedMessage ("repeat.no.token"),
1012:                      REException.REG_BADRPT, index);
1013: 
1014:         // Check for possessive matching on RETokenRepeated
1015:         if (currentToken instanceof RETokenRepeated)
1016:           {
1017:         RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1018:         if (syntax.get (RESyntax.RE_POSSESSIVE_OPS)
1019:             && !tokenRep.isPossessive () && !tokenRep.isStingy ())
1020:           tokenRep.makePossessive ();
1021:         else
1022:           throw new
1023:             REException (getLocalizedMessage ("repeat.chained"),
1024:                  REException.REG_BADRPT, index);
1025: 
1026:           }
1027:         else if (currentToken instanceof RETokenWordBoundary
1028:              || currentToken instanceof RETokenWordBoundary)
1029:           throw new REException (getLocalizedMessage ("repeat.assertion"),
1030:                      REException.REG_BADRPT, index);
1031:         else
1032:         currentToken =
1033:           setRepeated (currentToken, 1, Integer.MAX_VALUE, index);
1034:       }
1035: 
1036:     // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1037:     //  ? | \? depending on RE_BK_PLUS_QM
1038:     //  not available if RE_LIMITED_OPS is set
1039:     //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1040: 
1041:     else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1042:          && (!syntax.
1043:              get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1044:       {
1045:         if (currentToken == null)
1046:           throw new REException (getLocalizedMessage ("repeat.no.token"),
1047:                      REException.REG_BADRPT, index);
1048: 
1049:         // Check for stingy matching on RETokenRepeated
1050:         if (currentToken instanceof RETokenRepeated)
1051:           {
1052:         RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1053:         if (syntax.get (RESyntax.RE_STINGY_OPS)
1054:             && !tokenRep.isStingy () && !tokenRep.isPossessive ())
1055:           tokenRep.makeStingy ();
1056:         else
1057:           throw new
1058:             REException (getLocalizedMessage ("repeat.chained"),
1059:                  REException.REG_BADRPT, index);
1060:           }
1061:         else if (currentToken instanceof RETokenWordBoundary
1062:              || currentToken instanceof RETokenWordBoundary)
1063:           throw new REException (getLocalizedMessage ("repeat.assertion"),
1064:                      REException.REG_BADRPT, index);
1065:         else
1066:         currentToken = setRepeated (currentToken, 0, 1, index);
1067:       }
1068: 
1069:     // OCTAL CHARACTER
1070:     //  \0377
1071: 
1072:     else if (unit.bk && (unit.ch == '0')
1073:          && syntax.get (RESyntax.RE_OCTAL_CHAR))
1074:       {
1075:         CharExpression ce =
1076:           getCharExpression (pattern, index - 2, pLength, syntax);
1077:         if (ce == null)
1078:           throw new REException ("invalid octal character",
1079:                      REException.REG_ESCAPE, index);
1080:         index = index - 2 + ce.len;
1081:         addToken (currentToken);
1082:         currentToken = new RETokenChar (subIndex, ce.ch, insens);
1083:         if (insensUSASCII)
1084:           currentToken.unicodeAware = false;
1085:       }
1086: 
1087:     // BACKREFERENCE OPERATOR
1088:     //  \1 \2 ... \9 and \10 \11 \12 ...
1089:     // not available if RE_NO_BK_REFS is set
1090:     // Perl recognizes \10, \11, and so on only if enough
1091:     // parentheses have been opened before them; otherwise they are treated
1092:     // as aliases of \010, \011, ... (octal characters).  In case of
1093:     // Sun's JDK, octal character expression must always begin with \0.
1094:     // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
1095:     // JDK treats \2 as a back reference to the 2nd group because
1096:     // there are only two groups. But in our poor implementation,
1097:     // we cannot help but treat \29 as a back reference to the 29th group.
1098: 
1099:     else if (unit.bk && Character.isDigit (unit.ch)
1100:          && !syntax.get (RESyntax.RE_NO_BK_REFS))
1101:       {
1102:         addToken (currentToken);
1103:         int numBegin = index - 1;
1104:         int numEnd = pLength;
1105:         for (int i = index; i < pLength; i++)
1106:           {
1107:         if (!Character.isDigit (pattern[i]))
1108:           {
1109:             numEnd = i;
1110:             break;
1111:           }
1112:           }
1113:         int num = parseInt (pattern, numBegin, numEnd - numBegin, 10);
1114: 
1115:         currentToken = new RETokenBackRef (subIndex, num, insens);
1116:         if (insensUSASCII)
1117:           currentToken.unicodeAware = false;
1118:         index = numEnd;
1119:       }
1120: 
1121:     // START OF STRING OPERATOR
1122:     //  \A if RE_STRING_ANCHORS is set
1123: 
1124:     else if (unit.bk && (unit.ch == 'A')
1125:          && syntax.get (RESyntax.RE_STRING_ANCHORS))
1126:       {
1127:         addToken (currentToken);
1128:         currentToken = new RETokenStart (subIndex, null);
1129:       }
1130: 
1131:     // WORD BREAK OPERATOR
1132:     //  \b if RE_STRING_ANCHORS is set
1133: 
1134:     else if (unit.bk && (unit.ch == 'b')
1135:          && syntax.get (RESyntax.RE_STRING_ANCHORS))
1136:       {
1137:         addToken (currentToken);
1138:         currentToken =
1139:           new RETokenWordBoundary (subIndex,
1140:                        RETokenWordBoundary.
1141:                        BEGIN | RETokenWordBoundary.END,
1142:                        false);
1143:       }
1144: 
1145:     // WORD BEGIN OPERATOR 
1146:     //  \< (no RESyntax bit is tested for this operator)
1147:     else if (unit.bk && (unit.ch == '<'))
1148:       {
1149:         addToken (currentToken);
1150:         currentToken =
1151:           new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN,
1152:                        false);
1153:       }
1154: 
1155:     // WORD END OPERATOR 
1156:     //  \> (no RESyntax bit is tested for this operator)
1157:     else if (unit.bk && (unit.ch == '>'))
1158:       {
1159:         addToken (currentToken);
1160:         currentToken =
1161:           new RETokenWordBoundary (subIndex, RETokenWordBoundary.END,
1162:                        false);
1163:       }
1164: 
1165:     // NON-WORD BREAK OPERATOR
1166:     // \B if RE_STRING_ANCHORS is set
1167: 
1168:     else if (unit.bk && (unit.ch == 'B')
1169:          && syntax.get (RESyntax.RE_STRING_ANCHORS))
1170:       {
1171:         addToken (currentToken);
1172:         currentToken =
1173:           new RETokenWordBoundary (subIndex,
1174:                        RETokenWordBoundary.
1175:                        BEGIN | RETokenWordBoundary.END, true);
1176:       }
1177: 
1178: 
1179:     // DIGIT OPERATOR
1180:     //  \d if RE_CHAR_CLASS_ESCAPES is set
1181: 
1182:     else if (unit.bk && (unit.ch == 'd')
1183:          && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1184:       {
1185:         addToken (currentToken);
1186:         currentToken =
1187:           new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false);
1188:         if (insensUSASCII)
1189:           currentToken.unicodeAware = false;
1190:       }
1191: 
1192:     // NON-DIGIT OPERATOR
1193:     //  \D
1194: 
1195:     else if (unit.bk && (unit.ch == 'D')
1196:          && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1197:       {
1198:         addToken (currentToken);
1199:         currentToken =
1200:           new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true);
1201:         if (insensUSASCII)
1202:           currentToken.unicodeAware = false;
1203:       }
1204: 
1205:     // NEWLINE ESCAPE
1206:     //  \n
1207: 
1208:     else if (unit.bk && (unit.ch == 'n'))
1209:       {
1210:         addToken (currentToken);
1211:         currentToken = new RETokenChar (subIndex, '\n', false);
1212:       }
1213: 
1214:     // RETURN ESCAPE
1215:     //  \r
1216: 
1217:     else if (unit.bk && (unit.ch == 'r'))
1218:       {
1219:         addToken (currentToken);
1220:         currentToken = new RETokenChar (subIndex, '\r', false);
1221:       }
1222: 
1223:     // WHITESPACE OPERATOR
1224:     //  \s if RE_CHAR_CLASS_ESCAPES is set
1225: 
1226:     else if (unit.bk && (unit.ch == 's')
1227:          && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1228:       {
1229:         addToken (currentToken);
1230:         currentToken =
1231:           new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false);
1232:         if (insensUSASCII)
1233:           currentToken.unicodeAware = false;
1234:       }
1235: 
1236:     // NON-WHITESPACE OPERATOR
1237:     //  \S
1238: 
1239:     else if (unit.bk && (unit.ch == 'S')
1240:          && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1241:       {
1242:         addToken (currentToken);
1243:         currentToken =
1244:           new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true);
1245:         if (insensUSASCII)
1246:           currentToken.unicodeAware = false;
1247:       }
1248: 
1249:     // TAB ESCAPE
1250:     //  \t
1251: 
1252:     else if (unit.bk && (unit.ch == 't'))
1253:       {
1254:         addToken (currentToken);
1255:         currentToken = new RETokenChar (subIndex, '\t', false);
1256:       }
1257: 
1258:     // ALPHANUMERIC OPERATOR
1259:     //  \w
1260: 
1261:     else if (unit.bk && (unit.ch == 'w')
1262:          && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1263:       {
1264:         addToken (currentToken);
1265:         currentToken =
1266:           new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false);
1267:         if (insensUSASCII)
1268:           currentToken.unicodeAware = false;
1269:       }
1270: 
1271:     // NON-ALPHANUMERIC OPERATOR
1272:     //  \W
1273: 
1274:     else if (unit.bk && (unit.ch == 'W')
1275:          && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1276:       {
1277:         addToken (currentToken);
1278:         currentToken =
1279:           new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true);
1280:         if (insensUSASCII)
1281:           currentToken.unicodeAware = false;
1282:       }
1283: 
1284:     // END OF STRING OPERATOR
1285:     //  \Z, \z
1286: 
1287:     // FIXME: \Z and \z are different in that if the input string
1288:     // ends with a line terminator, \Z matches the position before
1289:     // the final terminator.  This special behavior of \Z is yet
1290:     // to be implemented.
1291: 
1292:     else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1293:          syntax.get (RESyntax.RE_STRING_ANCHORS))
1294:       {
1295:         addToken (currentToken);
1296:         currentToken = new RETokenEnd (subIndex, null);
1297:       }
1298: 
1299:     // HEX CHARACTER, UNICODE CHARACTER
1300:     //  \x1B, \u1234
1301: 
1302:     else
1303:       if ((unit.bk && (unit.ch == 'x')
1304:            && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk
1305:                              && (unit.ch == 'u')
1306:                              && syntax.
1307:                              get (RESyntax.
1308:                                   RE_UNICODE_CHAR)))
1309:       {
1310:         CharExpression ce =
1311:           getCharExpression (pattern, index - 2, pLength, syntax);
1312:         if (ce == null)
1313:           throw new REException ("invalid hex character",
1314:                      REException.REG_ESCAPE, index);
1315:         index = index - 2 + ce.len;
1316:         addToken (currentToken);
1317:         currentToken = new RETokenChar (subIndex, ce.ch, insens);
1318:         if (insensUSASCII)
1319:           currentToken.unicodeAware = false;
1320:       }
1321: 
1322:     // NAMED PROPERTY
1323:     // \p{prop}, \P{prop}
1324: 
1325:     else
1326:       if ((unit.bk && (unit.ch == 'p')
1327:            && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk
1328:                                    && (unit.ch ==
1329:                                    'P')
1330:                                    && syntax.
1331:                                    get (RESyntax.
1332:                                     RE_NAMED_PROPERTY)))
1333:       {
1334:         NamedProperty np = getNamedProperty (pattern, index - 2, pLength);
1335:         if (np == null)
1336:           throw new REException ("invalid escape sequence",
1337:                      REException.REG_ESCAPE, index);
1338:         index = index - 2 + np.len;
1339:         addToken (currentToken);
1340:         currentToken =
1341:           getRETokenNamedProperty (subIndex, np, insens, index);
1342:         if (insensUSASCII)
1343:           currentToken.unicodeAware = false;
1344:       }
1345: 
1346:     // END OF PREVIOUS MATCH
1347:     //  \G
1348: 
1349:     else if (unit.bk && (unit.ch == 'G') &&
1350:          syntax.get (RESyntax.RE_STRING_ANCHORS))
1351:       {
1352:         addToken (currentToken);
1353:         currentToken = new RETokenEndOfPreviousMatch (subIndex);
1354:       }
1355: 
1356:     // NON-SPECIAL CHARACTER (or escape to make literal)
1357:     //  c | \* for example
1358: 
1359:     else
1360:       {            // not a special character
1361:         addToken (currentToken);
1362:         currentToken = new RETokenChar (subIndex, unit.ch, insens);
1363:         if (insensUSASCII)
1364:           currentToken.unicodeAware = false;
1365:       }
1366:       }                // end while
1367: 
1368:     // Add final buffered token and an EndSub marker
1369:     addToken (currentToken);
1370: 
1371:     if (branches != null)
1372:       {
1373:     branches.
1374:       add (new
1375:            RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
1376:            maximumLength));
1377:     branches.trimToSize ();    // compact the Vector
1378:     minimumLength = 0;
1379:     maximumLength = 0;
1380:     firstToken = lastToken = null;
1381:     addToken (new RETokenOneOf (subIndex, branches, false));
1382:       }
1383:     else
1384:       addToken (new RETokenEndSub (subIndex));
1385: 
1386:   }
1387: 
1388:   private static class ParseCharClassResult // value handed back by parseCharClass
1389:   {
1390:     RETokenOneOf token; // the RETokenOneOf built for the parsed character list
1391:     int index; // pattern index at which parseCharClass stopped parsing
1392:     boolean returnAtAndOperator = false; // set to true when parsing stopped at an "&&" operator
1393:   }
1394: 
1395:   /**
1396:    * Parse [...] or [^...] and make an RETokenOneOf instance.
1397:    * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1398:    * @param pattern Input array of characters to be parsed.
1399:    * @param index Index pointing to the character next to the beginning '['.
1400:    * @param pLength Limit of the input array.
1401:    * @param cflags Compilation flags used to parse the pattern.
1402:    * @param pflags Flags that affect the behavior of this method.
1403:    * @param syntax Syntax used to parse the pattern.
1404:    */
1405:   private static ParseCharClassResult parseCharClass (int subIndex,
1406:                               char[]pattern,
1407:                               int index, int pLength,
1408:                               int cflags,
1409:                               RESyntax syntax,
1410:                               int pflags) throws
1411:     REException
1412:   {
1413: 
1414:     boolean insens = ((cflags & REG_ICASE) > 0);
1415:     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1416:     final ArrayList < REToken > options = new ArrayList < REToken > ();
1417:       ArrayList < Object > addition = new ArrayList < Object > ();
1418:     boolean additionAndAppeared = false;
1419:     final int RETURN_AT_AND = 0x01;
1420:     boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1421:     boolean negative = false;
1422:     char ch;
1423: 
1424:     char lastChar = 0;
1425:     boolean lastCharIsSet = false;
1426:     if (index == pLength)
1427:       throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428:                  REException.REG_EBRACK, index);
1429: 
1430:     // Check for initial caret, negation
1431:     if ((ch = pattern[index]) == '^')
1432:       {
1433:     negative = true;
1434:     if (++index == pLength)
1435:       throw new REException (getLocalizedMessage ("class.no.end"),
1436:                  REException.REG_EBRACK, index);
1437:       ch = pattern[index];
1438:       }
1439: 
1440:     // Check for leading right bracket literal
1441:     if (ch == ']')
1442:       {
1443:     lastChar = ch;
1444:     lastCharIsSet = true;
1445:     if (++index == pLength)
1446:       throw new REException (getLocalizedMessage ("class.no.end"),
1447:                  REException.REG_EBRACK, index);
1448:       }
1449: 
1450:     while ((ch = pattern[index++]) != ']')
1451:       {
1452:     if ((ch == '-') && (lastCharIsSet))
1453:       {
1454:         if (index == pLength)
1455:           throw new REException (getLocalizedMessage ("class.no.end"),
1456:                      REException.REG_EBRACK, index);
1457:         if ((ch = pattern[index]) == ']')
1458:           {
1459:         RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1460:         if (insensUSASCII)
1461:           t.unicodeAware = false;
1462:         options.add (t);
1463:         lastChar = '-';
1464:           }
1465:         else
1466:           {
1467:         if ((ch == '\\')
1468:             && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1469:           {
1470:             CharExpression ce =
1471:               getCharExpression (pattern, index, pLength, syntax);
1472:             if (ce == null)
1473:               throw new REException ("invalid escape sequence",
1474:                          REException.REG_ESCAPE, index);
1475:             ch = ce.ch;
1476:             index = index + ce.len - 1;
1477:           }
1478:         RETokenRange t =
1479:           new RETokenRange (subIndex, lastChar, ch, insens);
1480:         if (insensUSASCII)
1481:           t.unicodeAware = false;
1482:         options.add (t);
1483:         lastChar = 0;
1484:         lastCharIsSet = false;
1485:         index++;
1486:           }
1487:       }
1488:     else if ((ch == '\\')
1489:          && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1490:       {
1491:         if (index == pLength)
1492:           throw new REException (getLocalizedMessage ("class.no.end"),
1493:                      REException.REG_EBRACK, index);
1494:         int posixID = -1;
1495:         boolean negate = false;
1496:         char asciiEsc = 0;
1497:         boolean asciiEscIsSet = false;
1498:         NamedProperty np = null;
1499:         if (("dswDSW".indexOf (pattern[index]) != -1)
1500:         && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS))
1501:           {
1502:         switch (pattern[index])
1503:           {
1504:           case 'D':
1505:             negate = true;
1506:           case 'd':
1507:             posixID = RETokenPOSIX.DIGIT;
1508:             break;
1509:           case 'S':
1510:             negate = true;
1511:           case 's':
1512:             posixID = RETokenPOSIX.SPACE;
1513:             break;
1514:           case 'W':
1515:             negate = true;
1516:           case 'w':
1517:             posixID = RETokenPOSIX.ALNUM;
1518:             break;
1519:           }
1520:           }
1521:         if (("pP".indexOf (pattern[index]) != -1)
1522:         && syntax.get (RESyntax.RE_NAMED_PROPERTY))
1523:           {
1524:         np = getNamedProperty (pattern, index - 1, pLength);
1525:         if (np == null)
1526:           throw new REException ("invalid escape sequence",
1527:                      REException.REG_ESCAPE, index);
1528:         index = index - 1 + np.len - 1;
1529:           }
1530:         else
1531:           {
1532:         CharExpression ce =
1533:           getCharExpression (pattern, index - 1, pLength, syntax);
1534:         if (ce == null)
1535:           throw new REException ("invalid escape sequence",
1536:                      REException.REG_ESCAPE, index);
1537:         asciiEsc = ce.ch;
1538:         asciiEscIsSet = true;
1539:         index = index - 1 + ce.len - 1;
1540:           }
1541:         if (lastCharIsSet)
1542:           {
1543:         RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1544:         if (insensUSASCII)
1545:           t.unicodeAware = false;
1546:         options.add (t);
1547:           }
1548: 
1549:         if (posixID != -1)
1550:           {
1551:         RETokenPOSIX t =
1552:           new RETokenPOSIX (subIndex, posixID, insens, negate);
1553:         if (insensUSASCII)
1554:           t.unicodeAware = false;
1555:         options.add (t);
1556:           }
1557:         else if (np != null)
1558:           {
1559:         RETokenNamedProperty t =
1560:           getRETokenNamedProperty (subIndex, np, insens, index);
1561:         if (insensUSASCII)
1562:           t.unicodeAware = false;
1563:         options.add (t);
1564:           }
1565:         else if (asciiEscIsSet)
1566:           {
1567:         lastChar = asciiEsc;
1568:         lastCharIsSet = true;
1569:           }
1570:         else
1571:           {
1572:         lastChar = pattern[index];
1573:         lastCharIsSet = true;
1574:           }
1575:         ++index;
1576:       }
1577:     else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES))
1578:          && (index < pLength) && (pattern[index] == ':'))
1579:       {
1580:         CPStringBuilder posixSet = new CPStringBuilder ();
1581:         index = getPosixSet (pattern, index + 1, posixSet);
1582:         int posixId = RETokenPOSIX.intValue (posixSet.toString ());
1583:         if (posixId != -1)
1584:           {
1585:         RETokenPOSIX t =
1586:           new RETokenPOSIX (subIndex, posixId, insens, false);
1587:         if (insensUSASCII)
1588:           t.unicodeAware = false;
1589:         options.add (t);
1590:           }
1591:       }
1592:     else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS)))
1593:       {
1594:         ParseCharClassResult result =
1595:           parseCharClass (subIndex, pattern, index, pLength, cflags,
1596:                   syntax, 0);
1597:         addition.add (result.token);
1598:         addition.add ("|");
1599:         index = result.index;
1600:       }
1601:     else if ((ch == '&') &&
1602:          (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) &&
1603:          (index < pLength) && (pattern[index] == '&'))
1604:       {
1605:         if (returnAtAndOperator)
1606:           {
1607:         ParseCharClassResult result = new ParseCharClassResult ();
1608:         options.trimToSize ();
1609:         if (additionAndAppeared)
1610:           addition.add ("&");
1611:         if (addition.size () == 0)
1612:           addition = null;
1613:         result.token = new RETokenOneOf (subIndex,
1614:                          options, addition, negative);
1615:         result.index = index - 1;
1616:         result.returnAtAndOperator = true;
1617:         return result;
1618:           }
1619:         // The precedence of the operator "&&" is the lowest.
1620:         // So we postpone adding "&" until other elements
1621:         // are added. And we insert Boolean.FALSE at the
1622:         // beginning of the list of tokens following "&&".
1623:         // So, "&&[a-b][k-m]" will be stored in the list
1624:         // addition in this order:
1625:         //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1626:         if (additionAndAppeared)
1627:           addition.add ("&");
1628:         addition.add (Boolean.FALSE);
1629:         additionAndAppeared = true;
1630: 
1631:         // The part on which "&&" operates may be either
1632:         //   (1) explicitly enclosed by []
1633:         //   or
1634:         //   (2) not enclosed by [] and terminated by the
1635:         //       next "&&" or the end of the character list.
1636:         //  Let the preceding else if block do the case (1).
1637:         //  We must do something in case of (2).
1638:         if ((index + 1 < pLength) && (pattern[index + 1] != '['))
1639:           {
1640:         ParseCharClassResult result =
1641:           parseCharClass (subIndex, pattern, index + 1, pLength,
1642:                   cflags, syntax,
1643:                   RETURN_AT_AND);
1644:         addition.add (result.token);
1645:         addition.add ("|");
1646:         // If the method returned at the next "&&", it is OK.
1647:         // Otherwise we have eaten the mark of the end of this
1648:         // character list "]".  In this case we must give back
1649:         // the end mark.
1650:         index = (result.returnAtAndOperator ?
1651:              result.index : result.index - 1);
1652:           }
1653:       }
1654:     else
1655:       {
1656:         if (lastCharIsSet)
1657:           {
1658:         RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1659:         if (insensUSASCII)
1660:           t.unicodeAware = false;
1661:         options.add (t);
1662:           }
1663:         lastChar = ch;
1664:         lastCharIsSet = true;
1665:       }
1666:     if (index == pLength)
1667:       throw new REException (getLocalizedMessage ("class.no.end"),
1668:                  REException.REG_EBRACK, index);
1669:       }                // while in list
1670:     // Out of list, index is one past ']'
1671: 
1672:     if (lastCharIsSet)
1673:       {
1674:     RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1675:     if (insensUSASCII)
1676:       t.unicodeAware = false;
1677:     options.add (t);
1678:       }
1679: 
1680:     ParseCharClassResult result = new ParseCharClassResult ();
1681:     // Create a new RETokenOneOf
1682:     options.trimToSize ();
1683:     if (additionAndAppeared)
1684:       addition.add ("&");
1685:     if (addition.size () == 0)
1686:       addition = null;
1687:     result.token = new RETokenOneOf (subIndex, options, addition, negative);
1688:     result.index = index;
1689:     return result;
1690:   }
1691: 
1692:   private static int getCharUnit (char[]input, int index, CharUnit unit,
1693:                   boolean quot) throws REException
1694:   {
1695:     unit.ch = input[index++];
1696:     unit.bk = (unit.ch == '\\'
1697:            && (!quot || index >= input.length || input[index] == 'E'));
1698:     if (unit.bk)
1699:       if (index < input.length)
1700:     unit.ch = input[index++];
1701:       else
1702:     throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703:                    REException.REG_ESCAPE, index);
1704:     return index;
1705:   }
1706: 
1707:   private static int parseInt (char[]input, int pos, int len, int radix)
1708:   {
1709:     int ret = 0;
1710:     for (int i = pos; i < pos + len; i++)
1711:       {
1712:     ret = ret * radix + Character.digit (input[i], radix);
1713:       }
1714:     return ret;
1715:   }
1716: 
1717:   /**
1718:    * This class represents various expressions for a character.
1719:    * "a"      : 'a' itself.
1720:    * "\0123"  : Octal char 0123
1721:    * "\x1b"   : Hex char 0x1b
1722:    * "\u1234" : Unicode char \u1234
1723:    */
1724:   private static class CharExpression
1725:   {
1726:     /** character represented by this expression */
1727:     char ch;
1728:     /** String expression */
1729:     String expr;
1730:     /** length of this expression */
1731:     int len;
1732:     public String toString ()
1733:     {
1734:       return expr;
1735:     }
1736:   }
1737: 
1738:   private static CharExpression getCharExpression (char[]input, int pos,
1739:                            int lim, RESyntax syntax)
1740:   {
1741:     CharExpression ce = new CharExpression ();
1742:     char c = input[pos];
1743:     if (c == '\\')
1744:       {
1745:     if (pos + 1 >= lim)
1746:       return null;
1747:     c = input[pos + 1];
1748:     switch (c)
1749:       {
1750:       case 't':
1751:         ce.ch = '\t';
1752:         ce.len = 2;
1753:         break;
1754:       case 'n':
1755:         ce.ch = '\n';
1756:         ce.len = 2;
1757:         break;
1758:       case 'r':
1759:         ce.ch = '\r';
1760:         ce.len = 2;
1761:         break;
1762:       case 'x':
1763:       case 'u':
1764:         if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) ||
1765:         (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR)))
1766:           {
1767:         int l = 0;
1768:         int expectedLength = (c == 'x' ? 2 : 4);
1769:         for (int i = pos + 2; i < pos + 2 + expectedLength; i++)
1770:           {
1771:             if (i >= lim)
1772:               break;
1773:             if (!((input[i] >= '0' && input[i] <= '9') ||
1774:               (input[i] >= 'A' && input[i] <= 'F') ||
1775:               (input[i] >= 'a' && input[i] <= 'f')))
1776:               break;
1777:             l++;
1778:           }
1779:         if (l != expectedLength)
1780:           return null;
1781:         ce.ch = (char) (parseInt (input, pos + 2, l, 16));
1782:         ce.len = l + 2;
1783:           }
1784:         else
1785:           {
1786:         ce.ch = c;
1787:         ce.len = 2;
1788:           }
1789:         break;
1790:       case '0':
1791:         if (syntax.get (RESyntax.RE_OCTAL_CHAR))
1792:           {
1793:         int l = 0;
1794:         for (int i = pos + 2; i < pos + 2 + 3; i++)
1795:           {
1796:             if (i >= lim)
1797:               break;
1798:             if (input[i] < '0' || input[i] > '7')
1799:               break;
1800:             l++;
1801:           }
1802:         if (l == 3 && input[pos + 2] > '3')
1803:           l--;
1804:         if (l <= 0)
1805:           return null;
1806:         ce.ch = (char) (parseInt (input, pos + 2, l, 8));
1807:         ce.len = l + 2;
1808:           }
1809:         else
1810:           {
1811:         ce.ch = c;
1812:         ce.len = 2;
1813:           }
1814:         break;
1815:       default:
1816:         ce.ch = c;
1817:         ce.len = 2;
1818:         break;
1819:       }
1820:       }
1821:     else
1822:       {
1823:     ce.ch = input[pos];
1824:     ce.len = 1;
1825:       }
1826:     ce.expr = new String (input, pos, ce.len);
1827:     return ce;
1828:   }
1829: 
1830:   /**
1831:    * This class represents a substring in a pattern string expressing
1832:    * a named property.
1833:    * "\pA"      : Property named "A"
1834:    * "\p{prop}" : Property named "prop"
1835:    * "\PA"      : Property named "A" (Negated)
1836:    * "\P{prop}" : Property named "prop" (Negated)
1837:    */
1838:   private static class NamedProperty
1839:   {
1840:     /** Property name */
1841:     String name;
1842:     /** Negated or not */
1843:     boolean negate;
1844:     /** length of this expression */
1845:     int len;
1846:   }
1847: 
1848:   private static NamedProperty getNamedProperty (char[]input, int pos,
1849:                          int lim)
1850:   {
1851:     NamedProperty np = new NamedProperty ();
1852:     char c = input[pos];
1853:     if (c == '\\')
1854:       {
1855:     if (++pos >= lim)
1856:       return null;
1857:     c = input[pos++];
1858:     switch (c)
1859:       {
1860:       case 'p':
1861:         np.negate = false;
1862:         break;
1863:       case 'P':
1864:         np.negate = true;
1865:         break;
1866:       default:
1867:         return null;
1868:       }
1869:     c = input[pos++];
1870:     if (c == '{')
1871:       {
1872:         int p = -1;
1873:         for (int i = pos; i < lim; i++)
1874:           {
1875:         if (input[i] == '}')
1876:           {
1877:             p = i;
1878:             break;
1879:           }
1880:           }
1881:         if (p < 0)
1882:           return null;
1883:         int len = p - pos;
1884:         np.name = new String (input, pos, len);
1885:         np.len = len + 4;
1886:       }
1887:     else
1888:       {
1889:         np.name = new String (input, pos - 1, 1);
1890:         np.len = 3;
1891:       }
1892:     return np;
1893:       }
1894:     else
1895:       return null;
1896:   }
1897: 
1898:   private static RETokenNamedProperty getRETokenNamedProperty (int subIndex,
1899:                                    NamedProperty
1900:                                    np,
1901:                                    boolean insens,
1902:                                    int index)
1903:     throws REException
1904:   {
1905:     try
1906:     {
1907:       return new RETokenNamedProperty (subIndex, np.name, insens, np.negate);
1908:     }
1909:     catch (REException e)
1910:     {
1911:       REException ree;
1912:       ree = new REException (e.getMessage (), REException.REG_ESCAPE, index);
1913:       ree.initCause (e);
1914:       throw ree;
1915:     }
1916:   }
1917: 
1918:   /**
1919:    * Checks if the regular expression matches the input in its entirety.
1920:    *
1921:    * @param input The input text.
1922:    */
1923:   public boolean isMatch (Object input)
1924:   {
1925:     return isMatch (input, 0, 0);
1926:   }
1927: 
1928:   /**
1929:    * Checks if the input string, starting from index, is an exact match of
1930:    * this regular expression.
1931:    *
1932:    * @param input The input text.
1933:    * @param index The offset index at which the search should be begin.
1934:    */
1935:   public boolean isMatch (Object input, int index)
1936:   {
1937:     return isMatch (input, index, 0);
1938:   }
1939: 
1940: 
1941:   /**
1942:    * Checks if the input, starting from index and using the specified
1943:    * execution flags, is an exact match of this regular expression.
1944:    *
1945:    * @param input The input text.
1946:    * @param index The offset index at which the search should be begin.
1947:    * @param eflags The logical OR of any execution flags above.
1948:    */
1949:   public boolean isMatch (Object input, int index, int eflags)
1950:   {
1951:     return isMatchImpl (makeCharIndexed (input, index), index, eflags);
1952:   }
1953: 
1954:   private boolean isMatchImpl (CharIndexed input, int index, int eflags)
1955:   {
1956:     if (firstToken == null)    // Trivial case
1957:       return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS);
1958:     REMatch m = new REMatch (numSubs, index, eflags);
1959:     if (firstToken.match (input, m))
1960:       {
1961:     if (m != null)
1962:       {
1963:         if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS)
1964:           {
1965:         return true;
1966:           }
1967:       }
1968:       }
1969:     return false;
1970:   }
1971: 
1972:   /**
1973:    * Returns the maximum number of subexpressions in this regular expression.
1974:    * If the expression contains branches, the value returned will be the
1975:    * maximum subexpressions in any of the branches.
1976:    */
1977:   public int getNumSubs ()
1978:   {
1979:     return numSubs;
1980:   }
1981: 
1982:   // Overrides REToken.setUncle
1983:   void setUncle (REToken uncle)
1984:   {
1985:     if (lastToken != null)
1986:       {
1987:     lastToken.setUncle (uncle);
1988:       }
1989:     else
1990:       super.setUncle (uncle);    // to deal with empty subexpressions
1991:   }
1992: 
1993:   // Overrides REToken.chain
1994: 
1995:   boolean chain (REToken next)
1996:   {
1997:     super.chain (next);
1998:     setUncle (next);
1999:     return true;
2000:   }
2001: 
2002:   /**
2003:    * Returns the minimum number of characters that could possibly
2004:    * constitute a match of this regular expression.
2005:    */
2006:   public int getMinimumLength ()
2007:   {
2008:     return minimumLength;
2009:   }
2010: 
2011:   public int getMaximumLength ()
2012:   {
2013:     return maximumLength;
2014:   }
2015: 
2016:   /**
2017:    * Returns an array of all matches found in the input.
2018:    *
2019:    * If the regular expression allows the empty string to match, it will
2020:    * substitute matches at all positions except the end of the input.
2021:    *
2022:    * @param input The input text.
2023:    * @return a non-null (but possibly zero-length) array of matches
2024:    */
2025:   public REMatch[] getAllMatches (Object input)
2026:   {
2027:     return getAllMatches (input, 0, 0);
2028:   }
2029: 
2030:   /**
2031:    * Returns an array of all matches found in the input,
2032:    * beginning at the specified index position.
2033:    *
2034:    * If the regular expression allows the empty string to match, it will
2035:    * substitute matches at all positions except the end of the input.
2036:    *
2037:    * @param input The input text.
2038:    * @param index The offset index at which the search should be begin.
2039:    * @return a non-null (but possibly zero-length) array of matches
2040:    */
2041:   public REMatch[] getAllMatches (Object input, int index)
2042:   {
2043:     return getAllMatches (input, index, 0);
2044:   }
2045: 
2046:   /**
2047:    * Returns an array of all matches found in the input string,
2048:    * beginning at the specified index position and using the specified
2049:    * execution flags.
2050:    *
2051:    * If the regular expression allows the empty string to match, it will
2052:    * substitute matches at all positions except the end of the input.
2053:    *
2054:    * @param input The input text.
2055:    * @param index The offset index at which the search should be begin.
2056:    * @param eflags The logical OR of any execution flags above.
2057:    * @return a non-null (but possibly zero-length) array of matches
2058:    */
2059:   public REMatch[] getAllMatches (Object input, int index, int eflags)
2060:   {
2061:     return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags);
2062:   }
2063: 
2064:   // this has been changed since 1.03 to be non-overlapping matches
2065:   private REMatch[] getAllMatchesImpl (CharIndexed input, int index,
2066:                        int eflags)
2067:   {
2068:     List < REMatch > all = new ArrayList < REMatch > ();
2069:     REMatch m = null;
2070:     while ((m = getMatchImpl (input, index, eflags, null)) != null)
2071:       {
2072:     all.add (m);
2073:     index = m.getEndIndex ();
2074:     if (m.end[0] == 0)
2075:       {            // handle pathological case of zero-length match
2076:         index++;
2077:         input.move (1);
2078:       }
2079:     else
2080:       {
2081:         input.move (m.end[0]);
2082:       }
2083:     if (!input.isValid ())
2084:       break;
2085:       }
2086:     return all.toArray (new REMatch[all.size ()]);
2087:   }
2088: 
2089:   /* Implements abstract method REToken.match() */
2090:   boolean match (CharIndexed input, REMatch mymatch)
2091:   {
2092:     input.setHitEnd (mymatch);
2093:     if (firstToken == null)
2094:       {
2095:     return next (input, mymatch);
2096:       }
2097: 
2098:     // Note the start of this subexpression
2099:     mymatch.start1[subIndex] = mymatch.index;
2100: 
2101:     return firstToken.match (input, mymatch);
2102:   }
2103: 
2104:   REMatch findMatch (CharIndexed input, REMatch mymatch)
2105:   {
2106:     if (mymatch.backtrackStack == null)
2107:       mymatch.backtrackStack = new BacktrackStack ();
2108:     boolean b = match (input, mymatch);
2109:     if (b)
2110:       {
2111:     return mymatch;
2112:       }
2113:     return null;
2114:   }
2115: 
2116:   /**
2117:    * Returns the first match found in the input.  If no match is found,
2118:    * null is returned.
2119:    *
2120:    * @param input The input text.
2121:    * @return An REMatch instance referencing the match, or null if none.
2122:    */
2123:   public REMatch getMatch (Object input)
2124:   {
2125:     return getMatch (input, 0, 0);
2126:   }
2127: 
2128:   /**
2129:    * Returns the first match found in the input, beginning
2130:    * the search at the specified index.  If no match is found,
2131:    * returns null.
2132:    *
2133:    * @param input The input text.
2134:    * @param index The offset within the text to begin looking for a match.
2135:    * @return An REMatch instance referencing the match, or null if none.
2136:    */
2137:   public REMatch getMatch (Object input, int index)
2138:   {
2139:     return getMatch (input, index, 0);
2140:   }
2141: 
2142:   /**
2143:    * Returns the first match found in the input, beginning
2144:    * the search at the specified index, and using the specified
2145:    * execution flags.  If no match is found, returns null.
2146:    *
2147:    * @param input The input text.
2148:    * @param index The offset index at which the search should be begin.
2149:    * @param eflags The logical OR of any execution flags above.
2150:    * @return An REMatch instance referencing the match, or null if none.
2151:    */
2152:   public REMatch getMatch (Object input, int index, int eflags)
2153:   {
2154:     return getMatch (input, index, eflags, null);
2155:   }
2156: 
2157:   /**
2158:    * Returns the first match found in the input, beginning the search
2159:    * at the specified index, and using the specified execution flags.
2160:    * If no match is found, returns null.  If a StringBuffer is
2161:    * provided and is non-null, the contents of the input text from the
2162:    * index to the beginning of the match (or to the end of the input,
2163:    * if there is no match) are appended to the StringBuffer.
2164:    *
2165:    * @param input The input text.
2166:    * @param index The offset index at which the search should be begin.
2167:    * @param eflags The logical OR of any execution flags above.
2168:    * @param buffer The StringBuffer to save pre-match text in.
2169:    * @return An REMatch instance referencing the match, or null if none.  */
2170:   public REMatch getMatch (Object input, int index, int eflags,
2171:                CPStringBuilder buffer)
2172:   {
2173:     return getMatchImpl (makeCharIndexed (input, index), index, eflags,
2174:              buffer);
2175:   }
2176: 
2177:   REMatch getMatchImpl (CharIndexed input, int anchor, int eflags,
2178:             CPStringBuilder buffer)
2179:   {
2180:     boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
2181:     boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
2182:     RE re = (tryEntireMatch ? (RE) this.clone () : this);
2183:     if (tryEntireMatch)
2184:       {
2185:     RETokenEnd reEnd = new RETokenEnd (0, null);
2186:     reEnd.setFake (true);
2187:     re.chain (reEnd);
2188:       }
2189:     // Create a new REMatch to hold results
2190:     REMatch mymatch = new REMatch (numSubs, anchor, eflags);
2191:     do
2192:       {
2193:     /* The following potimization is commented out because
2194:        the matching should be tried even if the length of
2195:        input is obviously too short in order that
2196:        java.util.regex.Matcher#hitEnd() may work correctly.
2197:        // Optimization: check if anchor + minimumLength > length
2198:        if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
2199:      */
2200:     if (re.match (input, mymatch))
2201:       {
2202:         REMatch best = mymatch;
2203:         // We assume that the match that coms first is the best.
2204:         // And the following "The longer, the better" rule has
2205:         // been commented out. The longest is not neccesarily
2206:         // the best. For example, "a" out of "aaa" is the best
2207:         // match for /a+?/.
2208:         /*
2209:            // Find best match of them all to observe leftmost longest
2210:            while ((mymatch = mymatch.next) != null) {
2211:            if (mymatch.index > best.index) {
2212:            best = mymatch;
2213:            }
2214:            }
2215:          */
2216:         best.end[0] = best.index;
2217:         best.finish (input);
2218:         input.setLastMatch (best);
2219:         return best;
2220:       }
2221:     /* End of the optimization commented out
2222:        }
2223:      */
2224:     mymatch.clear (++anchor);
2225:     // Append character to buffer if needed
2226:     if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS)
2227:       {
2228:         buffer.append (input.charAt (0));
2229:       }
2230:     // java.util.regex.Matcher#hitEnd() requires that the search should
2231:     // be tried at the end of input, so we use move1(1) instead of move(1) 
2232:       }
2233:     while (doMove && input.move1 (1));
2234: 
2235:     // Special handling at end of input for e.g. "$"
2236:     if (minimumLength == 0)
2237:       {
2238:     if (match (input, mymatch))
2239:       {
2240:         mymatch.finish (input);
2241:         return mymatch;
2242:       }
2243:       }
2244: 
2245:     return null;
2246:   }
2247: 
2248:   /**
2249:    * Returns an REMatchEnumeration that can be used to iterate over the
2250:    * matches found in the input text.
2251:    *
2252:    * @param input The input text.
2253:    * @return A non-null REMatchEnumeration instance.
2254:    */
2255:   public REMatchEnumeration getMatchEnumeration (Object input)
2256:   {
2257:     return getMatchEnumeration (input, 0, 0);
2258:   }
2259: 
2260: 
2261:   /**
2262:    * Returns an REMatchEnumeration that can be used to iterate over the
2263:    * matches found in the input text.
2264:    *
2265:    * @param input The input text.
2266:    * @param index The offset index at which the search should be begin.
2267:    * @return A non-null REMatchEnumeration instance, with its input cursor
2268:    *  set to the index position specified.
2269:    */
2270:   public REMatchEnumeration getMatchEnumeration (Object input, int index)
2271:   {
2272:     return getMatchEnumeration (input, index, 0);
2273:   }
2274: 
2275:   /**
2276:    * Returns an REMatchEnumeration that can be used to iterate over the
2277:    * matches found in the input text.
2278:    *
2279:    * @param input The input text.
2280:    * @param index The offset index at which the search should be begin.
2281:    * @param eflags The logical OR of any execution flags above.
2282:    * @return A non-null REMatchEnumeration instance, with its input cursor
2283:    *  set to the index position specified.
2284:    */
2285:   public REMatchEnumeration getMatchEnumeration (Object input, int index,
2286:                          int eflags)
2287:   {
2288:     return new REMatchEnumeration (this, makeCharIndexed (input, index),
2289:                    index, eflags);
2290:   }
2291: 
2292: 
2293:   /**
2294:    * Substitutes the replacement text for the first match found in the input.
2295:    *
2296:    * @param input The input text.
2297:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2298:    * @return A String interpolating the substituted text.
2299:    * @see REMatch#substituteInto
2300:    */
2301:   public String substitute (Object input, String replace)
2302:   {
2303:     return substitute (input, replace, 0, 0);
2304:   }
2305: 
2306:   /**
2307:    * Substitutes the replacement text for the first match found in the input
2308:    * beginning at the specified index position.  Specifying an index
2309:    * effectively causes the regular expression engine to throw away the
2310:    * specified number of characters. 
2311:    *
2312:    * @param input The input text.
2313:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2314:    * @param index The offset index at which the search should be begin.
2315:    * @return A String containing the substring of the input, starting
2316:    *   at the index position, and interpolating the substituted text.
2317:    * @see REMatch#substituteInto
2318:    */
2319:   public String substitute (Object input, String replace, int index)
2320:   {
2321:     return substitute (input, replace, index, 0);
2322:   }
2323: 
2324:   /**
2325:    * Substitutes the replacement text for the first match found in the input
2326:    * string, beginning at the specified index position and using the
2327:    * specified execution flags.
2328:    *
2329:    * @param input The input text.
2330:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2331:    * @param index The offset index at which the search should be begin.
2332:    * @param eflags The logical OR of any execution flags above.
2333:    * @return A String containing the substring of the input, starting
2334:    *   at the index position, and interpolating the substituted text.
2335:    * @see REMatch#substituteInto
2336:    */
2337:   public String substitute (Object input, String replace, int index,
2338:                 int eflags)
2339:   {
2340:     return substituteImpl (makeCharIndexed (input, index), replace, index,
2341:                eflags);
2342:   }
2343: 
2344:   private String substituteImpl (CharIndexed input, String replace, int index,
2345:                  int eflags)
2346:   {
2347:     CPStringBuilder buffer = new CPStringBuilder ();
2348:     REMatch m = getMatchImpl (input, index, eflags, buffer);
2349:     if (m == null)
2350:       return buffer.toString ();
2351:     buffer.append (getReplacement (replace, m, eflags));
2352:     if (input.move (m.end[0]))
2353:       {
2354:     do
2355:       {
2356:         buffer.append (input.charAt (0));
2357:       }
2358:     while (input.move (1));
2359:       }
2360:     return buffer.toString ();
2361:   }
2362: 
2363:   /**
2364:    * Substitutes the replacement text for each non-overlapping match found 
2365:    * in the input text.
2366:    *
2367:    * @param input The input text.
2368:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2369:    * @return A String interpolating the substituted text.
2370:    * @see REMatch#substituteInto
2371:    */
2372:   public String substituteAll (Object input, String replace)
2373:   {
2374:     return substituteAll (input, replace, 0, 0);
2375:   }
2376: 
2377:   /**
2378:    * Substitutes the replacement text for each non-overlapping match found 
2379:    * in the input text, starting at the specified index.
2380:    *
2381:    * If the regular expression allows the empty string to match, it will
2382:    * substitute matches at all positions except the end of the input.
2383:    *
2384:    * @param input The input text.
2385:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2386:    * @param index The offset index at which the search should be begin.
2387:    * @return A String containing the substring of the input, starting
2388:    *   at the index position, and interpolating the substituted text.
2389:    * @see REMatch#substituteInto
2390:    */
2391:   public String substituteAll (Object input, String replace, int index)
2392:   {
2393:     return substituteAll (input, replace, index, 0);
2394:   }
2395: 
2396:   /**
2397:    * Substitutes the replacement text for each non-overlapping match found 
2398:    * in the input text, starting at the specified index and using the
2399:    * specified execution flags.
2400:    *
2401:    * @param input The input text.
2402:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2403:    * @param index The offset index at which the search should be begin.
2404:    * @param eflags The logical OR of any execution flags above.
2405:    * @return A String containing the substring of the input, starting
2406:    *   at the index position, and interpolating the substituted text.
2407:    * @see REMatch#substituteInto
2408:    */
2409:   public String substituteAll (Object input, String replace, int index,
2410:                    int eflags)
2411:   {
2412:     return substituteAllImpl (makeCharIndexed (input, index), replace, index,
2413:                   eflags);
2414:   }
2415: 
2416:   private String substituteAllImpl (CharIndexed input, String replace,
2417:                     int index, int eflags)
2418:   {
2419:     CPStringBuilder buffer = new CPStringBuilder ();
2420:     REMatch m;
2421:     while ((m = getMatchImpl (input, index, eflags, buffer)) != null)
2422:       {
2423:     buffer.append (getReplacement (replace, m, eflags));
2424:     index = m.getEndIndex ();
2425:     if (m.end[0] == 0)
2426:       {
2427:         char ch = input.charAt (0);
2428:         if (ch != CharIndexed.OUT_OF_BOUNDS)
2429:           buffer.append (ch);
2430:         input.move (1);
2431:       }
2432:     else
2433:       {
2434:         input.move (m.end[0]);
2435:       }
2436: 
2437:     if (!input.isValid ())
2438:       break;
2439:       }
2440:     return buffer.toString ();
2441:   }
2442: 
2443:   public static String getReplacement (String replace, REMatch m, int eflags)
2444:   {
2445:     if ((eflags & REG_NO_INTERPOLATE) > 0)
2446:       return replace;
2447:     else
2448:       {
2449:     if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0)
2450:       {
2451:         CPStringBuilder sb = new CPStringBuilder ();
2452:         int l = replace.length ();
2453:         for (int i = 0; i < l; i++)
2454:           {
2455:         char c = replace.charAt (i);
2456:         switch (c)
2457:           {
2458:           case '\\':
2459:             i++;
2460:             // Let StringIndexOutOfBoundsException be thrown.
2461:             sb.append (replace.charAt (i));
2462:             break;
2463:           case '$':
2464:             int i1 = i + 1;
2465:             while (i1 < replace.length () &&
2466:                Character.isDigit (replace.charAt (i1)))
2467:               i1++;
2468:             sb.append (m.substituteInto (replace.substring (i, i1)));
2469:             i = i1 - 1;
2470:             break;
2471:           default:
2472:             sb.append (c);
2473:           }
2474:           }
2475:         return sb.toString ();
2476:       }
2477:     else
2478:       return m.substituteInto (replace);
2479:       }
2480:   }
2481: 
2482:   /* Helper function for constructor */
2483:   private void addToken (REToken next)
2484:   {
2485:     if (next == null)
2486:       return;
2487:     minimumLength += next.getMinimumLength ();
2488:     int nmax = next.getMaximumLength ();
2489:     if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2490:       maximumLength += nmax;
2491:     else
2492:       maximumLength = Integer.MAX_VALUE;
2493: 
2494:     if (firstToken == null)
2495:       {
2496:     lastToken = firstToken = next;
2497:       }
2498:     else
2499:       {
2500:     // if chain returns false, it "rejected" the token due to
2501:     // an optimization, and next was combined with lastToken
2502:     if (lastToken.chain (next))
2503:       {
2504:         lastToken = next;
2505:       }
2506:       }
2507:   }
2508: 
2509:   private static REToken setRepeated (REToken current, int min, int max,
2510:                       int index) throws REException
2511:   {
2512:     if (current == null)
2513:       throw new REException (getLocalizedMessage ("repeat.no.token"),
2514:                  REException.REG_BADRPT, index);
2515:       return new RETokenRepeated (current.subIndex, current, min, max);
2516:   }
2517: 
2518:   private static int getPosixSet (char[]pattern, int index,
2519:                   CPStringBuilder buf)
2520:   {
2521:     // Precondition: pattern[index-1] == ':'
2522:     // we will return pos of closing ']'.
2523:     int i;
2524:     for (i = index; i < (pattern.length - 1); i++)
2525:       {
2526:     if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
2527:       return i + 2;
2528:     buf.append (pattern[i]);
2529:       }
2530:     return index;        // didn't match up
2531:   }
2532: 
2533:   private int getMinMax (char[]input, int index, IntPair minMax,
2534:              RESyntax syntax) throws REException
2535:   {
2536:     // Precondition: input[index-1] == '{', minMax != null
2537: 
2538:     boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES);
2539:     int startIndex = index;
2540:     if (index == input.length)
2541:       {
2542:     if (mustMatch)
2543:       throw new REException (getLocalizedMessage ("unmatched.brace"),
2544:                  REException.REG_EBRACE, index);
2545:     else
2546:       return startIndex;
2547:       }
2548: 
2549:     int min, max = 0;
2550:     CharUnit unit = new CharUnit ();
2551:     CPStringBuilder buf = new CPStringBuilder ();
2552: 
2553:     // Read string of digits
2554:     do
2555:       {
2556:     index = getCharUnit (input, index, unit, false);
2557:     if (Character.isDigit (unit.ch))
2558:       buf.append (unit.ch);
2559:       }
2560:     while ((index != input.length) && Character.isDigit (unit.ch));
2561: 
2562:     // Check for {} tomfoolery
2563:     if (buf.length () == 0)
2564:       {
2565:     if (mustMatch)
2566:       throw new REException (getLocalizedMessage ("interval.error"),
2567:                  REException.REG_EBRACE, index);
2568:     else
2569:     return startIndex;
2570:       }
2571: 
2572:     min = Integer.parseInt (buf.toString ());
2573: 
2574:     if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2575:       max = min;
2576:     else if (index == input.length)
2577:       if (mustMatch)
2578:     throw new REException (getLocalizedMessage ("interval.no.end"),
2579:                    REException.REG_EBRACE, index);
2580:     else
2581:     return startIndex;
2582:     else
2583:   if ((unit.ch == ',') && !unit.bk)
2584:     {
2585:       buf = new CPStringBuilder ();
2586:       // Read string of digits
2587:       while (((index =
2588:            getCharUnit (input, index, unit, false)) != input.length)
2589:          && Character.isDigit (unit.ch))
2590:     buf.append (unit.ch);
2591: 
2592:       if (!
2593:       ((unit.ch == '}')
2594:        && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2595:     if (mustMatch)
2596:       throw new REException (getLocalizedMessage ("interval.error"),
2597:                  REException.REG_EBRACE, index);
2598:       else
2599:       return startIndex;
2600: 
2601:       // This is the case of {x,}
2602:       if (buf.length () == 0)
2603:     max = Integer.MAX_VALUE;
2604:       else
2605:     max = Integer.parseInt (buf.toString ());
2606:     }
2607:   else if (mustMatch)
2608:     throw new REException (getLocalizedMessage ("interval.error"),
2609:                REException.REG_EBRACE, index);
2610:   else
2611:   return startIndex;
2612: 
2613:   // We know min and max now, and they are valid.
2614: 
2615:   minMax.first = min;
2616:   minMax.second = max;
2617: 
2618:   // return the index following the '}'
2619:   return index;
2620:   }
2621: 
2622:    /**
2623:     * Return a human readable form of the compiled regular expression,
2624:     * useful for debugging.
2625:     */
2626:   public String toString ()
2627:   {
2628:     CPStringBuilder sb = new CPStringBuilder ();
2629:     dump (sb);
2630:     return sb.toString ();
2631:   }
2632: 
2633:   void dump (CPStringBuilder os)
2634:   {
2635:     os.append ("(?#startRE subIndex=" + subIndex + ")");
2636:     if (subIndex == 0)
2637:       os.append ("?:");
2638:     if (firstToken != null)
2639:       firstToken.dumpAll (os);
2640:     if (subIndex == 0)
2641:       os.append (")");
2642:     os.append ("(?#endRE subIndex=" + subIndex + ")");
2643:   }
2644: 
2645:   // Cast input appropriately or throw exception
2646:   // This method was originally a private method, but has been made
2647:   // public because java.util.regex.Matcher uses this.
2648:   public static CharIndexed makeCharIndexed (Object input, int index)
2649:   {
2650:     // The case where input is already a CharIndexed is supposed
2651:     // be the most likely because this is the case with
2652:     // java.util.regex.Matcher.
2653:     // We could let a String or a CharSequence fall through
2654:     // to final input, but since it'a very likely input type, 
2655:     // we check it first.
2656:     if (input instanceof CharIndexed)
2657:       {
2658:     CharIndexed ci = (CharIndexed) input;
2659:     ci.setAnchor (index);
2660:     return ci;
2661:       }
2662:     else if (input instanceof CharSequence)
2663:       return new CharIndexedCharSequence ((CharSequence) input, index);
2664:     else if (input instanceof String)
2665:       return new CharIndexedString ((String) input, index);
2666:     else if (input instanceof char[])
2667:       return new CharIndexedCharArray ((char[]) input, index);
2668:     else if (input instanceof StringBuffer)
2669:       return new CharIndexedStringBuffer ((StringBuffer) input, index);
2670:     else if (input instanceof InputStream)
2671:       return new CharIndexedInputStream ((InputStream) input, index);
2672:     else
2673:       return new CharIndexedString (input.toString (), index);
2674:   }
2675: }