Source for java.util.regex.Matcher

   1: /* Matcher.java -- Instance of a regular expression applied to a char sequence.
   2:    Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package java.util.regex;
  40: 
  41: import gnu.java.lang.CPStringBuilder;
  42: 
  43: import gnu.java.util.regex.CharIndexed;
  44: import gnu.java.util.regex.RE;
  45: import gnu.java.util.regex.REMatch;
  46: 
  47: /**
  48:  * Instance of a regular expression applied to a char sequence.
  49:  *
  50:  * @since 1.4
  51:  */
  52: public final class Matcher implements MatchResult
  53: {
  54:   private Pattern pattern;
  55:   private CharSequence input;
  56:   // We use CharIndexed as an input object to the getMatch method in order
  57:   // that /\G/ (the end of the previous match) may work.  The information
  58:   // of the previous match is stored in the CharIndexed object.
  59:   private CharIndexed inputCharIndexed;
  60:   private int position;
  61:   private int appendPosition;
  62:   private REMatch match;
  63: 
  64:   /**
  65:    * The start of the region of the input on which to match.
  66:    */
  67:   private int regionStart;
  68: 
  69:   /**
  70:    * The end of the region of the input on which to match.
  71:    */
  72:   private int regionEnd;
  73:   
  74:   /**
  75:    * True if the match process should look beyond the 
  76:    * region marked by regionStart to regionEnd when
  77:    * performing lookAhead, lookBehind and boundary
  78:    * matching.
  79:    */
  80:   private boolean transparentBounds;
  81: 
  82:   /**
  83:    * The flags that affect the anchoring bounds.
  84:    * If {@link #hasAnchoringBounds()} is {@code true},
  85:    * the match process will honour the
  86:    * anchoring bounds: ^, \A, \Z, \z and $.  If
  87:    * {@link #hasAnchoringBounds()} is {@code false},
  88:    * the anchors are ignored and appropriate flags,
  89:    * stored in this variable, are used to provide this
  90:    * behaviour.
  91:    */
  92:   private int anchoringBounds;
  93: 
  /**
   * Creates a matcher that applies {@code pattern} to {@code input}.
   * The region initially covers the whole input, the bounds are
   * opaque and the anchoring bounds are in use.
   *
   * @param pattern the pattern to apply.
   * @param input the character sequence to match against.
   */
  Matcher(Pattern pattern, CharSequence input)
  {
    this.pattern = pattern;
    this.input = input;
    // Wrap the input once; the wrapper is reused by the match methods.
    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
    // Default region: the whole input.
    this.regionStart = 0;
    this.regionEnd = input.length();
    // Defaults: opaque bounds, anchors honoured.
    this.transparentBounds = false;
    this.anchoringBounds = 0;
  }
 104:   
 105:   /**
 106:    * @param sb The target string buffer
 107:    * @param replacement The replacement string
 108:    *
 109:    * @exception IllegalStateException If no match has yet been attempted,
 110:    * or if the previous match operation failed
 111:    * @exception IndexOutOfBoundsException If the replacement string refers
 112:    * to a capturing group that does not exist in the pattern
 113:    */
 114:   public Matcher appendReplacement (StringBuffer sb, String replacement)
 115:     throws IllegalStateException
 116:   {
 117:     assertMatchOp();
 118:     sb.append(input.subSequence(appendPosition,
 119:                 match.getStartIndex()).toString());
 120:     sb.append(RE.getReplacement(replacement, match,
 121:     RE.REG_REPLACE_USE_BACKSLASHESCAPE));
 122:     appendPosition = match.getEndIndex();
 123:     return this;
 124:   }
 125: 
 126:   /**
 127:    * @param sb The target string buffer
 128:    */
 129:   public StringBuffer appendTail (StringBuffer sb)
 130:   {
 131:     sb.append(input.subSequence(appendPosition, input.length()).toString());
 132:     return sb;
 133:   }
 134:  
 135:   /**
 136:    * @exception IllegalStateException If no match has yet been attempted,
 137:    * or if the previous match operation failed
 138:    */
 139:   public int end ()
 140:     throws IllegalStateException
 141:   {
 142:     assertMatchOp();
 143:     return match.getEndIndex();
 144:   }
 145:   
 146:   /**
 147:    * @param group The index of a capturing group in this matcher's pattern
 148:    *
 149:    * @exception IllegalStateException If no match has yet been attempted,
 150:    * or if the previous match operation failed
 151:    * @exception IndexOutOfBoundsException If the replacement string refers
 152:    * to a capturing group that does not exist in the pattern
 153:    */
 154:   public int end (int group)
 155:     throws IllegalStateException
 156:   {
 157:     assertMatchOp();
 158:     return match.getEndIndex(group);
 159:   }
 160:  
 161:   public boolean find ()
 162:   {
 163:     boolean first = (match == null);
 164:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 165:       match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
 166:     else
 167:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
 168:                        position, anchoringBounds);
 169:     if (match != null)
 170:       {
 171:     int endIndex = match.getEndIndex();
 172:     // Are we stuck at the same position?
 173:     if (!first && endIndex == position)
 174:       {        
 175:         match = null;
 176:         // Not at the end of the input yet?
 177:         if (position < input.length() - 1)
 178:           {
 179:         position++;
 180:         return find(position);
 181:           }
 182:         else
 183:           return false;
 184:       }
 185:     position = endIndex;
 186:     return true;
 187:       }
 188:     return false;
 189:   } 
 190: 
 191:   /**
 192:    * @param start The index to start the new pattern matching
 193:    *
 194:    * @exception IndexOutOfBoundsException If the replacement string refers
 195:    * to a capturing group that does not exist in the pattern
 196:    */
 197:   public boolean find (int start)
 198:   {
 199:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 200:       match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
 201:     else
 202:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
 203:                        start, anchoringBounds);
 204:     if (match != null)
 205:       {
 206:     position = match.getEndIndex();
 207:     return true;
 208:       }
 209:     return false;
 210:   }
 211:  
 212:   /**
 213:    * @exception IllegalStateException If no match has yet been attempted,
 214:    * or if the previous match operation failed
 215:    */
 216:   public String group ()
 217:   {
 218:     assertMatchOp();
 219:     return match.toString();
 220:   }
 221:   
 222:   /**
 223:    * @param group The index of a capturing group in this matcher's pattern
 224:    *
 225:    * @exception IllegalStateException If no match has yet been attempted,
 226:    * or if the previous match operation failed
 227:    * @exception IndexOutOfBoundsException If the replacement string refers
 228:    * to a capturing group that does not exist in the pattern
 229:    */
 230:   public String group (int group)
 231:     throws IllegalStateException
 232:   {
 233:     assertMatchOp();
 234:     return match.toString(group);
 235:   }
 236: 
 237:   /**
 238:    * @param replacement The replacement string
 239:    */
 240:   public String replaceFirst (String replacement)
 241:   {
 242:     reset();
 243:     // Semantics might not quite match
 244:     return pattern.getRE().substitute(input, replacement, position,
 245:     RE.REG_REPLACE_USE_BACKSLASHESCAPE);
 246:   }
 247: 
 248:   /**
 249:    * @param replacement The replacement string
 250:    */
 251:   public String replaceAll (String replacement)
 252:   {
 253:     reset();
 254:     return pattern.getRE().substituteAll(input, replacement, position,
 255:     RE.REG_REPLACE_USE_BACKSLASHESCAPE);
 256:   }
 257:   
 258:   public int groupCount ()
 259:   {
 260:     return pattern.getRE().getNumSubs();
 261:   }
 262:  
 263:   public boolean lookingAt ()
 264:   {
 265:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 266:       match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
 267:                        anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
 268:     else
 269:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
 270:                        anchoringBounds|RE.REG_FIX_STARTING_POSITION);
 271:     if (match != null)
 272:       {
 273:     if (match.getStartIndex() == 0)
 274:       {
 275:         position = match.getEndIndex();
 276:         return true;
 277:       }
 278:     match = null;
 279:       }
 280:     return false;
 281:   }
 282:   
 283:   /**
 284:    * Attempts to match the entire input sequence against the pattern. 
 285:    *
 286:    * If the match succeeds then more information can be obtained via the
 287:    * start, end, and group methods.
 288:    *
 289:    * @see #start()
 290:    * @see #end()
 291:    * @see #group()
 292:    */
 293:   public boolean matches ()
 294:   {
 295:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 296:       match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
 297:                        anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
 298:     else
 299:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
 300:                        anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
 301:     if (match != null)
 302:       {
 303:     if (match.getStartIndex() == 0)
 304:       {
 305:         position = match.getEndIndex();
 306:         if (position == input.length())
 307:             return true;
 308:       }
 309:     match = null;
 310:       }
 311:     return false;
 312:   }
 313:   
 314:   /**
 315:    * Returns the Pattern that is interpreted by this Matcher
 316:    */
 317:   public Pattern pattern ()
 318:   {
 319:     return pattern;
 320:   }
 321:   
 322:   /**
 323:    * Resets the internal state of the matcher, including
 324:    * resetting the region to its default state of encompassing
 325:    * the whole input.  The state of {@link #hasTransparentBounds()}
 326:    * and {@link #hasAnchoringBounds()} are unaffected.
 327:    *
 328:    * @return a reference to this matcher.
 329:    * @see #regionStart()
 330:    * @see #regionEnd()
 331:    * @see #hasTransparentBounds()
 332:    * @see #hasAnchoringBounds()
 333:    */
 334:   public Matcher reset ()
 335:   {
 336:     position = 0;
 337:     match = null;
 338:     regionStart = 0;
 339:     regionEnd = input.length();
 340:     appendPosition = 0;
 341:     return this;
 342:   }
 343:   
 344:   /**
 345:    * Resets the internal state of the matcher, including
 346:    * resetting the region to its default state of encompassing
 347:    * the whole input.  The state of {@link #hasTransparentBounds()}
 348:    * and {@link #hasAnchoringBounds()} are unaffected.
 349:    *
 350:    * @param input The new input character sequence.
 351:    * @return a reference to this matcher.
 352:    * @see #regionStart()
 353:    * @see #regionEnd()
 354:    * @see #hasTransparentBounds()
 355:    * @see #hasAnchoringBounds()
 356:    */
 357:   public Matcher reset (CharSequence input)
 358:   {
 359:     this.input = input;
 360:     this.inputCharIndexed = RE.makeCharIndexed(input, 0);
 361:     return reset();
 362:   }
 363:   
 364:   /**
 365:    * @return the index of a capturing group in this matcher's pattern
 366:    *
 367:    * @exception IllegalStateException If no match has yet been attempted,
 368:    * or if the previous match operation failed
 369:    */
 370:   public int start ()
 371:     throws IllegalStateException
 372:   {
 373:     assertMatchOp();
 374:     return match.getStartIndex();
 375:   }
 376: 
 377:   /**
 378:    * @param group The index of a capturing group in this matcher's pattern
 379:    *
 380:    * @exception IllegalStateException If no match has yet been attempted,
 381:    * or if the previous match operation failed
 382:    * @exception IndexOutOfBoundsException If the replacement string refers
 383:    * to a capturing group that does not exist in the pattern
 384:    */
 385:   public int start (int group)
 386:     throws IllegalStateException
 387:   {
 388:     assertMatchOp();
 389:     return match.getStartIndex(group);
 390:   }
 391: 
 392:   /**
 393:    * @return True if and only if the matcher hit the end of input.
 394:    * @since 1.5
 395:    */
 396:   public boolean hitEnd()
 397:   {
 398:     return inputCharIndexed.hitEnd();
 399:   }
 400: 
 401:   /**
 402:    * @return A string expression of this matcher.
 403:    */
 404:   public String toString()
 405:   {
 406:     CPStringBuilder sb = new CPStringBuilder();
 407:     sb.append(this.getClass().getName())
 408:       .append("[pattern=").append(pattern.pattern())
 409:       .append(" region=").append(regionStart).append(",").append(regionEnd)
 410:       .append(" anchoringBounds=").append(anchoringBounds == 0)
 411:       .append(" transparentBounds=").append(transparentBounds)
 412:       .append(" lastmatch=").append(match == null ? "" : match.toString())
 413:       .append("]");
 414:     return sb.toString();
 415:   }
 416: 
 417:   private void assertMatchOp()
 418:   {
 419:     if (match == null) throw new IllegalStateException();
 420:   }
 421: 
 422:   /**
 423:    * <p>
 424:    * Defines the region of the input on which to match.
 425:    * By default, the {@link Matcher} attempts to match
 426:    * the whole string (from 0 to the length of the input),
 427:    * but a region between {@code start} (inclusive) and
 428:    * {@code end} (exclusive) on which to match may instead
 429:    * be defined using this method.
 430:    * </p>
 431:    * <p>
 432:    * The behaviour of region matching is further affected
 433:    * by the use of transparent or opaque bounds (see
 434:    * {@link #useTransparentBounds(boolean)}) and whether or not
 435:    * anchors ({@code ^} and {@code $}) are in use
 436:    * (see {@link #useAnchoringBounds(boolean)}).  With transparent
 437:    * bounds, the matcher is aware of input outside the bounds
 438:    * set by this method, whereas, with opaque bounds (the default)
 439:    * only the input within the bounds is used.  The use of
 440:    * anchors are affected by this setting; with transparent
 441:    * bounds, anchors will match the beginning of the real input,
 442:    * while with opaque bounds they match the beginning of the
 443:    * region.  {@link #useAnchoringBounds(boolean)} can be used
 444:    * to turn on or off the matching of anchors.
 445:    * </p>
 446:    *
 447:    * @param start the start of the region (inclusive).
 448:    * @param end the end of the region (exclusive).
 449:    * @return a reference to this matcher.
 450:    * @throws IndexOutOfBoundsException if either {@code start} or
 451:    *                                   {@code end} are less than zero,
 452:    *                                   if either {@code start} or
 453:    *                                   {@code end} are greater than the
 454:    *                                   length of the input, or if
 455:    *                                   {@code start} is greater than
 456:    *                                   {@code end}.
 457:    * @see #regionStart()
 458:    * @see #regionEnd()
 459:    * @see #hasTransparentBounds()
 460:    * @see #useTransparentBounds(boolean)
 461:    * @see #hasAnchoringBounds()
 462:    * @see #useAnchoringBounds(boolean)
 463:    * @since 1.5
 464:    */
 465:   public Matcher region(int start, int end)
 466:   {
 467:     int length = input.length();
 468:     if (start < 0)
 469:       throw new IndexOutOfBoundsException("The start position was less than zero.");
 470:     if (start >= length)
 471:       throw new IndexOutOfBoundsException("The start position is after the end of the input.");
 472:     if (end < 0)
 473:       throw new IndexOutOfBoundsException("The end position was less than zero.");
 474:     if (end > length)
 475:       throw new IndexOutOfBoundsException("The end position is after the end of the input.");
 476:     if (start > end)
 477:       throw new IndexOutOfBoundsException("The start position is after the end position.");
 478:     reset();
 479:     regionStart = start;
 480:     regionEnd = end;
 481:     return this;
 482:   }
 483: 
 484:   /**
 485:    * The start of the region on which to perform matches (inclusive).
 486:    *
 487:    * @return the start index of the region.
 488:    * @see #region(int,int)
 489:    * #see #regionEnd()
 490:    * @since 1.5
 491:    */
 492:   public int regionStart()
 493:   {
 494:     return regionStart;
 495:   }
 496:   
 497:   /**
 498:    * The end of the region on which to perform matches (exclusive).
 499:    *
 500:    * @return the end index of the region.
 501:    * @see #region(int,int)
 502:    * @see #regionStart()
 503:    * @since 1.5
 504:    */
 505:   public int regionEnd()
 506:   {
 507:     return regionEnd;
 508:   }
 509: 
 510:   /**
 511:    * Returns true if the bounds of the region marked by
 512:    * {@link #regionStart()} and {@link #regionEnd()} are
 513:    * transparent.  When these bounds are transparent, the
 514:    * matching process can look beyond them to perform
 515:    * lookahead, lookbehind and boundary matching operations.
 516:    * By default, the bounds are opaque.
 517:    *
 518:    * @return true if the bounds of the matching region are
 519:    *         transparent.
 520:    * @see #useTransparentBounds(boolean)
 521:    * @see #region(int,int)
 522:    * @see #regionStart()
 523:    * @see #regionEnd()
 524:    * @since 1.5
 525:    */
 526:   public boolean hasTransparentBounds()
 527:   {
 528:     return transparentBounds;
 529:   }
 530: 
 531:   /**
 532:    * Sets the transparency of the bounds of the region
 533:    * marked by {@link #regionStart()} and {@link #regionEnd()}.
 534:    * A value of {@code true} makes the bounds transparent,
 535:    * so the matcher can see beyond them to perform lookahead,
 536:    * lookbehind and boundary matching operations.  A value
 537:    * of {@code false} (the default) makes the bounds opaque,
 538:    * restricting the match to the input region denoted
 539:    * by {@link #regionStart()} and {@link #regionEnd()}.
 540:    *
 541:    * @param transparent true if the bounds should be transparent.
 542:    * @return a reference to this matcher.
 543:    * @see #hasTransparentBounds()
 544:    * @see #region(int,int)
 545:    * @see #regionStart()
 546:    * @see #regionEnd()
 547:    * @since 1.5
 548:    */
 549:   public Matcher useTransparentBounds(boolean transparent)
 550:   {
 551:     transparentBounds = transparent;
 552:     return this;
 553:   }
 554: 
 555:   /**
 556:    * Returns true if the matcher will honour the use of
 557:    * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
 558:    * {@code \z} and {@code $}.  By default, the anchors
 559:    * are used.  Note that the effect of the anchors is
 560:    * also affected by {@link #hasTransparentBounds()}.
 561:    *
 562:    * @return true if the matcher will attempt to match
 563:    *         the anchoring bounds.
 564:    * @see #useAnchoringBounds(boolean)
 565:    * @see #hasTransparentBounds()
 566:    * @since 1.5
 567:    */
 568:   public boolean hasAnchoringBounds()
 569:   {
 570:     return anchoringBounds == 0;
 571:   }
 572: 
 573:   /**
 574:    * Enables or disables the use of the anchoring bounds:
 575:    * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
 576:    * {@code $}. By default, their use is enabled.  When
 577:    * disabled, the matcher will not attempt to match
 578:    * the anchors.
 579:    *
 580:    * @param useAnchors true if anchoring bounds should be used.
 581:    * @return a reference to this matcher.
 582:    * @since 1.5
 583:    * @see #hasAnchoringBounds()
 584:    */
 585:   public Matcher useAnchoringBounds(boolean useAnchors)
 586:   {
 587:     if (useAnchors)
 588:       anchoringBounds = 0;
 589:     else
 590:       anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
 591:     return this;
 592:   }
 593: 
 594:   /**
 595:    * Returns a read-only snapshot of the current state of
 596:    * the {@link Matcher} as a {@link MatchResult}.  Any
 597:    * subsequent changes to this instance are not reflected
 598:    * in the returned {@link MatchResult}.
 599:    *
 600:    * @return a {@link MatchResult} instance representing the
 601:    *         current state of the {@link Matcher}.
 602:    */
 603:   public MatchResult toMatchResult()
 604:   {
 605:     Matcher snapshot = new Matcher(pattern, input);
 606:     snapshot.match = (REMatch) match.clone();
 607:     return snapshot;
 608:   }
 609: 
 610: }