// GNU Classpath (0.18) -- source listing (page navigation: Frames | No Frames)
1: /* Parser.java -- HTML parser 2: Copyright (C) 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package javax.swing.text.html.parser; 40: 41: import java.io.IOException; 42: import java.io.Reader; 43: 44: import javax.swing.text.ChangedCharSetException; 45: import javax.swing.text.SimpleAttributeSet; 46: 47: /* 48: * FOR DEVELOPERS: To avoid regression, please run the package test 49: * textsuite/javax.swing.text.html.parser/AllParserTests after your 50: * modifications. 51: */ 52: 53: /** 54: * <p>A simple error-tolerant HTML parser that uses a DTD document 55: * to access data on the possible tokens, arguments and syntax.</p> 56: * <p> The parser reads an HTML content from a Reader and calls various 57: * notifying methods (which should be overridden in a subclass) 58: * when tags or data are encountered.</p> 59: * <p>Some HTML elements need no opening or closing tags. The 60: * task of this parser is to invoke the tag handling methods also when 61: * the tags are not explicitly specified and must be supposed using 62: * information, stored in the DTD. 63: * For example, parsing the document 64: * <p><table><tr><td>a<td>b<td>c</tr> <br> 65: * will invoke exactly the handling methods exactly in the same order 66: * (and with the same parameters) as if parsing the document: <br> 67: * <em><html><head></head><body><table>< 68: * tbody></em><tr><td>a<em></td></em><td>b<em> 69: * </td></em><td>c<em></td></tr></em>< 70: * <em>/tbody></table></body></html></em></p> 71: * (supposed tags are given in italics). The parser also supports 72: * obsolete elements of HTML syntax.<p> 73: * </p> 74: * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) 75: */ 76: public class Parser 77: implements DTDConstants 78: { 79: /** 80: * The document template description that will be used to parse the documents. 81: */ 82: protected DTD dtd; 83: 84: /** 85: * The value of this field determines whether or not the Parser will be 86: * strict in enforcing SGML compatibility. 
The default value is false, 87: * stating that the parser should do everything to parse and get at least 88: * some information even from the incorrectly written HTML input. 89: */ 90: protected boolean strict; 91: 92: /** 93: * The package level reference to the working HTML parser in this 94: * implementation. 95: */ 96: final gnu.javax.swing.text.html.parser.support.Parser gnu; 97: 98: /** 99: * Creates a new parser that uses the given DTD to access data on the 100: * possible tokens, arguments and syntax. There is no single - step way 101: * to get a default DTD; you must either refer to the implementation - 102: * specific packages, write your own DTD or obtain the working instance 103: * of parser in other way, for example, by calling 104: * {@link javax.swing.text.html.HTMLEditorKit#getParser() }. 105: * @param a_dtd A DTD to use. 106: */ 107: public Parser(DTD a_dtd) 108: { 109: dtd = a_dtd; 110: 111: final Parser j = this; 112: 113: gnu = 114: new gnu.javax.swing.text.html.parser.support.Parser(dtd) 115: { 116: protected final void handleComment(char[] comment) 117: { 118: j.handleComment(comment); 119: } 120: 121: protected final void handleEOFInComment() 122: { 123: j.handleEOFInComment(); 124: } 125: 126: protected final void handleEmptyTag(TagElement tag) 127: throws javax.swing.text.ChangedCharSetException 128: { 129: j.handleEmptyTag(tag); 130: } 131: 132: protected final void handleStartTag(TagElement tag) 133: { 134: j.handleStartTag(tag); 135: } 136: 137: protected final void handleEndTag(TagElement tag) 138: { 139: j.handleEndTag(tag); 140: } 141: 142: protected final void handleError(int line, String message) 143: { 144: j.handleError(line, message); 145: } 146: 147: protected final void handleText(char[] text) 148: { 149: j.handleText(text); 150: } 151: 152: protected final void handleTitle(char[] title) 153: { 154: j.handleTitle(title); 155: } 156: 157: protected final void markFirstTime(Element element) 158: { 159: j.markFirstTime(element); 
160: } 161: 162: protected final void startTag(TagElement tag) 163: throws ChangedCharSetException 164: { 165: j.startTag(tag); 166: } 167: 168: protected final void endTag(boolean omitted) 169: { 170: j.endTag(omitted); 171: } 172: 173: protected TagElement makeTag(Element element) 174: { 175: return j.makeTag(element); 176: } 177: 178: protected TagElement makeTag(Element element, boolean isSupposed) 179: { 180: return j.makeTag(element, isSupposed); 181: } 182: }; 183: } 184: 185: /** 186: * Parse the HTML text, calling various methods in response to the 187: * occurence of the corresponding HTML constructions. 188: * @param reader The reader to read the source HTML from. 189: * @throws IOException If the reader throws one. 190: */ 191: public synchronized void parse(Reader reader) 192: throws IOException 193: { 194: gnu.parse(reader); 195: } 196: 197: /** 198: * Parses DTD markup declaration. Currently returns without action. 199: * @return null. 200: * @throws java.io.IOException 201: */ 202: public String parseDTDMarkup() 203: throws IOException 204: { 205: return gnu.parseDTDMarkup(); 206: } 207: 208: /** 209: * Parse DTD document declarations. Currently only parses the document 210: * type declaration markup. 211: * @param strBuff 212: * @return true if this is a valid DTD markup declaration. 213: * @throws IOException 214: */ 215: protected boolean parseMarkupDeclarations(StringBuffer strBuff) 216: throws IOException 217: { 218: return gnu.parseMarkupDeclarations(strBuff); 219: } 220: 221: /** 222: * Get the attributes of the current tag. 223: * @return The attribute set, representing the attributes of the current tag. 224: */ 225: protected SimpleAttributeSet getAttributes() 226: { 227: return gnu.getAttributes(); 228: } 229: 230: /** 231: * Get the number of the document line being parsed. 232: * @return The current line. 
233: */ 234: protected int getCurrentLine() 235: { 236: return gnu.hTag.where.beginLine; 237: } 238: 239: /** 240: * Get the current position in the document being parsed. 241: * @return The current position. 242: */ 243: protected int getCurrentPos() 244: { 245: return gnu.hTag.where.startPosition; 246: } 247: 248: /** 249: * The method is called when the HTML end (closing) tag is found or if 250: * the parser concludes that the one should be present in the 251: * current position. The method is called immediatly 252: * before calling the handleEndTag(). 253: * @param omitted True if the tag is no actually present in the document, 254: * but is supposed by the parser (like </html> at the end of the 255: * document). 256: */ 257: protected void endTag(boolean omitted) 258: { 259: } 260: 261: /** 262: * Invokes the error handler. The default method in this implementation 263: * finally delegates the call to handleError, also providing the number of the 264: * current line. 265: */ 266: protected void error(String msg) 267: { 268: gnu.error(msg); 269: } 270: 271: /** 272: * Invokes the error handler. The default method in this implementation 273: * finally delegates the call to error (msg+": '"+invalid+"'"). 274: */ 275: protected void error(String msg, String invalid) 276: { 277: gnu.error(msg, invalid); 278: } 279: 280: /** 281: * Invokes the error handler. The default method in this implementation 282: * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3). 283: */ 284: protected void error(String parm1, String parm2, String parm3) 285: { 286: gnu.error(parm1, parm2, parm3); 287: } 288: 289: /** 290: * Invokes the error handler. The default method in this implementation 291: * finally delegates the call to error 292: * (parm1+" "+ parm2+" "+ parm3+" "+ parm4). 
293: */ 294: protected void error(String parm1, String parm2, String parm3, String parm4) 295: { 296: gnu.error(parm1, parm2, parm3, parm4); 297: } 298: 299: /** 300: * In this implementation, this is never called and returns without action. 301: */ 302: protected void flushAttributes() 303: { 304: gnu.flushAttributes(); 305: } 306: 307: /** 308: * Handle HTML comment. The default method returns without action. 309: * @param comment The comment being handled 310: */ 311: protected void handleComment(char[] comment) 312: { 313: } 314: 315: /** 316: * This is additionally called in when the HTML content terminates 317: * without closing the HTML comment. This can only happen if the 318: * HTML document contains errors (for example, the closing --;gt is 319: * missing. The default method calls the error handler. 320: */ 321: protected void handleEOFInComment() 322: { 323: gnu.error("Unclosed comment"); 324: } 325: 326: /** 327: * Handle the tag with no content, like <br>. The method is 328: * called for the elements that, in accordance with the current DTD, 329: * has an empty content. 330: * @param tag The tag being handled. 331: * @throws javax.swing.text.ChangedCharSetException 332: */ 333: protected void handleEmptyTag(TagElement tag) 334: throws ChangedCharSetException 335: { 336: } 337: 338: /** 339: * The method is called when the HTML closing tag ((like </table>) 340: * is found or if the parser concludes that the one should be present 341: * in the current position. 342: * @param tag The tag being handled 343: */ 344: protected void handleEndTag(TagElement tag) 345: { 346: } 347: 348: /* Handle error that has occured in the given line. */ 349: protected void handleError(int line, String message) 350: { 351: } 352: 353: /** 354: * The method is called when the HTML opening tag ((like <table>) 355: * is found or if the parser concludes that the one should be present 356: * in the current position. 
357: * @param tag The tag being handled 358: */ 359: protected void handleStartTag(TagElement tag) 360: { 361: } 362: 363: /** 364: * Handle the text section. 365: * <p> For non-preformatted section, the parser replaces 366: * \t, \r and \n by spaces and then multiple spaces 367: * by a single space. Additionaly, all whitespace around 368: * tags is discarded. 369: * </p> 370: * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves 371: * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n, 372: * if it is present. Additionally, it replaces each occurence of \r or \r\n 373: * by a single \n.</p> 374: * 375: * @param text A section text. 376: */ 377: protected void handleText(char[] text) 378: { 379: } 380: 381: /** 382: * Handle HTML <title> tag. This method is invoked when 383: * both title starting and closing tags are already behind. 384: * The passed argument contains the concatenation of all 385: * title text sections. 386: * @param title The title text. 387: */ 388: protected void handleTitle(char[] title) 389: { 390: } 391: 392: /** 393: * Constructs the tag from the given element. In this implementation, 394: * this is defined, but never called. 395: * @param element the base element of the tag. 396: * @return the tag 397: */ 398: protected TagElement makeTag(Element element) 399: { 400: return makeTag(element, false); 401: } 402: 403: /** 404: * Constructs the tag from the given element. 405: * @param element the tag base {@link javax.swing.text.html.parser.Element} 406: * @param isSupposed true if the tag is not actually present in the 407: * html input, but the parser supposes that it should to occur in 408: * the current location. 409: * @return the tag 410: */ 411: protected TagElement makeTag(Element element, boolean isSupposed) 412: { 413: return new TagElement(element, isSupposed); 414: } 415: 416: /** 417: * This is called when the tag, representing the given element, 418: * occurs first time in the document. 
419: * @param element 420: */ 421: protected void markFirstTime(Element element) 422: { 423: } 424: 425: /** 426: * The method is called when the HTML opening tag ((like <table>) 427: * is found or if the parser concludes that the one should be present 428: * in the current position. The method is called immediately before 429: * calling the handleStartTag. 430: * @param tag The tag 431: */ 432: protected void startTag(TagElement tag) 433: throws ChangedCharSetException 434: { 435: } 436: }
// GNU Classpath (0.18) -- end of source listing