Frames | No Frames |
/* DomHTMLParser.java --
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version.
*/


package gnu.xml.dom.html2;

import gnu.javax.swing.text.html.parser.support.Parser;

import java.io.IOException;
import java.io.Reader;

import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedList;

import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.parser.DTD;
import javax.swing.text.html.parser.TagElement;

import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.html2.HTMLDocument;

/**
 * This parser reads HTML from the given stream and stores it into an
 * {@link HTMLDocument}. Each HTML tag becomes a {@link Node}.
 * The tag attributes become the node attributes. The text inside an
 * HTML tag is inserted as one or several text nodes. The nested
 * HTML tags are inserted as child nodes.
 *
 * In the strict tree structure, closing a tag means closing all
 * nested tags. To work around this, this parser closes the nested
 * tags and immediately reopens them after the closed tag.
 * In this way, <code>&lt;b&gt;&lt;i&gt;c&lt;/b&gt;d</code>
 * is parsed as
 * <code>&lt;b&gt;&lt;i&gt;c&lt;/i&gt;&lt;/b&gt;&lt;i&gt;d</code>.
 *
 * @author Audrius Meskauskas (AudriusA@Bioinformatics.org)
 */
public class DomHTMLParser
  extends gnu.javax.swing.text.html.parser.support.Parser
{
  /**
   * The target where the HTML document will be inserted.
   */
  protected DomHTMLDocument document;

  /**
   * The subsequently created new nodes will be inserted as the
   * children of this cursor.
   */
  protected Node cursor;

  /**
   * Create parser using the given DTD.
   *
   * @param dtd the DTD (for example,
   * {@link gnu.javax.swing.text.html.parser.HTML_401F}).
   */
  public DomHTMLParser(DTD dtd)
  {
    super(dtd);
  }

  /**
   * Parse SGML insertion ( &lt;! ... &gt; ).
   * Currently just treats it as comment: the buffer content is appended
   * to the cursor as a comment node.
   *
   * @param strBuff the text of the insertion.
   * @return always <code>false</code>.
   * @throws java.io.IOException declared for the overriding contract;
   * this implementation does not perform I/O itself.
   */
  public boolean parseMarkupDeclarations(StringBuffer strBuff)
    throws java.io.IOException
  {
    Node c = document.createComment(strBuff.toString());
    cursor.appendChild(c);
    return false;
  }

  /**
   * Read the document, present in the given stream, and
   * return the corresponding {@link HTMLDocument}.
   *
   * The parser does not keep a reference to the document once this
   * method returns, regardless of whether parsing succeeded.
   *
   * @param input a stream to read from.
   * @return a document, reflecting the structure of the provided HTML
   * text.
   *
   * @throws IOException if the reader throws one (rethrown unchanged),
   * or if any other exception occurs while parsing (wrapped, with the
   * original exception available as the cause).
   */
  public HTMLDocument parseDocument(Reader input)
    throws IOException
  {
    try
      {
        DomHTMLDocument result = new DomHTMLDocument();
        document = result;
        cursor = result;

        parse(input);

        return result;
      }
    catch (Exception ex)
      {
        // A genuine I/O failure is reported as it is.
        if (ex instanceof IOException)
          throw (IOException) ex;

        // Anything else is reported as IOException, keeping the cause
        // so that the original stack trace is not lost.
        IOException failure =
          new IOException("Exception: " + ex.getMessage());
        failure.initCause(ex);
        throw failure;
      }
    finally
      {
        // Do not retain the document (complete or partial) in the parser.
        document = null;
      }
  }

  /**
   * Create a new node, carrying the attributes of the tag that is
   * currently being parsed. The attributes that are set directly on
   * the tag are copied first; the attributes from the chain of the
   * resolving parents (where the default values are stored) are added
   * only if the tag does not specify them itself.
   *
   * @param name the name of node, case insensitive.
   * @return the created node.
   */
  protected Node createNode(String name)
  {
    // Use the fixed locale: the default one may change the letters
    // (for example, 'I' under the Turkish locale).
    Node new_node =
      document.createElement(name.toLowerCase(java.util.Locale.ENGLISH));
    NamedNodeMap natts = new_node.getAttributes();

    boolean inherited = false;
    AttributeSet hatts = getAttributes();

    while (hatts != null)
      {
        // The names must be enumerated anew for every set in the chain.
        Enumeration enumeration = hatts.getAttributeNames();

        while (enumeration.hasMoreElements())
          {
            Object key = enumeration.nextElement();

            // The link to the resolving parent is not an HTML attribute.
            if (key == AttributeSet.ResolveAttribute)
              continue;

            String attributeName = key.toString();

            // The explicitly specified value wins over the default one.
            if (inherited && natts.getNamedItem(attributeName) != null)
              continue;

            Object value = hatts.getAttribute(key);
            if (value == null)
              continue;

            Node attribute = document.createAttribute(attributeName);
            attribute.setNodeValue(value.toString());
            natts.setNamedItem(attribute);
          }

        // The default values are stored in a parent node.
        hatts = hatts.getResolveParent();
        inherited = true;
      }

    return new_node;
  }

  /**
   * Handle comment by inserting the comment node.
   * @param text the comment text.
   */
  protected void handleComment(char[] text)
  {
    Node c = document.createComment(new String(text));
    cursor.appendChild(c);
  }

  /**
   * Handle the tag with no content. The node is appended to the cursor,
   * but the cursor itself is not moved. The <code>#pcdata</code>
   * pseudo tag is ignored.
   *
   * @param tag the tag to handle.
   */
  protected void handleEmptyTag(TagElement tag)
  {
    String name = tag.getHTMLTag().toString();

    if (name.equalsIgnoreCase("#pcdata"))
      return;

    Node c = createNode(name);
    cursor.appendChild(c);
  }

  /**
   * Close the given tag. Close and reopen all nested tags.
   *
   * The nearest open element with the matching name is looked for,
   * starting from the cursor and moving towards the document. The
   * cursor is moved to the parent of that element. The elements that
   * were open inside it are reopened there as empty copies (same name
   * and attributes, no content), outermost first, and the cursor is
   * placed into the innermost copy.
   *
   * If no open element has the given name, the end tag is ignored.
   *
   * @param tag the tag to close.
   */
  protected void handleEndTag(TagElement tag)
  {
    String name = tag.getHTMLTag().toString();

    // Remember the opened nodes, outermost first.
    LinkedList open = new LinkedList();
    Node close = cursor;
    while (close != null && !close.getNodeName().equalsIgnoreCase(name))
      {
        if (close != document)
          open.addFirst(close);
        close = close.getParentNode();
      }

    // Stray end tag: nothing to close, leave the tree and cursor as is.
    if (close == null)
      return;

    cursor = close.getParentNode();

    // Reopen the nested elements. The copies must be shallow: the
    // content parsed so far stays in the closed originals.
    Iterator iter = open.iterator();
    while (iter.hasNext())
      {
        Node item = (Node) iter.next();
        Node copy = item.cloneNode(false);
        cursor.appendChild(copy);
        cursor = copy;
      }
  }

  /**
   * Handle the start tag by inserting the HTML element. The new
   * element becomes the cursor.
   *
   * @param tag the tag to handle.
   */
  protected void handleStartTag(TagElement tag)
  {
    HTML.Tag h = tag.getHTMLTag();
    Node c = createNode(h.toString());
    cursor.appendChild(c);
    cursor = c;
  }

  /**
   * Handle text by inserting the text node.
   * @param text the text to insert.
   */
  protected void handleText(char[] text)
  {
    Node c = document.createTextNode(text, 0, text.length);
    cursor.appendChild(c);
  }
}