Source for gnu.xml.dom.html2.DomHTMLParser

   1: /* DomHTMLParser.java --
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.xml.dom.html2;
  40: 
  41: import gnu.javax.swing.text.html.parser.support.Parser;
  42: 
  43: import java.io.IOException;
  44: import java.io.Reader;
  45: 
  46: import java.util.Enumeration;
  47: import java.util.Iterator;
  48: import java.util.LinkedList;
  49: 
  50: import javax.swing.text.AttributeSet;
  51: import javax.swing.text.html.HTML;
  52: import javax.swing.text.html.parser.DTD;
  53: import javax.swing.text.html.parser.TagElement;
  54: 
  55: import org.w3c.dom.NamedNodeMap;
  56: import org.w3c.dom.Node;
  57: import org.w3c.dom.html2.HTMLDocument;
  58: 
  59: /**
  60:  * This parser reads HTML from the given stream and stores into
  61:  * {@link HTMLDocument}. The HTML tag becomes the {@link Node}.
  62:  * The tag attributes become the node attributes. The text inside
  63:  * HTML tag is inserted as one or several text nodes. The nested
  64:  * HTML tags are inserted as child nodes.
  65:  * 
  66:  * If the strict tree structure, closing the tag means closing all
  67:  * nested tags. To work around this, this parser closes the nested
  68:  * tags and immediately reopens them after the closed tag.
  69:  * In this way, <code>&lt;b&gt;&lt;i&gt;c&lt;/b&gt;d</code> 
  70:  * is parsed as <code>&lt;b&gt;&lt;i&gt;c&lt;/i&gt;&lt;/b&gt;&lt;i&gt;d</code> .
  71:  *
  72:  * @author Audrius Meskauskas (AudriusA@Bioinformatics.org)
  73:  */
  74: public class DomHTMLParser
  75:   extends gnu.javax.swing.text.html.parser.support.Parser
  76: {
  77:   /**
  78:    * The target where HTML document will be inserted.
  79:    */
  80:   protected DomHTMLDocument document;
  81: 
  82:   /**
  83:    * The subsequently created new nodes will be inserted as the
  84:    * childs of this cursor.
  85:    */
  86:   protected Node cursor;
  87: 
  88:   /**
  89:    * Create parser using the given DTD.
  90:    *
  91:    * @param dtd the DTD (for example,
  92:    * {@link gnu.javax.swing.text.html.parser.HTML_401F}).
  93:    */
  94:   public DomHTMLParser(DTD dtd)
  95:   {
  96:     super(dtd);
  97:   }
  98: 
  99:   /**
 100:    * Parse SGML insertion ( &lt;! ... &gt; ).
 101:    * Currently just treats it as comment.
 102:    */
 103:   public boolean parseMarkupDeclarations(StringBuffer strBuff)
 104:                                   throws java.io.IOException
 105:   {
 106:     Node c = document.createComment(strBuff.toString());
 107:     cursor.appendChild(c);
 108:     return false;
 109:   }
 110: 
 111:   /**
 112:    * Read the document, present in the given stream, and
 113:    * return the corresponding {@link HTMLDocument}.
 114:    *
 115:    * @param input a stream to read from.
 116:    * @return a document, reflecting the structure of the provided HTML
 117:    * text.
 118:    *
 119:    * @throws IOException if the reader throws one.
 120:    */
 121:   public HTMLDocument parseDocument(Reader input)
 122:                     throws IOException
 123:   {
 124:     try
 125:       {
 126:         document = new DomHTMLDocument();
 127: 
 128:         cursor = document;
 129: 
 130:         parse(input);
 131: 
 132:         DomHTMLDocument h = document;
 133:         document = null;
 134:         return h;
 135:       }
 136:     catch (Exception ex)
 137:       {
 138:         ex.printStackTrace();
 139:         throw new IOException("Exception: " + ex.getMessage());
 140:       }
 141:   }
 142:   
 143:   /**
 144:    * Create a new node.
 145:    * @param name the name of node, case insensitive.
 146:    * @return the created node.
 147:    */
 148:   protected Node createNode(String name)
 149:   {
 150:     Node new_node = document.createElement(name.toLowerCase());
 151:     AttributeSet hatts = getAttributes();
 152:     NamedNodeMap natts = new_node.getAttributes();
 153: 
 154:     Enumeration enumeration = hatts.getAttributeNames();
 155:     Object key;
 156:     Node attribute;
 157: 
 158:     while (hatts != null)
 159:       {
 160:         while (enumeration.hasMoreElements())
 161:           {
 162:             key = enumeration.nextElement();
 163:             attribute = document.createAttribute(key.toString());
 164:             attribute.setNodeValue(hatts.getAttribute(key).toString());
 165:             natts.setNamedItem(attribute);
 166:           }
 167: 
 168:         // The default values are stored in a parent node. 
 169:         hatts = hatts.getResolveParent();
 170:       }
 171: 
 172:     return new_node;
 173:   }
 174:   
 175:   /**
 176:    * Handle comment by inserting the comment node.
 177:    * @param text the comment text.
 178:    */
 179:   protected void handleComment(char[] text)
 180:   {
 181:     Node c = document.createComment(new String(text));
 182:     cursor.appendChild(c);
 183:   }
 184:   
 185:   /**
 186:    * Handle the tag with no content.
 187:    * @param tag the tag to handle.
 188:    */
 189:   protected void handleEmptyTag(TagElement tag)
 190:   {
 191:     String name = tag.getHTMLTag().toString();
 192: 
 193:     if (name.equalsIgnoreCase("#pcdata"))
 194:       return;
 195: 
 196:     Node c = createNode(name);
 197:     cursor.appendChild(c);
 198:   }
 199:   
 200:   /**
 201:    * Close the given tag. Close and reopen all nested tags.
 202:    * @param tag the tag to close.
 203:    */
 204:   protected void handleEndTag(TagElement tag)
 205:   {
 206:     String name = tag.getHTMLTag().toString();
 207:     String nname = cursor.getNodeName();
 208: 
 209:     // Closing the current tag.
 210:     if (nname != null && nname.equalsIgnoreCase(name))
 211:       {
 212:         cursor = cursor.getParentNode();
 213:       }
 214:     else
 215:       {
 216:         Node nCursor = cursor.getParentNode();
 217: 
 218:         // Remember the opened nodes.
 219:         LinkedList open = new LinkedList();
 220:         Node close = cursor;
 221:         while (close != null && !close.getNodeName().equalsIgnoreCase(name))
 222:           {
 223:             if (close != document)
 224:               open.addFirst(close);
 225:             close = close.getParentNode();
 226:           }
 227: 
 228:         if (close == null)
 229:           cursor = document;
 230:         else
 231:           cursor = close.getParentNode();
 232: 
 233:         // Insert the copies of the opened nodes.   
 234:         Iterator iter = open.iterator();
 235:         while (iter.hasNext())
 236:           {
 237:             Node item = (Node) iter.next();
 238:             Node copy = item.cloneNode(true);
 239:             cursor.appendChild(copy);
 240:             cursor = copy;
 241:           }
 242:       }
 243:   }
 244: 
 245:   /**
 246:    * Handle the start tag by inserting the HTML element.
 247:    * @param tag the tag to handle.
 248:    */
 249:   protected void handleStartTag(TagElement tag)
 250:   {
 251:     HTML.Tag h = tag.getHTMLTag();
 252:     Node c = createNode(h.toString());
 253:     cursor.appendChild(c);
 254:     cursor = c;
 255:   }
 256:   
 257:   /**
 258:    * Handle text by inserting the text node.
 259:    * @param text the text to insert.
 260:    */
 261:   protected void handleText(char[] text)
 262:   {
 263:     Node c = document.createTextNode(text, 0, text.length);
 264:     cursor.appendChild(c);
 265:   }
 266: }