Source for gnu.xml.dom.html2.DomHTMLParser

   1: /* DomHTMLParser.java --
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.xml.dom.html2;
  40: 
  41: import gnu.javax.swing.text.html.parser.support.Parser;
  42: 
  43: import java.io.IOException;
  44: import java.io.Reader;
  45: 
  46: import java.util.Enumeration;
  47: import java.util.Iterator;
  48: import java.util.LinkedList;
  49: 
  50: import javax.swing.text.AttributeSet;
  51: import javax.swing.text.html.HTML;
  52: import javax.swing.text.html.parser.DTD;
  53: import javax.swing.text.html.parser.TagElement;
  54: 
  55: import org.w3c.dom.NamedNodeMap;
  56: import org.w3c.dom.Node;
  57: import org.w3c.dom.html2.HTMLDocument;
  58: 
  59: /**
  60:  * This parser reads HTML from the given stream and stores into
  61:  * {@link HTMLDocument}. The HTML tag becomes the {@link Node}.
  62:  * The tag attributes become the node attributes. The text inside
  63:  * HTML tag is inserted as one or several text nodes. The nested
  64:  * HTML tags are inserted as child nodes.
  65:  * 
  66:  * If the strict tree structure, closing the tag means closing all
  67:  * nested tags. To work around this, this parser closes the nested
  68:  * tags and immediately reopens them after the closed tag.
  69:  * In this way, <code>&lt;b&gt;&lt;i&gt;c&lt;/b&gt;d</code> 
  70:  * is parsed as <code>&lt;b&gt;&lt;i&gt;c&lt;/i&gt;&lt;/b&gt;&lt;i&gt;d</code> .
  71:  *
  72:  * @author Audrius Meskauskas (AudriusA@Bioinformatics.org)
  73:  */
  74: public class DomHTMLParser
  75:   extends gnu.javax.swing.text.html.parser.support.Parser
  76: {
  77:   /**
  78:    * The target where HTML document will be inserted.
  79:    */
  80:   protected DomHTMLDocument document;
  81: 
  82:   /**
  83:    * The subsequently created new nodes will be inserted as the
  84:    * childs of this cursor.
  85:    */
  86:   protected Node cursor;
  87: 
  88:   /**
  89:    * Create parser using the given DTD.
  90:    *
  91:    * @param dtd the DTD (for example,
  92:    * {@link gnu.javax.swing.text.html.parser.HTML_401F}).
  93:    */
  94:   public DomHTMLParser(DTD dtd)
  95:   {
  96:     super(dtd);
  97:   }
  98: 
  99:   /**
 100:    * Parse SGML insertion ( &lt;! ... &gt; ).
 101:    * Currently just treats it as comment.
 102:    */
 103:   public boolean parseMarkupDeclarations(StringBuffer strBuff)
 104:                                   throws java.io.IOException
 105:   {
 106:     Node c = document.createComment(strBuff.toString());
 107:     cursor.appendChild(c);
 108:     return false;
 109:   }
 110: 
 111:   /**
 112:    * Read the document, present in the given stream, and
 113:    * return the corresponding {@link HTMLDocument}.
 114:    *
 115:    * @param input a stream to read from.
 116:    * @return a document, reflecting the structure of the provided HTML
 117:    * text.
 118:    *
 119:    * @throws IOException if the reader throws one.
 120:    */
 121:   public HTMLDocument parseDocument(Reader input)
 122:                     throws IOException
 123:   {
 124:     try
 125:       {
 126:         document = new DomHTMLDocument();
 127:         document.setCheckWellformedness(false);
 128:         document.setCheckingCharacters(false);
 129:         
 130:         cursor = document;
 131:         
 132:         parse(input);
 133: 
 134:         DomHTMLDocument h = document;
 135:         document = null;
 136:         return h;
 137:       }
 138:     catch (Exception ex)
 139:       {
 140:         ex.printStackTrace();
 141:         throw new IOException("Exception: " + ex.getMessage());
 142:       }
 143:   }
 144:   
 145:   /**
 146:    * Create a new node.
 147:    * @param name the name of node, case insensitive.
 148:    * @return the created node.
 149:    */
 150:   protected Node createNode(String name)
 151:   {
 152:     Node new_node = document.createElement(name.toLowerCase());
 153:     AttributeSet hatts = getAttributes();
 154:     NamedNodeMap natts = new_node.getAttributes();
 155: 
 156:     Enumeration enumeration = hatts.getAttributeNames();
 157:     Object key;
 158:     Node attribute;
 159: 
 160:     while (hatts != null)
 161:       {
 162:         while (enumeration.hasMoreElements())
 163:           {
 164:             key = enumeration.nextElement();
 165:             attribute = document.createAttribute(key.toString());
 166:             attribute.setNodeValue(hatts.getAttribute(key).toString());
 167:             natts.setNamedItem(attribute);
 168:           }
 169: 
 170:         // The default values are stored in a parent node. 
 171:         hatts = hatts.getResolveParent();
 172:       }
 173: 
 174:     return new_node;
 175:   }
 176:   
 177:   /**
 178:    * Handle comment by inserting the comment node.
 179:    * @param text the comment text.
 180:    */
 181:   protected void handleComment(char[] text)
 182:   {
 183:     Node c = document.createComment(new String(text));
 184:     cursor.appendChild(c);
 185:   }
 186:   
 187:   /**
 188:    * Handle the tag with no content.
 189:    * @param tag the tag to handle.
 190:    */
 191:   protected void handleEmptyTag(TagElement tag)
 192:   {
 193:     String name = tag.getHTMLTag().toString();
 194: 
 195:     if (name.equalsIgnoreCase("#pcdata"))
 196:       return;
 197: 
 198:     Node c = createNode(name);
 199:     cursor.appendChild(c);
 200:   }
 201:   
 202:   /**
 203:    * Close the given tag. Close and reopen all nested tags.
 204:    * @param tag the tag to close.
 205:    */
 206:   protected void handleEndTag(TagElement tag)
 207:   {
 208:     String name = tag.getHTMLTag().toString();
 209:     String nname = cursor.getNodeName();
 210: 
 211:     // Closing the current tag.
 212:     if (nname != null && nname.equalsIgnoreCase(name))
 213:       {
 214:         cursor = cursor.getParentNode();
 215:       }
 216:     else
 217:       {
 218:         Node nCursor = cursor.getParentNode();
 219: 
 220:         // Remember the opened nodes.
 221:         LinkedList open = new LinkedList();
 222:         Node close = cursor;
 223:         while (close != null && !close.getNodeName().equalsIgnoreCase(name))
 224:           {
 225:             if (close != document)
 226:               open.addFirst(close);
 227:             close = close.getParentNode();
 228:           }
 229:         if (close == null)
 230:           cursor = document;
 231:         else
 232:           cursor = close.getParentNode();
 233: 
 234:         // Insert the copies of the opened nodes.   
 235:         Iterator iter = open.iterator();
 236:         while (iter.hasNext())
 237:           {
 238:             Node item = (Node) iter.next();
 239:             cursor.appendChild(item);
 240:             cursor = item;
 241:           }
 242:       }
 243:   }
 244: 
 245:   /**
 246:    * Handle the start tag by inserting the HTML element.
 247:    * @param tag the tag to handle.
 248:    */
 249:   protected void handleStartTag(TagElement tag)
 250:   {
 251:     HTML.Tag h = tag.getHTMLTag();
 252:     Node c = createNode(h.toString());
 253:     cursor.appendChild(c);
 254:     cursor = c;
 255:   }
 256:   
 257:   /**
 258:    * Handle text by inserting the text node.
 259:    * @param text the text to insert.
 260:    */
 261:   protected void handleText(char[] text)
 262:   {
 263:     Node c = document.createTextNode(text, 0, text.length);
 264:     cursor.appendChild(c);
 265:   }
 266: }