parser.cpp

00001 /*
00002   Copyright (c) 2004-2008 by Jakob Schroeter <js@camaya.net>
00003   This file is part of the gloox library. http://camaya.net/gloox
00004 
00005   This software is distributed under a license. The full license
00006   agreement can be found in the file LICENSE in this distribution.
00007   This software may not be copied, modified, sold or distributed
00008   other than expressed in the named license agreement.
00009 
00010   This software is distributed without any warranty.
00011 */
00012 
00013 
00014 
00015 #include "gloox.h"
00016 
00017 #include "parser.h"
00018 
00019 namespace gloox
00020 {
00021 
00022   Parser::Parser( TagHandler *ph )
00023     : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_state( Initial ),
00024       m_preamble( 0 ), m_quote( false )
00025   {
00026   }
00027 
00028   Parser::~Parser()
00029   {
00030     delete m_root;
00031   }
00032 
00033   bool Parser::feed( const std::string& data )
00034   {
00035     std::string::const_iterator it = data.begin();
00036     for( ; it != data.end(); ++it )
00037     {
00038       const unsigned char c = (*it);
00039 //       printf( "found char:   %c, ", c );
00040 
00041       if( !isValid( c ) )
00042       {
00043         cleanup();
00044         return false;
00045       }
00046 
00047       switch( m_state )
00048       {
00049         case Initial:
00050           m_tag = "";
00051           if( isWhitespace( c ) )
00052             break;
00053 
00054           switch( c )
00055           {
00056             case '<':
00057               m_state = TagOpening;
00058               break;
00059             case '>':
00060             default:
00061 //               cleanup();
00062 //               return false;
00063               break;
00064           }
00065           break;
00066         case TagOpening:               // opening '<' has been found before
00067           if( isWhitespace( c ) )
00068             break;
00069 
00070           switch( c )
00071           {
00072             case '<':
00073             case '>':
00074             case '!':
00075               cleanup();
00076               return false;
00077               break;
00078             case '/':
00079               m_state = TagClosingSlash;
00080               break;
00081             case '?':
00082               m_state = TagNameCollect;
00083               m_preamble = 1;
00084               break;
00085             default:
00086               m_tag += c;
00087               m_state = TagNameCollect;
00088               break;
00089           }
00090           break;
00091         case TagNameCollect:          // we're collecting the tag's name, we have at least one octet already
00092           if( isWhitespace( c ) )
00093           {
00094             m_state = TagNameComplete;
00095             break;
00096           }
00097 
00098           switch( c )
00099           {
00100             case '<':
00101             case '?':
00102             case '!':
00103               cleanup();
00104               return false;
00105               break;
00106             case '/':
00107               m_state = TagOpeningSlash;
00108               break;
00109             case '>':
00110               addTag();
00111               m_state = TagInside;
00112               break;
00113             default:
00114               m_tag += c;
00115               break;
00116           }
00117           break;
00118         case TagInside:                // we're inside a tag, expecting a child tag or cdata
00119           m_tag = "";
00120           switch( c )
00121           {
00122             case '<':
00123               addCData();
00124               m_state = TagOpening;
00125               break;
00126             default:
00127               m_cdata += c;
00128               break;
00129           }
00130           break;
00131         case TagOpeningSlash:         // a slash in an opening tag has been found, initing close of the tag
00132           if( isWhitespace( c ) )
00133             break;
00134 
00135           if( c == '>' )
00136           {
00137             addTag();
00138             if( !closeTag() )
00139             {
00140               cleanup();
00141               return false;
00142             }
00143 
00144             m_state = Initial;
00145           }
00146           else
00147           {
00148             cleanup();
00149             return false;
00150           }
00151           break;
00152         case TagClosingSlash:         // we have found the '/' of a closing tag
00153           if( isWhitespace( c ) )
00154             break;
00155 
00156           switch( c )
00157           {
00158             case '>':
00159             case '<':
00160             case '/':
00161               cleanup();
00162               return false;
00163               break;
00164             default:
00165               m_tag += c;
00166               m_state = TagClosing;
00167               break;
00168           }
00169           break;
00170         case TagClosing:               // we're collecting the name of a closing tag
00171           switch( c )
00172           {
00173             case '<':
00174             case '/':
00175               cleanup();
00176               return false;
00177               break;
00178             case '>':
00179               if( !closeTag() )
00180               {
00181                 cleanup();
00182                 return false;
00183               }
00184 
00185               m_state = Initial;
00186               break;
00187             default:
00188               m_tag += c;
00189               break;
00190           }
00191           break;
00192         case TagNameComplete:        // a tag name is complete, expect tag close or attribs
00193           if( isWhitespace( c ) )
00194             break;
00195 
00196           switch( c )
00197           {
00198             case '<':
00199               cleanup();
00200               return false;
00201               break;
00202             case '/':
00203               m_state = TagOpeningSlash;
00204               break;
00205             case '>':
00206               if( m_preamble == 1 )
00207               {
00208                 cleanup();
00209                 return false;
00210               }
00211               m_state = TagInside;
00212               addTag();
00213               break;
00214             case '?':
00215               if( m_preamble == 1 )
00216                 m_preamble = 2;
00217               else
00218               {
00219                 cleanup();
00220                 return false;
00221               }
00222               break;
00223             default:
00224               m_attrib += c;
00225               m_state = TagAttribute;
00226               break;
00227           }
00228           break;
00229         case TagAttribute:                  // we're collecting the name of an attribute, we have at least 1 octet
00230           if( isWhitespace( c ) )
00231           {
00232             m_state = TagAttributeComplete;
00233             break;
00234           }
00235 
00236           switch( c )
00237           {
00238             case '<':
00239             case '/':
00240             case '>':
00241               cleanup();
00242               return false;
00243               break;
00244             case '=':
00245               m_state = TagAttributeEqual;
00246               break;
00247             default:
00248               m_attrib += c;
00249           }
00250           break;
00251         case TagAttributeComplete:         // we're expecting an equals sign or ws or the attrib value
00252           if( isWhitespace( c ) )
00253             break;
00254 
00255           switch( c )
00256           {
00257             case '=':
00258               m_state = TagAttributeEqual;
00259               break;
00260             case '<':
00261             case '/':
00262             case '>':
00263             default:
00264               cleanup();
00265               return false;
00266               break;
00267           }
00268           break;
00269         case TagAttributeEqual:            // we have found an equals sign
00270           if( isWhitespace( c ) )
00271             break;
00272 
00273           switch( c )
00274           {
00275             case '"':
00276               m_quote = true;
00277             case '\'':
00278               m_state = TagValue;
00279               break;
00280             case '=':
00281             case '<':
00282             case '>':
00283             default:
00284               cleanup();
00285               return false;
00286               break;
00287           }
00288           break;
00289         case TagValue:                 // we're expecting value data
00290           switch( c )
00291           {
00292             case '<':
00293               cleanup();
00294               return false;
00295               break;
00296             case '\'':
00297               if( m_quote )
00298               {
00299                 m_value += c;
00300                 break;
00301               }
00302             case '"':
00303               addAttribute();
00304               m_state = TagNameComplete;
00305               m_quote = false;
00306               break;
00307             case '>':
00308             default:
00309               m_value += c;
00310           }
00311           break;
00312         default:
00313 //           printf( "default action!?\n" );
00314           break;
00315       }
00316 //       printf( "parser state: %d\n", m_state );
00317     }
00318 
00319     return true;
00320   }
00321 
00322   void Parser::addTag()
00323   {
00324     if( !m_root )
00325     {
00326 //       printf( "created Tag named %s, ", m_tag.c_str() );
00327       m_root = new Tag( m_tag, "", true );
00328       m_current = m_root;
00329     }
00330     else
00331     {
00332 //       printf( "created Tag named %s, ", m_tag.c_str() );
00333       m_current = new Tag( m_current, m_tag, "", true );
00334     }
00335 
00336     if( m_attribs.size() )
00337     {
00338       m_current->setAttributes( m_attribs );
00339 //       printf( "added %d attributes, ", m_attribs.size() );
00340       m_attribs.clear();
00341     }
00342 
00343     if( m_tag == "stream:stream" )
00344     {
00345       streamEvent( m_root );
00346       cleanup();
00347     }
00348 //     else
00349 //       printf( "%s, ", m_root->xml().c_str() );
00350 
00351     if( m_tag == "xml" && m_preamble == 2 )
00352       cleanup();
00353   }
00354 
00355   void Parser::addAttribute()
00356   {
00357 //     printf( "adding attribute: %s='%s', ", m_attrib.c_str(), m_value.c_str() );
00358     m_attribs.push_back( Tag::Attribute( Tag::relax( m_attrib ), Tag::relax( m_value ) ) );
00359     m_attrib = "";
00360     m_value = "";
00361 //     printf( "added, " );
00362   }
00363 
00364   void Parser::addCData()
00365   {
00366     if( m_current )
00367     {
00368       m_current->setCData( m_cdata );
00369 //       printf( "added cdata %s, ", m_cdata.c_str() );
00370       m_cdata = "";
00371     }
00372   }
00373 
00374   bool Parser::closeTag()
00375   {
00376 //     printf( "about to close, " );
00377 
00378     if( m_tag == "stream:stream" )
00379       return true;
00380 
00381     if( !m_current || m_current->name() != m_tag )
00382       return false;
00383 
00384 //       printf( "m_current: %s, ", m_current->name().c_str() );
00385 //       printf( "m_tag: %s, ", m_tag.c_str() );
00386 
00387     if( m_current->parent() )
00388       m_current = m_current->parent();
00389     else
00390     {
00391 //       printf( "pushing upstream, " );
00392       streamEvent( m_root );
00393       cleanup();
00394     }
00395 
00396     return true;
00397   }
00398 
00399   void Parser::cleanup()
00400   {
00401     delete m_root;
00402     m_root = 0;
00403     m_current = 0;
00404     m_cdata = "";
00405     m_tag = "";
00406     m_attrib = "";
00407     m_value = "";
00408     m_attribs.clear();
00409     m_state = Initial;
00410     m_preamble = 0;
00411   }
00412 
00413   bool Parser::isValid( unsigned char c )
00414   {
00415     return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
00416   }
00417 
00418   bool Parser::isWhitespace( unsigned char c )
00419   {
00420     return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
00421   }
00422 
00423   void Parser::streamEvent( Tag *tag )
00424   {
00425     if( m_tagHandler )
00426       m_tagHandler->handleTag( tag );
00427   }
00428 
00429 }

Generated on Mon Dec 7 13:28:19 2009 for gloox by  doxygen 1.6.1