gloox 1.0

parser.cpp

00001 /*
00002   Copyright (c) 2004-2009 by Jakob Schroeter <js@camaya.net>
00003   This file is part of the gloox library. http://camaya.net/gloox
00004 
00005   This software is distributed under a license. The full license
00006   agreement can be found in the file LICENSE in this distribution.
00007   This software may not be copied, modified, sold or distributed
00008   other than expressed in the named license agreement.
00009 
00010   This software is distributed without any warranty.
00011 */
00012 
00013 #include "gloox.h"
00014 #include "util.h"
00015 #include "parser.h"
00016 
00017 #include <cstdlib>
00018 
00019 namespace gloox
00020 {
00021 
00022   Parser::Parser( TagHandler* ph, bool deleteRoot )
00023     : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
00024       m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
00025       m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
00026   {
00027   }
00028 
00029   Parser::~Parser()
00030   {
00031     delete m_root;
00032     delete m_xmlnss;
00033   }
00034 
00035   Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
00036   {
00037     std::string::size_type p = data.find( ';', pos );
00038     std::string::size_type diff = p - pos;
00039 
00040     if( p == std::string::npos )
00041     {
00042       m_backBuffer = data.substr( pos );
00043       return DecodeInsufficient;
00044     }
00045 
00046     if( diff < 3 || diff > 9 )
00047       return DecodeInvalid;
00048 
00049     std::string rep;
00050     switch( data[pos + 1] )
00051     {
00052       case '#':
00053         {
00054           int base = 10;
00055           int idx = 2;
00056 
00057           if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
00058           {
00059             base = 16;
00060             idx = 3;
00061           }
00062 
00063           char* end;
00064           const long int val = std::strtol( data.data() + pos + idx, &end, base );
00065           if( *end != ';' || val < 0 )
00066             return DecodeInvalid;
00067 
00068           if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
00069           {
00070             rep += char( val );
00071           }
00072           else if( val >= 0x80 && val <= 0x7FF )
00073           {
00074             rep += char( 192 + ( val >> 6 ) );
00075             rep += char( 128 + ( val % 64 ) );
00076           }
00077           else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
00078           {
00079             rep += char( 224 + ( val >> 12 ) );
00080             rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00081             rep += char( 128 + ( val % 64 ) );
00082           }
00083           else if( val >= 0x100000 && val < 0x10FFFF )
00084           {
00085             rep += char( 240 + ( val >> 18 ) );
00086             rep += char( 128 + ( ( val >> 12 ) % 64 ) );
00087             rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00088             rep += char( 128 + ( val % 64 ) );
00089           }
00090           else
00091             return DecodeInvalid;
00092         }
00093         break;
00094       case 'l':
00095         if( diff == 3 && data[pos + 2] == 't' )
00096           rep += '<';
00097         else
00098           return DecodeInvalid;
00099         break;
00100       case 'g':
00101         if( diff == 3 && data[pos + 2] == 't' )
00102           rep += '>';
00103         else
00104           return DecodeInvalid;
00105         break;
00106       case 'a':
00107         if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
00108           rep += '\'';
00109         else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
00110           rep += '&';
00111         else
00112           return DecodeInvalid;
00113         break;
00114       case 'q':
00115         if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
00116           rep += '"';
00117         else
00118           return DecodeInvalid;
00119         break;
00120       default:
00121         return DecodeInvalid;
00122     }
00123 
00124     switch( m_state )
00125     {
00126       case TagInside:
00127         m_cdata += rep;
00128         break;
00129       case TagAttributeValue:
00130         m_value += rep;
00131         break;
00132       default:
00133         break;
00134     }
00135     pos += diff;
00136     return DecodeValid;
00137   }
00138 
00139   Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
00140                                                 const std::string& needle )
00141   {
00142     if( pos + needle.length() <= data.length() )
00143     {
00144       if( !data.compare( pos, needle.length(), needle ) )
00145       {
00146         pos += needle.length() - 1;
00147         return ForwardFound;
00148       }
00149       else
00150       {
00151         return ForwardNotFound;
00152       }
00153     }
00154     else
00155     {
00156       m_backBuffer = data.substr( pos );
00157       return ForwardInsufficientSize;
00158     }
00159   }
00160 
00161   int Parser::feed( std::string& data )
00162   {
00163     if( !m_backBuffer.empty() )
00164     {
00165       data.insert( 0, m_backBuffer );
00166       m_backBuffer = EmptyString;
00167     }
00168 
00169     std::string::size_type count = data.length();
00170     for( std::string::size_type i = 0; i < count; ++i )
00171     {
00172       const unsigned char c = data[i];
00173 //       printf( "found char:   %c, ", c );
00174 
00175       if( !isValid( c ) )
00176       {
00177         cleanup();
00178         return static_cast<int>( i );
00179       }
00180 
00181       switch( m_state )
00182       {
00183         case Initial:
00184 //           printf( "Initial: %c\n", c );
00185           if( isWhitespace( c ) )
00186             break;
00187 
00188           switch( c )
00189           {
00190             case '<':
00191               m_state = TagOpening;
00192               break;
00193             default:
00194               cleanup();
00195               return static_cast<int>( i );
00196               break;
00197           }
00198           break;
00199         case InterTag:
00200 //           printf( "InterTag: %c\n", c );
00201           m_tag = EmptyString;
00202           if( isWhitespace( c ) )
00203             break;
00204 
00205           switch( c )
00206           {
00207             case '<':
00208               m_state = TagOpening;
00209               break;
00210             case '>':
00211             default:
00212               if( m_current )
00213               {
00214                 m_cdata += c;
00215                 m_state = TagInside;
00216               }
00217               break;
00218           }
00219           break;
00220           case TagOpening:               // opening '<' has been found before
00221 //           printf( "TagOpening: %c\n", c );
00222           if( isWhitespace( c ) )
00223             break;
00224 
00225           switch( c )
00226           {
00227             case '<':
00228             case '>':
00229             case '&':
00230               cleanup();
00231               return static_cast<int>( i );
00232               break;
00233             case '/':
00234               m_state = TagClosingSlash;
00235               break;
00236             case '?':
00237               m_state = TagNameCollect;
00238               m_preamble = 1;
00239               break;
00240             case '!':
00241               switch( forwardScan( i, data, "![CDATA[" ) )
00242               {
00243                 case ForwardFound:
00244                   m_state = TagCDATASection;
00245                   break;
00246                 case ForwardNotFound:
00247                   cleanup();
00248                   return static_cast<int>( i );
00249                 case ForwardInsufficientSize:
00250                   return -1;
00251               }
00252               break;
00253             default:
00254               m_tag += c;
00255               m_state = TagNameCollect;
00256               break;
00257           }
00258           break;
00259         case TagCDATASection:
00260           switch( c )
00261           {
00262             case ']':
00263               switch( forwardScan( i, data, "]]>" ) )
00264               {
00265                 case ForwardFound:
00266                   m_state = TagInside;
00267                   break;
00268                 case ForwardNotFound:
00269                   m_cdata += c;
00270                   break;
00271                 case ForwardInsufficientSize:
00272                   return -1;
00273               }
00274               break;
00275             default:
00276               m_cdata += c;
00277               break;
00278           }
00279           break;
00280         case TagNameCollect:          // we're collecting the tag's name, we have at least one octet already
00281 //           printf( "TagNameCollect: %c\n", c );
00282           if( isWhitespace( c ) )
00283           {
00284             m_state = TagNameComplete;
00285             break;
00286           }
00287 
00288           switch( c )
00289           {
00290             case '<':
00291             case '?':
00292             case '!':
00293             case '&':
00294               cleanup();
00295               return static_cast<int>( i );
00296               break;
00297             case '/':
00298               m_state = TagOpeningSlash;
00299               break;
00300             case '>':
00301               addTag();
00302               m_state = TagInside;
00303               break;
00304             case ':':
00305               if( !m_haveTagPrefix )
00306               {
00307                 m_haveTagPrefix = true;
00308                 m_tagPrefix = m_tag;
00309                 m_tag = EmptyString;
00310               }
00311               else
00312               {
00313                 cleanup();
00314                 return static_cast<int>( i );
00315               }
00316               break;
00317             default:
00318               m_tag += c;
00319               break;
00320           }
00321           break;
00322         case TagInside:                // we're inside a tag, expecting a child tag or cdata
00323 //           printf( "TagInside: %c\n", c );
00324           m_tag = EmptyString;
00325           switch( c )
00326           {
00327             case '<':
00328               addCData();
00329               m_state = TagOpening;
00330               break;
00331             case '&':
00332 //               printf( "TagInside, calling decode\n" );
00333               switch( decode( i, data ) )
00334               {
00335                 case DecodeValid:
00336                   break;
00337                 case DecodeInvalid:
00338                   cleanup();
00339                   return static_cast<int>( i );
00340                 case DecodeInsufficient:
00341                   return -1;
00342               }
00343               break;
00344             default:
00345               m_cdata += c;
00346               break;
00347           }
00348           break;
00349         case TagOpeningSlash:         // a slash in an opening tag has been found, initing close of the tag
00350 //           printf( "TagOpeningSlash: %c\n", c );
00351           if( isWhitespace( c ) )
00352             break;
00353 
00354           if( c == '>' )
00355           {
00356             addTag();
00357             if( !closeTag() )
00358             {
00359 //               printf( "noipe, here\n" );
00360               cleanup();
00361               return static_cast<int>( i );
00362             }
00363 
00364             m_state = InterTag;
00365           }
00366           else
00367           {
00368             cleanup();
00369             return static_cast<int>( i );
00370           }
00371           break;
00372         case TagClosingSlash:         // we have found the '/' of a closing tag
00373 //           printf( "TagClosingSlash: %c\n", c );
00374           if( isWhitespace( c ) )
00375             break;
00376 
00377           switch( c )
00378           {
00379             case '>':
00380             case '<':
00381             case '/':
00382               cleanup();
00383               return static_cast<int>( i );
00384               break;
00385             default:
00386               m_tag += c;
00387               m_state = TagClosing;
00388               break;
00389           }
00390           break;
00391         case TagClosing:               // we're collecting the name of a closing tag
00392 //           printf( "TagClosing: %c\n", c );
00393           switch( c )
00394           {
00395             case '<':
00396             case '/':
00397             case '!':
00398             case '?':
00399             case '&':
00400               cleanup();
00401               return static_cast<int>( i );
00402               break;
00403             case ':':
00404               if( !m_haveTagPrefix )
00405               {
00406                 m_haveTagPrefix = true;
00407                 m_tagPrefix = m_tag;
00408                 m_tag = EmptyString;
00409               }
00410               else
00411               {
00412                 cleanup();
00413                 return static_cast<int>( i );
00414               }
00415               break;
00416             case '>':
00417               if( !closeTag() )
00418               {
00419 //                 printf( "here\n" );
00420                 cleanup();
00421                 return static_cast<int>( i );
00422               }
00423               m_state = InterTag;
00424               break;
00425             default:
00426               m_tag += c;
00427               break;
00428           }
00429           break;
00430         case TagNameComplete:        // a tag name is complete, expect tag close or attribs
00431 //           printf( "TagNameComplete: %c\n", c );
00432           if( isWhitespace( c ) )
00433             break;
00434 
00435           switch( c )
00436           {
00437             case '<':
00438             case '!':
00439             case '&':
00440               cleanup();
00441               return static_cast<int>( i );
00442               break;
00443             case '/':
00444               m_state = TagOpeningSlash;
00445               break;
00446             case '>':
00447               if( m_preamble == 1 )
00448               {
00449                 cleanup();
00450                 return static_cast<int>( i );
00451               }
00452               m_state = TagInside;
00453               addTag();
00454               break;
00455             case '?':
00456               if( m_preamble == 1 )
00457                 m_preamble = 2;
00458               else
00459               {
00460                 cleanup();
00461                 return static_cast<int>( i );
00462               }
00463               break;
00464             default:
00465               m_attrib += c;
00466               m_state = TagAttribute;
00467               break;
00468           }
00469           break;
00470         case TagAttribute:                  // we're collecting the name of an attribute, we have at least 1 octet
00471 //           printf( "TagAttribute: %c\n", c );
00472           if( isWhitespace( c ) )
00473           {
00474             m_state = TagAttributeComplete;
00475             break;
00476           }
00477 
00478           switch( c )
00479           {
00480             case '<':
00481             case '/':
00482             case '>':
00483             case '?':
00484             case '!':
00485             case '&':
00486               cleanup();
00487               return static_cast<int>( i );
00488               break;
00489             case '=':
00490               m_state = TagAttributeEqual;
00491               break;
00492             case ':':
00493               if( !m_haveAttribPrefix && m_attrib != XMLNS )
00494               {
00495                 m_haveAttribPrefix = true;
00496                 m_attribPrefix = m_attrib;
00497                 m_attrib = EmptyString;
00498               }
00499               else if( m_attrib == XMLNS )
00500               {
00501                 m_attribIsXmlns = true;
00502                 m_attrib = EmptyString;
00503               }
00504               else
00505               {
00506                 cleanup();
00507                 return static_cast<int>( i );
00508               }
00509               break;
00510             default:
00511               m_attrib += c;
00512           }
00513           break;
00514         case TagAttributeComplete:         // we're expecting an equals sign or ws
00515 //           printf( "TagAttributeComplete: %c\n", c );
00516           if( isWhitespace( c ) )
00517             break;
00518 
00519           switch( c )
00520           {
00521             case '=':
00522               m_state = TagAttributeEqual;
00523               break;
00524             default:
00525               cleanup();
00526               return static_cast<int>( i );
00527               break;
00528           }
00529           break;
00530         case TagAttributeEqual:            // we have found an equals sign
00531 //           printf( "TagAttributeEqual: %c\n", c );
00532           if( isWhitespace( c ) )
00533             break;
00534 
00535           switch( c )
00536           {
00537             case '"':
00538               m_quote = true;
00539             case '\'':
00540               m_state = TagAttributeValue;
00541               break;
00542             default:
00543               cleanup();
00544               return static_cast<int>( i );
00545               break;
00546           }
00547           break;
00548         case TagAttributeValue:                 // we're expecting value data
00549 //           printf( "TagValue: %c\n", c );
00550           switch( c )
00551           {
00552             case '<':
00553               cleanup();
00554               return static_cast<int>( i );
00555               break;
00556             case '\'':
00557               if( m_quote )
00558               {
00559                 m_value += c;
00560                 break;
00561               }
00562             case '"':
00563               addAttribute();
00564               m_state = TagNameAlmostComplete;
00565               m_quote = false;
00566               break;
00567             case '&':
00568 //               printf( "TagAttributeValue, calling decode\n" );
00569               switch( decode( i, data ) )
00570               {
00571                 case DecodeValid:
00572                   break;
00573                 case DecodeInvalid:
00574                   cleanup();
00575                   return static_cast<int>( i );
00576                 case DecodeInsufficient:
00577                   return -1;
00578               }
00579               break;
00580             case '>':
00581             default:
00582               m_value += c;
00583           }
00584           break;
00585         case TagNameAlmostComplete:
00586 //           printf( "TagAttributeEqual: %c\n", c );
00587           if( isWhitespace( c ) )
00588           {
00589             m_state = TagNameComplete;
00590             break;
00591           }
00592 
00593           switch( c )
00594           {
00595             case '/':
00596               m_state = TagOpeningSlash;
00597               break;
00598             case '>':
00599               if( m_preamble == 1 )
00600               {
00601                 cleanup();
00602                 return static_cast<int>( i );
00603               }
00604               m_state = TagInside;
00605               addTag();
00606               break;
00607             case '?':
00608               if( m_preamble == 1 )
00609                 m_preamble = 2;
00610               else
00611               {
00612                 cleanup();
00613                 return static_cast<int>( i );
00614               }
00615               break;
00616             default:
00617               cleanup();
00618               return static_cast<int>( i );
00619               break;
00620           }
00621           break;
00622         default:
00623 //           printf( "default action!?\n" );
00624           break;
00625       }
00626 //       printf( "parser state: %d\n", m_state );
00627     }
00628 
00629     return -1;
00630   }
00631 
00632   void Parser::addTag()
00633   {
00634     if( !m_root )
00635     {
00636 //       printf( "created Tag named %s, ", m_tag.c_str() );
00637       m_root = new Tag( m_tag );
00638       m_current = m_root;
00639     }
00640     else
00641     {
00642 //       printf( "created Tag named %s, ", m_tag.c_str() );
00643       m_current = new Tag( m_current, m_tag );
00644     }
00645 
00646     if( m_haveTagPrefix )
00647     {
00648 //       printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
00649       m_current->setPrefix( m_tagPrefix );
00650       m_haveTagPrefix = false;
00651     }
00652 
00653     if( m_attribs.size() )
00654     {
00655       m_current->setAttributes( m_attribs );
00656 //       printf( "added %d attributes, ", m_attribs.size() );
00657       m_attribs.clear();
00658     }
00659 
00660     if( m_xmlnss )
00661     {
00662 //       printf( "have ns decls\n" );
00663 //       StringMap::const_iterator it = m_xmlnss->begin();
00664 //       for( ; it != m_xmlnss->end(); ++it )
00665 //         printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
00666       m_current->setXmlns( m_xmlnss );
00667       m_xmlnss = 0;
00668     }
00669 
00670     m_current->setXmlns( m_xmlns );
00671     m_xmlns = EmptyString;
00672 
00673     if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
00674     {
00675       streamEvent( m_root );
00676       cleanup( m_deleteRoot );
00677       return;
00678     }
00679 //     else
00680 //       printf( "%s, ", m_root->xml().c_str() );
00681 
00682     if( m_root && m_root == m_current && m_tagPrefix == "stream" )
00683       m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
00684 
00685     if( m_tag == "xml" && m_preamble == 2 )
00686       cleanup();
00687   }
00688 
00689   void Parser::addAttribute()
00690   {
00691     Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
00692     if( m_attribIsXmlns )
00693     {
00694       if( !m_xmlnss )
00695         m_xmlnss = new StringMap();
00696 
00697       (*m_xmlnss)[m_attrib] = m_value;
00698       attr->setPrefix( XMLNS );
00699     }
00700     else
00701     {
00702 //   printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
00703       if( !m_attribPrefix.empty() )
00704         attr->setPrefix( m_attribPrefix );
00705       if( m_attrib == XMLNS )
00706         m_xmlns = m_value;
00707     }
00708     m_attribs.push_back( attr );
00709     m_attrib = EmptyString;
00710     m_value = EmptyString;
00711     m_attribPrefix = EmptyString;
00712     m_haveAttribPrefix = false;
00713     m_attribIsXmlns = false;
00714   }
00715 
00716   void Parser::addCData()
00717   {
00718     if( m_current && !m_cdata.empty() )
00719     {
00720       m_current->addCData( m_cdata );
00721 //       printf( "added cdata %s to %s: %s\n",
00722 //               m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
00723       m_cdata = EmptyString;
00724     }
00725   }
00726 
00727   bool Parser::closeTag()
00728   {
00729 //     printf( "about to close, " );
00730 
00731     if( m_tag == "stream" && m_tagPrefix == "stream" )
00732       return true;
00733 
00734     if( !m_current || m_current->name() != m_tag
00735         || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
00736     {
00737 //       printf( "current xml: %s\n", m_current->xml().c_str() );
00738 //       printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
00739 //       printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
00740       return false;
00741     }
00742 
00743 //       printf( "m_current: %s, ", m_current->name().c_str() );
00744 //       printf( "m_tag: %s, ", m_tag.c_str() );
00745 
00746     m_tagPrefix = EmptyString;
00747     m_haveTagPrefix = false;
00748 
00749     if( m_current->parent() )
00750       m_current = m_current->parent();
00751     else
00752     {
00753 //       printf( "pushing upstream\n" );
00754       streamEvent( m_root );
00755       cleanup( m_deleteRoot );
00756     }
00757 
00758     return true;
00759   }
00760 
00761   void Parser::cleanup( bool deleteRoot )
00762   {
00763     if( deleteRoot )
00764       delete m_root;
00765     m_root = 0;
00766     m_current = 0;
00767     delete m_xmlnss;
00768     m_xmlnss = 0;
00769     m_cdata = EmptyString;
00770     m_tag = EmptyString;
00771     m_attrib = EmptyString;
00772     m_attribPrefix = EmptyString;
00773     m_tagPrefix = EmptyString;
00774     m_haveAttribPrefix = false;
00775     m_haveTagPrefix = false;
00776     m_value = EmptyString;
00777     m_xmlns = EmptyString;
00778     util::clearList( m_attribs );
00779     m_attribs.clear();
00780     m_state = Initial;
00781     m_preamble = 0;
00782   }
00783 
00784   bool Parser::isValid( unsigned char c )
00785   {
00786     return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
00787   }
00788 
00789   bool Parser::isWhitespace( unsigned char c )
00790   {
00791     return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
00792   }
00793 
00794   void Parser::streamEvent( Tag* tag )
00795   {
00796     if( m_tagHandler )
00797       m_tagHandler->handleTag( tag );
00798   }
00799 
00800 }