parser.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include "gloox.h"
00014 #include "util.h"
00015 #include "parser.h"
00016
00017 #include <cstdlib>
00018
00019 namespace gloox
00020 {
00021
00022 Parser::Parser( TagHandler* ph, bool deleteRoot )
00023 : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
00024 m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
00025 m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
00026 {
00027 }
00028
00029 Parser::~Parser()
00030 {
00031 delete m_root;
00032 delete m_xmlnss;
00033 }
00034
00035 Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
00036 {
00037 std::string::size_type p = data.find( ';', pos );
00038 std::string::size_type diff = p - pos;
00039
00040 if( p == std::string::npos )
00041 {
00042 m_backBuffer = data.substr( pos );
00043 return DecodeInsufficient;
00044 }
00045
00046 if( diff < 3 || diff > 9 )
00047 return DecodeInvalid;
00048
00049 std::string rep;
00050 switch( data[pos + 1] )
00051 {
00052 case '#':
00053 {
00054 int base = 10;
00055 int idx = 2;
00056
00057 if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
00058 {
00059 base = 16;
00060 idx = 3;
00061 }
00062
00063 char* end;
00064 const long int val = std::strtol( data.data() + pos + idx, &end, base );
00065 if( *end != ';' || val < 0 )
00066 return DecodeInvalid;
00067
00068 if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
00069 {
00070 rep += char( val );
00071 }
00072 else if( val >= 0x80 && val <= 0x7FF )
00073 {
00074 rep += char( 192 + ( val >> 6 ) );
00075 rep += char( 128 + ( val % 64 ) );
00076 }
00077 else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
00078 {
00079 rep += char( 224 + ( val >> 12 ) );
00080 rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00081 rep += char( 128 + ( val % 64 ) );
00082 }
00083 else if( val >= 0x100000 && val < 0x10FFFF )
00084 {
00085 rep += char( 240 + ( val >> 18 ) );
00086 rep += char( 128 + ( ( val >> 12 ) % 64 ) );
00087 rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00088 rep += char( 128 + ( val % 64 ) );
00089 }
00090 else
00091 return DecodeInvalid;
00092 }
00093 break;
00094 case 'l':
00095 if( diff == 3 && data[pos + 2] == 't' )
00096 rep += '<';
00097 else
00098 return DecodeInvalid;
00099 break;
00100 case 'g':
00101 if( diff == 3 && data[pos + 2] == 't' )
00102 rep += '>';
00103 else
00104 return DecodeInvalid;
00105 break;
00106 case 'a':
00107 if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
00108 rep += '\'';
00109 else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
00110 rep += '&';
00111 else
00112 return DecodeInvalid;
00113 break;
00114 case 'q':
00115 if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
00116 rep += '"';
00117 else
00118 return DecodeInvalid;
00119 break;
00120 default:
00121 return DecodeInvalid;
00122 }
00123
00124 switch( m_state )
00125 {
00126 case TagInside:
00127 m_cdata += rep;
00128 break;
00129 case TagAttributeValue:
00130 m_value += rep;
00131 break;
00132 default:
00133 break;
00134 }
00135 pos += diff;
00136 return DecodeValid;
00137 }
00138
00139 Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
00140 const std::string& needle )
00141 {
00142 if( pos + needle.length() <= data.length() )
00143 {
00144 if( !data.compare( pos, needle.length(), needle ) )
00145 {
00146 pos += needle.length() - 1;
00147 return ForwardFound;
00148 }
00149 else
00150 {
00151 return ForwardNotFound;
00152 }
00153 }
00154 else
00155 {
00156 m_backBuffer = data.substr( pos );
00157 return ForwardInsufficientSize;
00158 }
00159 }
00160
00161 int Parser::feed( std::string& data )
00162 {
00163 if( !m_backBuffer.empty() )
00164 {
00165 data.insert( 0, m_backBuffer );
00166 m_backBuffer = EmptyString;
00167 }
00168
00169 std::string::size_type count = data.length();
00170 for( std::string::size_type i = 0; i < count; ++i )
00171 {
00172 const unsigned char c = data[i];
00173
00174
00175 if( !isValid( c ) )
00176 {
00177 cleanup();
00178 return static_cast<int>( i );
00179 }
00180
00181 switch( m_state )
00182 {
00183 case Initial:
00184
00185 if( isWhitespace( c ) )
00186 break;
00187
00188 switch( c )
00189 {
00190 case '<':
00191 m_state = TagOpening;
00192 break;
00193 default:
00194 cleanup();
00195 return static_cast<int>( i );
00196 break;
00197 }
00198 break;
00199 case InterTag:
00200
00201 m_tag = EmptyString;
00202 if( isWhitespace( c ) )
00203 break;
00204
00205 switch( c )
00206 {
00207 case '<':
00208 m_state = TagOpening;
00209 break;
00210 case '>':
00211 default:
00212 if( m_current )
00213 {
00214 m_cdata += c;
00215 m_state = TagInside;
00216 }
00217 break;
00218 }
00219 break;
00220 case TagOpening:
00221
00222 if( isWhitespace( c ) )
00223 break;
00224
00225 switch( c )
00226 {
00227 case '<':
00228 case '>':
00229 case '&':
00230 cleanup();
00231 return static_cast<int>( i );
00232 break;
00233 case '/':
00234 m_state = TagClosingSlash;
00235 break;
00236 case '?':
00237 m_state = TagNameCollect;
00238 m_preamble = 1;
00239 break;
00240 case '!':
00241 switch( forwardScan( i, data, "![CDATA[" ) )
00242 {
00243 case ForwardFound:
00244 m_state = TagCDATASection;
00245 break;
00246 case ForwardNotFound:
00247 cleanup();
00248 return static_cast<int>( i );
00249 case ForwardInsufficientSize:
00250 return -1;
00251 }
00252 break;
00253 default:
00254 m_tag += c;
00255 m_state = TagNameCollect;
00256 break;
00257 }
00258 break;
00259 case TagCDATASection:
00260 switch( c )
00261 {
00262 case ']':
00263 switch( forwardScan( i, data, "]]>" ) )
00264 {
00265 case ForwardFound:
00266 m_state = TagInside;
00267 break;
00268 case ForwardNotFound:
00269 m_cdata += c;
00270 break;
00271 case ForwardInsufficientSize:
00272 return -1;
00273 }
00274 break;
00275 default:
00276 m_cdata += c;
00277 break;
00278 }
00279 break;
00280 case TagNameCollect:
00281
00282 if( isWhitespace( c ) )
00283 {
00284 m_state = TagNameComplete;
00285 break;
00286 }
00287
00288 switch( c )
00289 {
00290 case '<':
00291 case '?':
00292 case '!':
00293 case '&':
00294 cleanup();
00295 return static_cast<int>( i );
00296 break;
00297 case '/':
00298 m_state = TagOpeningSlash;
00299 break;
00300 case '>':
00301 addTag();
00302 m_state = TagInside;
00303 break;
00304 case ':':
00305 if( !m_haveTagPrefix )
00306 {
00307 m_haveTagPrefix = true;
00308 m_tagPrefix = m_tag;
00309 m_tag = EmptyString;
00310 }
00311 else
00312 {
00313 cleanup();
00314 return static_cast<int>( i );
00315 }
00316 break;
00317 default:
00318 m_tag += c;
00319 break;
00320 }
00321 break;
00322 case TagInside:
00323
00324 m_tag = EmptyString;
00325 switch( c )
00326 {
00327 case '<':
00328 addCData();
00329 m_state = TagOpening;
00330 break;
00331 case '&':
00332
00333 switch( decode( i, data ) )
00334 {
00335 case DecodeValid:
00336 break;
00337 case DecodeInvalid:
00338 cleanup();
00339 return static_cast<int>( i );
00340 case DecodeInsufficient:
00341 return -1;
00342 }
00343 break;
00344 default:
00345 m_cdata += c;
00346 break;
00347 }
00348 break;
00349 case TagOpeningSlash:
00350
00351 if( isWhitespace( c ) )
00352 break;
00353
00354 if( c == '>' )
00355 {
00356 addTag();
00357 if( !closeTag() )
00358 {
00359
00360 cleanup();
00361 return static_cast<int>( i );
00362 }
00363
00364 m_state = InterTag;
00365 }
00366 else
00367 {
00368 cleanup();
00369 return static_cast<int>( i );
00370 }
00371 break;
00372 case TagClosingSlash:
00373
00374 if( isWhitespace( c ) )
00375 break;
00376
00377 switch( c )
00378 {
00379 case '>':
00380 case '<':
00381 case '/':
00382 cleanup();
00383 return static_cast<int>( i );
00384 break;
00385 default:
00386 m_tag += c;
00387 m_state = TagClosing;
00388 break;
00389 }
00390 break;
00391 case TagClosing:
00392
00393 switch( c )
00394 {
00395 case '<':
00396 case '/':
00397 case '!':
00398 case '?':
00399 case '&':
00400 cleanup();
00401 return static_cast<int>( i );
00402 break;
00403 case ':':
00404 if( !m_haveTagPrefix )
00405 {
00406 m_haveTagPrefix = true;
00407 m_tagPrefix = m_tag;
00408 m_tag = EmptyString;
00409 }
00410 else
00411 {
00412 cleanup();
00413 return static_cast<int>( i );
00414 }
00415 break;
00416 case '>':
00417 if( !closeTag() )
00418 {
00419
00420 cleanup();
00421 return static_cast<int>( i );
00422 }
00423 m_state = InterTag;
00424 break;
00425 default:
00426 m_tag += c;
00427 break;
00428 }
00429 break;
00430 case TagNameComplete:
00431
00432 if( isWhitespace( c ) )
00433 break;
00434
00435 switch( c )
00436 {
00437 case '<':
00438 case '!':
00439 case '&':
00440 cleanup();
00441 return static_cast<int>( i );
00442 break;
00443 case '/':
00444 m_state = TagOpeningSlash;
00445 break;
00446 case '>':
00447 if( m_preamble == 1 )
00448 {
00449 cleanup();
00450 return static_cast<int>( i );
00451 }
00452 m_state = TagInside;
00453 addTag();
00454 break;
00455 case '?':
00456 if( m_preamble == 1 )
00457 m_preamble = 2;
00458 else
00459 {
00460 cleanup();
00461 return static_cast<int>( i );
00462 }
00463 break;
00464 default:
00465 m_attrib += c;
00466 m_state = TagAttribute;
00467 break;
00468 }
00469 break;
00470 case TagAttribute:
00471
00472 if( isWhitespace( c ) )
00473 {
00474 m_state = TagAttributeComplete;
00475 break;
00476 }
00477
00478 switch( c )
00479 {
00480 case '<':
00481 case '/':
00482 case '>':
00483 case '?':
00484 case '!':
00485 case '&':
00486 cleanup();
00487 return static_cast<int>( i );
00488 break;
00489 case '=':
00490 m_state = TagAttributeEqual;
00491 break;
00492 case ':':
00493 if( !m_haveAttribPrefix && m_attrib != XMLNS )
00494 {
00495 m_haveAttribPrefix = true;
00496 m_attribPrefix = m_attrib;
00497 m_attrib = EmptyString;
00498 }
00499 else if( m_attrib == XMLNS )
00500 {
00501 m_attribIsXmlns = true;
00502 m_attrib = EmptyString;
00503 }
00504 else
00505 {
00506 cleanup();
00507 return static_cast<int>( i );
00508 }
00509 break;
00510 default:
00511 m_attrib += c;
00512 }
00513 break;
00514 case TagAttributeComplete:
00515
00516 if( isWhitespace( c ) )
00517 break;
00518
00519 switch( c )
00520 {
00521 case '=':
00522 m_state = TagAttributeEqual;
00523 break;
00524 default:
00525 cleanup();
00526 return static_cast<int>( i );
00527 break;
00528 }
00529 break;
00530 case TagAttributeEqual:
00531
00532 if( isWhitespace( c ) )
00533 break;
00534
00535 switch( c )
00536 {
00537 case '"':
00538 m_quote = true;
00539 case '\'':
00540 m_state = TagAttributeValue;
00541 break;
00542 default:
00543 cleanup();
00544 return static_cast<int>( i );
00545 break;
00546 }
00547 break;
00548 case TagAttributeValue:
00549
00550 switch( c )
00551 {
00552 case '<':
00553 cleanup();
00554 return static_cast<int>( i );
00555 break;
00556 case '\'':
00557 if( m_quote )
00558 {
00559 m_value += c;
00560 break;
00561 }
00562 case '"':
00563 addAttribute();
00564 m_state = TagNameAlmostComplete;
00565 m_quote = false;
00566 break;
00567 case '&':
00568
00569 switch( decode( i, data ) )
00570 {
00571 case DecodeValid:
00572 break;
00573 case DecodeInvalid:
00574 cleanup();
00575 return static_cast<int>( i );
00576 case DecodeInsufficient:
00577 return -1;
00578 }
00579 break;
00580 case '>':
00581 default:
00582 m_value += c;
00583 }
00584 break;
00585 case TagNameAlmostComplete:
00586
00587 if( isWhitespace( c ) )
00588 {
00589 m_state = TagNameComplete;
00590 break;
00591 }
00592
00593 switch( c )
00594 {
00595 case '/':
00596 m_state = TagOpeningSlash;
00597 break;
00598 case '>':
00599 if( m_preamble == 1 )
00600 {
00601 cleanup();
00602 return static_cast<int>( i );
00603 }
00604 m_state = TagInside;
00605 addTag();
00606 break;
00607 case '?':
00608 if( m_preamble == 1 )
00609 m_preamble = 2;
00610 else
00611 {
00612 cleanup();
00613 return static_cast<int>( i );
00614 }
00615 break;
00616 default:
00617 cleanup();
00618 return static_cast<int>( i );
00619 break;
00620 }
00621 break;
00622 default:
00623
00624 break;
00625 }
00626
00627 }
00628
00629 return -1;
00630 }
00631
00632 void Parser::addTag()
00633 {
00634 if( !m_root )
00635 {
00636
00637 m_root = new Tag( m_tag );
00638 m_current = m_root;
00639 }
00640 else
00641 {
00642
00643 m_current = new Tag( m_current, m_tag );
00644 }
00645
00646 if( m_haveTagPrefix )
00647 {
00648
00649 m_current->setPrefix( m_tagPrefix );
00650 m_haveTagPrefix = false;
00651 }
00652
00653 if( m_attribs.size() )
00654 {
00655 m_current->setAttributes( m_attribs );
00656
00657 m_attribs.clear();
00658 }
00659
00660 if( m_xmlnss )
00661 {
00662
00663
00664
00665
00666 m_current->setXmlns( m_xmlnss );
00667 m_xmlnss = 0;
00668 }
00669
00670 m_current->setXmlns( m_xmlns );
00671 m_xmlns = EmptyString;
00672
00673 if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
00674 {
00675 streamEvent( m_root );
00676 cleanup( m_deleteRoot );
00677 return;
00678 }
00679
00680
00681
00682 if( m_root && m_root == m_current && m_tagPrefix == "stream" )
00683 m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
00684
00685 if( m_tag == "xml" && m_preamble == 2 )
00686 cleanup();
00687 }
00688
00689 void Parser::addAttribute()
00690 {
00691 Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
00692 if( m_attribIsXmlns )
00693 {
00694 if( !m_xmlnss )
00695 m_xmlnss = new StringMap();
00696
00697 (*m_xmlnss)[m_attrib] = m_value;
00698 attr->setPrefix( XMLNS );
00699 }
00700 else
00701 {
00702
00703 if( !m_attribPrefix.empty() )
00704 attr->setPrefix( m_attribPrefix );
00705 if( m_attrib == XMLNS )
00706 m_xmlns = m_value;
00707 }
00708 m_attribs.push_back( attr );
00709 m_attrib = EmptyString;
00710 m_value = EmptyString;
00711 m_attribPrefix = EmptyString;
00712 m_haveAttribPrefix = false;
00713 m_attribIsXmlns = false;
00714 }
00715
00716 void Parser::addCData()
00717 {
00718 if( m_current && !m_cdata.empty() )
00719 {
00720 m_current->addCData( m_cdata );
00721
00722
00723 m_cdata = EmptyString;
00724 }
00725 }
00726
00727 bool Parser::closeTag()
00728 {
00729
00730
00731 if( m_tag == "stream" && m_tagPrefix == "stream" )
00732 return true;
00733
00734 if( !m_current || m_current->name() != m_tag
00735 || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
00736 {
00737
00738
00739
00740 return false;
00741 }
00742
00743
00744
00745
00746 m_tagPrefix = EmptyString;
00747 m_haveTagPrefix = false;
00748
00749 if( m_current->parent() )
00750 m_current = m_current->parent();
00751 else
00752 {
00753
00754 streamEvent( m_root );
00755 cleanup( m_deleteRoot );
00756 }
00757
00758 return true;
00759 }
00760
00761 void Parser::cleanup( bool deleteRoot )
00762 {
00763 if( deleteRoot )
00764 delete m_root;
00765 m_root = 0;
00766 m_current = 0;
00767 delete m_xmlnss;
00768 m_xmlnss = 0;
00769 m_cdata = EmptyString;
00770 m_tag = EmptyString;
00771 m_attrib = EmptyString;
00772 m_attribPrefix = EmptyString;
00773 m_tagPrefix = EmptyString;
00774 m_haveAttribPrefix = false;
00775 m_haveTagPrefix = false;
00776 m_value = EmptyString;
00777 m_xmlns = EmptyString;
00778 util::clearList( m_attribs );
00779 m_attribs.clear();
00780 m_state = Initial;
00781 m_preamble = 0;
00782 }
00783
00784 bool Parser::isValid( unsigned char c )
00785 {
00786 return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
00787 }
00788
00789 bool Parser::isWhitespace( unsigned char c )
00790 {
00791 return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
00792 }
00793
00794 void Parser::streamEvent( Tag* tag )
00795 {
00796 if( m_tagHandler )
00797 m_tagHandler->handleTag( tag );
00798 }
00799
00800 }