gloox 1.0
|
00001 /* 00002 Copyright (c) 2004-2009 by Jakob Schroeter <js@camaya.net> 00003 This file is part of the gloox library. http://camaya.net/gloox 00004 00005 This software is distributed under a license. The full license 00006 agreement can be found in the file LICENSE in this distribution. 00007 This software may not be copied, modified, sold or distributed 00008 other than expressed in the named license agreement. 00009 00010 This software is distributed without any warranty. 00011 */ 00012 00013 #include "gloox.h" 00014 #include "util.h" 00015 #include "parser.h" 00016 00017 #include <cstdlib> 00018 00019 namespace gloox 00020 { 00021 00022 Parser::Parser( TagHandler* ph, bool deleteRoot ) 00023 : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ), 00024 m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ), 00025 m_attribIsXmlns( false ), m_deleteRoot( deleteRoot ) 00026 { 00027 } 00028 00029 Parser::~Parser() 00030 { 00031 delete m_root; 00032 delete m_xmlnss; 00033 } 00034 00035 Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data ) 00036 { 00037 std::string::size_type p = data.find( ';', pos ); 00038 std::string::size_type diff = p - pos; 00039 00040 if( p == std::string::npos ) 00041 { 00042 m_backBuffer = data.substr( pos ); 00043 return DecodeInsufficient; 00044 } 00045 00046 if( diff < 3 || diff > 9 ) 00047 return DecodeInvalid; 00048 00049 std::string rep; 00050 switch( data[pos + 1] ) 00051 { 00052 case '#': 00053 { 00054 int base = 10; 00055 int idx = 2; 00056 00057 if( data[pos + 2] == 'x' || data[pos + 2] == 'X' ) 00058 { 00059 base = 16; 00060 idx = 3; 00061 } 00062 00063 char* end; 00064 const long int val = std::strtol( data.data() + pos + idx, &end, base ); 00065 if( *end != ';' || val < 0 ) 00066 return DecodeInvalid; 00067 00068 if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) ) 00069 { 00070 rep += char( val ); 00071 } 00072 else if( val >= 0x80 && val <= 0x7FF ) 00073 { 00074 rep += char( 192 + ( val >> 6 ) ); 00075 rep += char( 128 + ( val % 64 ) ); 00076 } 00077 else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) ) 00078 { 00079 rep += char( 224 + ( val >> 12 ) ); 00080 rep += char( 128 + ( ( val >> 6 ) % 64 ) ); 00081 rep += char( 128 + ( val % 64 ) ); 00082 } 00083 else if( val >= 0x100000 && val < 0x10FFFF ) 00084 { 00085 rep += char( 240 + ( val >> 18 ) ); 00086 rep += char( 128 + ( ( val >> 12 ) % 64 ) ); 00087 rep += char( 128 + ( ( val >> 6 ) % 64 ) ); 00088 rep += char( 128 + ( val % 64 ) ); 00089 } 00090 else 00091 return DecodeInvalid; 00092 } 00093 break; 00094 case 'l': 00095 if( diff == 3 && data[pos + 2] == 't' ) 00096 rep += '<'; 00097 else 00098 return DecodeInvalid; 00099 break; 00100 case 'g': 00101 if( diff == 3 && data[pos + 2] == 't' ) 00102 rep += '>'; 00103 else 00104 return DecodeInvalid; 00105 break; 00106 case 'a': 00107 if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) ) 00108 rep += '\''; 00109 else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) ) 00110 rep += '&'; 00111 else 00112 return DecodeInvalid; 00113 break; 00114 case 'q': 00115 if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) ) 00116 rep += '"'; 00117 else 00118 return DecodeInvalid; 00119 break; 00120 default: 00121 return DecodeInvalid; 00122 } 00123 00124 switch( m_state ) 00125 { 00126 case TagInside: 00127 m_cdata += rep; 00128 break; 00129 case TagAttributeValue: 00130 m_value += rep; 00131 break; 00132 default: 00133 break; 00134 } 00135 pos += diff; 00136 return DecodeValid; 00137 } 00138 00139 Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data, 00140 const std::string& needle ) 00141 { 00142 if( pos + needle.length() <= data.length() ) 00143 { 00144 if( !data.compare( pos, needle.length(), needle ) ) 00145 { 00146 pos += needle.length() - 1; 00147 return ForwardFound; 00148 } 00149 else 00150 { 00151 return ForwardNotFound; 00152 } 00153 } 00154 else 00155 { 00156 m_backBuffer = data.substr( pos ); 00157 return ForwardInsufficientSize; 00158 } 00159 } 00160 00161 int Parser::feed( std::string& data ) 00162 { 00163 if( !m_backBuffer.empty() ) 00164 { 00165 data.insert( 0, m_backBuffer ); 00166 m_backBuffer = EmptyString; 00167 } 00168 00169 std::string::size_type count = data.length(); 00170 for( std::string::size_type i = 0; i < count; ++i ) 00171 { 00172 const unsigned char c = data[i]; 00173 // printf( "found char: %c, ", c ); 00174 00175 if( !isValid( c ) ) 00176 { 00177 cleanup(); 00178 return static_cast<int>( i ); 00179 } 00180 00181 switch( m_state ) 00182 { 00183 case Initial: 00184 // printf( "Initial: %c\n", c ); 00185 if( isWhitespace( c ) ) 00186 break; 00187 00188 switch( c ) 00189 { 00190 case '<': 00191 m_state = TagOpening; 00192 break; 00193 default: 00194 cleanup(); 00195 return static_cast<int>( i ); 00196 break; 00197 } 00198 break; 00199 case InterTag: 00200 // printf( "InterTag: %c\n", c ); 00201 m_tag = EmptyString; 00202 if( isWhitespace( c ) ) 00203 break; 00204 00205 switch( c ) 00206 { 00207 case '<': 00208 m_state = TagOpening; 00209 break; 00210 case '>': 00211 default: 00212 if( m_current ) 00213 { 00214 m_cdata += c; 00215 m_state = TagInside; 00216 } 00217 break; 00218 } 00219 break; 00220 case TagOpening: // opening '<' has been found before 00221 // printf( "TagOpening: %c\n", c ); 00222 if( isWhitespace( c ) ) 00223 break; 00224 00225 switch( c ) 00226 { 00227 case '<': 00228 case '>': 00229 case '&': 00230 cleanup(); 00231 return static_cast<int>( i ); 00232 break; 00233 case '/': 00234 m_state = TagClosingSlash; 00235 break; 00236 case '?': 00237 m_state = TagNameCollect; 00238 m_preamble = 1; 00239 break; 00240 case '!': 00241 switch( forwardScan( i, data, "![CDATA[" ) ) 00242 { 00243 case ForwardFound: 00244 m_state = TagCDATASection; 00245 break; 00246 case ForwardNotFound: 00247 cleanup(); 00248 return static_cast<int>( i ); 00249 case ForwardInsufficientSize: 00250 return -1; 00251 } 00252 break; 00253 default: 00254 m_tag += c; 00255 m_state = TagNameCollect; 00256 break; 00257 } 00258 break; 00259 case TagCDATASection: 00260 switch( c ) 00261 { 00262 case ']': 00263 switch( forwardScan( i, data, "]]>" ) ) 00264 { 00265 case ForwardFound: 00266 m_state = TagInside; 00267 break; 00268 case ForwardNotFound: 00269 m_cdata += c; 00270 break; 00271 case ForwardInsufficientSize: 00272 return -1; 00273 } 00274 break; 00275 default: 00276 m_cdata += c; 00277 break; 00278 } 00279 break; 00280 case TagNameCollect: // we're collecting the tag's name, we have at least one octet already 00281 // printf( "TagNameCollect: %c\n", c ); 00282 if( isWhitespace( c ) ) 00283 { 00284 m_state = TagNameComplete; 00285 break; 00286 } 00287 00288 switch( c ) 00289 { 00290 case '<': 00291 case '?': 00292 case '!': 00293 case '&': 00294 cleanup(); 00295 return static_cast<int>( i ); 00296 break; 00297 case '/': 00298 m_state = TagOpeningSlash; 00299 break; 00300 case '>': 00301 addTag(); 00302 m_state = TagInside; 00303 break; 00304 case ':': 00305 if( !m_haveTagPrefix ) 00306 { 00307 m_haveTagPrefix = true; 00308 m_tagPrefix = m_tag; 00309 m_tag = EmptyString; 00310 } 00311 else 00312 { 00313 cleanup(); 00314 return static_cast<int>( i ); 00315 } 00316 break; 00317 default: 00318 m_tag += c; 00319 break; 00320 } 00321 break; 00322 case TagInside: // we're inside a tag, expecting a child tag or cdata 00323 // printf( "TagInside: %c\n", c ); 00324 m_tag = EmptyString; 00325 switch( c ) 00326 { 00327 case '<': 00328 addCData(); 00329 m_state = TagOpening; 00330 break; 00331 case '&': 00332 // printf( "TagInside, calling decode\n" ); 00333 switch( decode( i, data ) ) 00334 { 00335 case DecodeValid: 00336 break; 00337 case DecodeInvalid: 00338 cleanup(); 00339 return static_cast<int>( i ); 00340 case DecodeInsufficient: 00341 return -1; 00342 } 00343 break; 00344 default: 00345 m_cdata += c; 00346 break; 00347 } 00348 break; 00349 case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag 00350 // printf( "TagOpeningSlash: %c\n", c ); 00351 if( isWhitespace( c ) ) 00352 break; 00353 00354 if( c == '>' ) 00355 { 00356 addTag(); 00357 if( !closeTag() ) 00358 { 00359 // printf( "noipe, here\n" ); 00360 cleanup(); 00361 return static_cast<int>( i ); 00362 } 00363 00364 m_state = InterTag; 00365 } 00366 else 00367 { 00368 cleanup(); 00369 return static_cast<int>( i ); 00370 } 00371 break; 00372 case TagClosingSlash: // we have found the '/' of a closing tag 00373 // printf( "TagClosingSlash: %c\n", c ); 00374 if( isWhitespace( c ) ) 00375 break; 00376 00377 switch( c ) 00378 { 00379 case '>': 00380 case '<': 00381 case '/': 00382 cleanup(); 00383 return static_cast<int>( i ); 00384 break; 00385 default: 00386 m_tag += c; 00387 m_state = TagClosing; 00388 break; 00389 } 00390 break; 00391 case TagClosing: // we're collecting the name of a closing tag 00392 // printf( "TagClosing: %c\n", c ); 00393 switch( c ) 00394 { 00395 case '<': 00396 case '/': 00397 case '!': 00398 case '?': 00399 case '&': 00400 cleanup(); 00401 return static_cast<int>( i ); 00402 break; 00403 case ':': 00404 if( !m_haveTagPrefix ) 00405 { 00406 m_haveTagPrefix = true; 00407 m_tagPrefix = m_tag; 00408 m_tag = EmptyString; 00409 } 00410 else 00411 { 00412 cleanup(); 00413 return static_cast<int>( i ); 00414 } 00415 break; 00416 case '>': 00417 if( !closeTag() ) 00418 { 00419 // printf( "here\n" ); 00420 cleanup(); 00421 return static_cast<int>( i ); 00422 } 00423 m_state = InterTag; 00424 break; 00425 default: 00426 m_tag += c; 00427 break; 00428 } 00429 break; 00430 case TagNameComplete: // a tag name is complete, expect tag close or attribs 00431 // printf( "TagNameComplete: %c\n", c ); 00432 if( isWhitespace( c ) ) 00433 break; 00434 00435 switch( c ) 00436 { 00437 case '<': 00438 case '!': 00439 case '&': 00440 cleanup(); 00441 return static_cast<int>( i ); 00442 break; 00443 case '/': 00444 m_state = TagOpeningSlash; 00445 break; 00446 case '>': 00447 if( m_preamble == 1 ) 00448 { 00449 cleanup(); 00450 return static_cast<int>( i ); 00451 } 00452 m_state = TagInside; 00453 addTag(); 00454 break; 00455 case '?': 00456 if( m_preamble == 1 ) 00457 m_preamble = 2; 00458 else 00459 { 00460 cleanup(); 00461 return static_cast<int>( i ); 00462 } 00463 break; 00464 default: 00465 m_attrib += c; 00466 m_state = TagAttribute; 00467 break; 00468 } 00469 break; 00470 case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet 00471 // printf( "TagAttribute: %c\n", c ); 00472 if( isWhitespace( c ) ) 00473 { 00474 m_state = TagAttributeComplete; 00475 break; 00476 } 00477 00478 switch( c ) 00479 { 00480 case '<': 00481 case '/': 00482 case '>': 00483 case '?': 00484 case '!': 00485 case '&': 00486 cleanup(); 00487 return static_cast<int>( i ); 00488 break; 00489 case '=': 00490 m_state = TagAttributeEqual; 00491 break; 00492 case ':': 00493 if( !m_haveAttribPrefix && m_attrib != XMLNS ) 00494 { 00495 m_haveAttribPrefix = true; 00496 m_attribPrefix = m_attrib; 00497 m_attrib = EmptyString; 00498 } 00499 else if( m_attrib == XMLNS ) 00500 { 00501 m_attribIsXmlns = true; 00502 m_attrib = EmptyString; 00503 } 00504 else 00505 { 00506 cleanup(); 00507 return static_cast<int>( i ); 00508 } 00509 break; 00510 default: 00511 m_attrib += c; 00512 } 00513 break; 00514 case TagAttributeComplete: // we're expecting an equals sign or ws 00515 // printf( "TagAttributeComplete: %c\n", c ); 00516 if( isWhitespace( c ) ) 00517 break; 00518 00519 switch( c ) 00520 { 00521 case '=': 00522 m_state = TagAttributeEqual; 00523 break; 00524 default: 00525 cleanup(); 00526 return static_cast<int>( i ); 00527 break; 00528 } 00529 break; 00530 case TagAttributeEqual: // we have found an equals sign 00531 // printf( "TagAttributeEqual: %c\n", c ); 00532 if( isWhitespace( c ) ) 00533 break; 00534 00535 switch( c ) 00536 { 00537 case '"': 00538 m_quote = true; 00539 case '\'': 00540 m_state = TagAttributeValue; 00541 break; 00542 default: 00543 cleanup(); 00544 return static_cast<int>( i ); 00545 break; 00546 } 00547 break; 00548 case TagAttributeValue: // we're expecting value data 00549 // printf( "TagValue: %c\n", c ); 00550 switch( c ) 00551 { 00552 case '<': 00553 cleanup(); 00554 return static_cast<int>( i ); 00555 break; 00556 case '\'': 00557 if( m_quote ) 00558 { 00559 m_value += c; 00560 break; 00561 } 00562 case '"': 00563 addAttribute(); 00564 m_state = TagNameAlmostComplete; 00565 m_quote = false; 00566 break; 00567 case '&': 00568 // printf( "TagAttributeValue, calling decode\n" ); 00569 switch( decode( i, data ) ) 00570 { 00571 case DecodeValid: 00572 break; 00573 case DecodeInvalid: 00574 cleanup(); 00575 return static_cast<int>( i ); 00576 case DecodeInsufficient: 00577 return -1; 00578 } 00579 break; 00580 case '>': 00581 default: 00582 m_value += c; 00583 } 00584 break; 00585 case TagNameAlmostComplete: 00586 // printf( "TagAttributeEqual: %c\n", c ); 00587 if( isWhitespace( c ) ) 00588 { 00589 m_state = TagNameComplete; 00590 break; 00591 } 00592 00593 switch( c ) 00594 { 00595 case '/': 00596 m_state = TagOpeningSlash; 00597 break; 00598 case '>': 00599 if( m_preamble == 1 ) 00600 { 00601 cleanup(); 00602 return static_cast<int>( i ); 00603 } 00604 m_state = TagInside; 00605 addTag(); 00606 break; 00607 case '?': 00608 if( m_preamble == 1 ) 00609 m_preamble = 2; 00610 else 00611 { 00612 cleanup(); 00613 return static_cast<int>( i ); 00614 } 00615 break; 00616 default: 00617 cleanup(); 00618 return static_cast<int>( i ); 00619 break; 00620 } 00621 break; 00622 default: 00623 // printf( "default action!?\n" ); 00624 break; 00625 } 00626 // printf( "parser state: %d\n", m_state ); 00627 } 00628 00629 return -1; 00630 } 00631 00632 void Parser::addTag() 00633 { 00634 if( !m_root ) 00635 { 00636 // printf( "created Tag named %s, ", m_tag.c_str() ); 00637 m_root = new Tag( m_tag ); 00638 m_current = m_root; 00639 } 00640 else 00641 { 00642 // printf( "created Tag named %s, ", m_tag.c_str() ); 00643 m_current = new Tag( m_current, m_tag ); 00644 } 00645 00646 if( m_haveTagPrefix ) 00647 { 00648 // printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() ); 00649 m_current->setPrefix( m_tagPrefix ); 00650 m_haveTagPrefix = false; 00651 } 00652 00653 if( m_attribs.size() ) 00654 { 00655 m_current->setAttributes( m_attribs ); 00656 // printf( "added %d attributes, ", m_attribs.size() ); 00657 m_attribs.clear(); 00658 } 00659 00660 if( m_xmlnss ) 00661 { 00662 // printf( "have ns decls\n" ); 00663 // StringMap::const_iterator it = m_xmlnss->begin(); 00664 // for( ; it != m_xmlnss->end(); ++it ) 00665 // printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() ); 00666 m_current->setXmlns( m_xmlnss ); 00667 m_xmlnss = 0; 00668 } 00669 00670 m_current->setXmlns( m_xmlns ); 00671 m_xmlns = EmptyString; 00672 00673 if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM ) 00674 { 00675 streamEvent( m_root ); 00676 cleanup( m_deleteRoot ); 00677 return; 00678 } 00679 // else 00680 // printf( "%s, ", m_root->xml().c_str() ); 00681 00682 if( m_root && m_root == m_current && m_tagPrefix == "stream" ) 00683 m_root->setXmlns( XMLNS_STREAM, m_tagPrefix ); 00684 00685 if( m_tag == "xml" && m_preamble == 2 ) 00686 cleanup(); 00687 } 00688 00689 void Parser::addAttribute() 00690 { 00691 Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );; 00692 if( m_attribIsXmlns ) 00693 { 00694 if( !m_xmlnss ) 00695 m_xmlnss = new StringMap(); 00696 00697 (*m_xmlnss)[m_attrib] = m_value; 00698 attr->setPrefix( XMLNS ); 00699 } 00700 else 00701 { 00702 // printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() ); 00703 if( !m_attribPrefix.empty() ) 00704 attr->setPrefix( m_attribPrefix ); 00705 if( m_attrib == XMLNS ) 00706 m_xmlns = m_value; 00707 } 00708 m_attribs.push_back( attr ); 00709 m_attrib = EmptyString; 00710 m_value = EmptyString; 00711 m_attribPrefix = EmptyString; 00712 m_haveAttribPrefix = false; 00713 m_attribIsXmlns = false; 00714 } 00715 00716 void Parser::addCData() 00717 { 00718 if( m_current && !m_cdata.empty() ) 00719 { 00720 m_current->addCData( m_cdata ); 00721 // printf( "added cdata %s to %s: %s\n", 00722 // m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() ); 00723 m_cdata = EmptyString; 00724 } 00725 } 00726 00727 bool Parser::closeTag() 00728 { 00729 // printf( "about to close, " ); 00730 00731 if( m_tag == "stream" && m_tagPrefix == "stream" ) 00732 return true; 00733 00734 if( !m_current || m_current->name() != m_tag 00735 || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) ) 00736 { 00737 // printf( "current xml: %s\n", m_current->xml().c_str() ); 00738 // printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() ); 00739 // printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() ); 00740 return false; 00741 } 00742 00743 // printf( "m_current: %s, ", m_current->name().c_str() ); 00744 // printf( "m_tag: %s, ", m_tag.c_str() ); 00745 00746 m_tagPrefix = EmptyString; 00747 m_haveTagPrefix = false; 00748 00749 if( m_current->parent() ) 00750 m_current = m_current->parent(); 00751 else 00752 { 00753 // printf( "pushing upstream\n" ); 00754 streamEvent( m_root ); 00755 cleanup( m_deleteRoot ); 00756 } 00757 00758 return true; 00759 } 00760 00761 void Parser::cleanup( bool deleteRoot ) 00762 { 00763 if( deleteRoot ) 00764 delete m_root; 00765 m_root = 0; 00766 m_current = 0; 00767 delete m_xmlnss; 00768 m_xmlnss = 0; 00769 m_cdata = EmptyString; 00770 m_tag = EmptyString; 00771 m_attrib = EmptyString; 00772 m_attribPrefix = EmptyString; 00773 m_tagPrefix = EmptyString; 00774 m_haveAttribPrefix = false; 00775 m_haveTagPrefix = false; 00776 m_value = EmptyString; 00777 m_xmlns = EmptyString; 00778 util::clearList( m_attribs ); 00779 m_attribs.clear(); 00780 m_state = Initial; 00781 m_preamble = 0; 00782 } 00783 00784 bool Parser::isValid( unsigned char c ) 00785 { 00786 return ( c != 0xc0 || c != 0xc1 || c < 0xf5 ); 00787 } 00788 00789 bool Parser::isWhitespace( unsigned char c ) 00790 { 00791 return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 ); 00792 } 00793 00794 void Parser::streamEvent( Tag* tag ) 00795 { 00796 if( m_tagHandler ) 00797 m_tagHandler->handleTag( tag ); 00798 } 00799 00800 }