nux-1.14.0
|
00001 /* 00002 * Copyright 2010 Inalogic® Inc. 00003 * 00004 * This program is free software: you can redistribute it and/or modify it 00005 * under the terms of the GNU Lesser General Public License, as 00006 * published by the Free Software Foundation; either version 2.1 or 3.0 00007 * of the License. 00008 * 00009 * This program is distributed in the hope that it will be useful, but 00010 * WITHOUT ANY WARRANTY; without even the implied warranties of 00011 * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR 00012 * PURPOSE. See the applicable version of the GNU Lesser General Public 00013 * License for more details. 00014 * 00015 * You should have received a copy of both the GNU Lesser General Public 00016 * License along with this program. If not, see <http://www.gnu.org/licenses/> 00017 * 00018 * Authored by: Jay Taoko <jaytaoko@inalogic.com> 00019 * 00020 */ 00021 00022 00023 /* 00024 www.sourceforge.net/projects/tinyxml 00025 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com) 00026 00027 This software is provided 'as-is', without any express or implied 00028 warranty. In no event will the authors be held liable for any 00029 damages arising from the use of this software. 00030 00031 Permission is granted to anyone to use this software for any 00032 purpose, including commercial applications, and to alter it and 00033 redistribute it freely, subject to the following restrictions: 00034 00035 1. The origin of this software must not be misrepresented; you must 00036 not claim that you wrote the original software. If you use this 00037 software in a product, an acknowledgment in the product documentation 00038 would be appreciated but is not required. 00039 00040 2. Altered source versions must be plainly marked as such, and 00041 must not be misrepresented as being the original software. 00042 00043 3. This notice may not be removed or altered from any source 00044 distribution. 00045 */ 00046 00047 #include <ctype.h> 00048 #include <stddef.h> 00049 00050 #include "tinyxml.h" 00051 00052 //#define DEBUG_PARSER 00053 #if defined( DEBUG_PARSER ) 00054 # if defined( DEBUG ) && defined( _MSC_VER ) 00055 # include <windows.h> 00056 # define TIXML_LOG OutputDebugString 00057 # else 00058 # define TIXML_LOG printf 00059 # endif 00060 #endif 00061 00062 // Note tha "PutString" hardcodes the same list. This 00063 // is less flexible than it appears. Changing the entries 00064 // or order will break putstring. 00065 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 00066 { 00067 { "&", 5, '&' }, 00068 { "<", 4, '<' }, 00069 { ">", 4, '>' }, 00070 { """, 6, '\"' }, 00071 { "'", 6, '\'' } 00072 }; 00073 00074 // Bunch of unicode info at: 00075 // http://www.unicode.org/faq/utf_bom.html 00076 // Including the basic of this table, which determines the #bytes in the 00077 // sequence from the lead byte. 1 placed for invalid sequences -- 00078 // although the result will be junk, pass it through as much as possible. 00079 // Beware of the non-characters in UTF-8: 00080 // ef bb bf (Microsoft "lead bytes") 00081 // ef bf be 00082 // ef bf bf 00083 00084 const unsigned char TIXML_UTF_LEAD_0 = 0xefU; 00085 const unsigned char TIXML_UTF_LEAD_1 = 0xbbU; 00086 const unsigned char TIXML_UTF_LEAD_2 = 0xbfU; 00087 00088 const int TiXmlBase::utf8ByteTable[256] = 00089 { 00090 // 0 1 2 3 4 5 6 7 8 9 a b c d e f 00091 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 00092 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 00093 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 00094 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 00095 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 00096 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 00097 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 00098 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range 00099 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid 00100 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 00101 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 00102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 00103 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte 00104 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 00105 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte 00106 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid 00107 }; 00108 00109 00110 void TiXmlBase::ConvertUTF32ToUTF8 ( unsigned long input, char *output, int *length ) 00111 { 00112 const unsigned long BYTE_MASK = 0xBF; 00113 const unsigned long BYTE_MARK = 0x80; 00114 const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 00115 00116 if (input < 0x80) 00117 *length = 1; 00118 else if ( input < 0x800 ) 00119 *length = 2; 00120 else if ( input < 0x10000 ) 00121 *length = 3; 00122 else if ( input < 0x200000 ) 00123 *length = 4; 00124 else 00125 { 00126 *length = 0; // This code won't covert this correctly anyway. 00127 return; 00128 } 00129 00130 output += *length; 00131 00132 // Scary scary fall throughs. 00133 switch (*length) 00134 { 00135 case 4: 00136 --output; 00137 *output = (char) ( (input | BYTE_MARK) & BYTE_MASK); 00138 input >>= 6; 00139 case 3: 00140 --output; 00141 *output = (char) ( (input | BYTE_MARK) & BYTE_MASK); 00142 input >>= 6; 00143 case 2: 00144 --output; 00145 *output = (char) ( (input | BYTE_MARK) & BYTE_MASK); 00146 input >>= 6; 00147 case 1: 00148 --output; 00149 *output = (char) (input | FIRST_BYTE_MARK[*length]); 00150 } 00151 } 00152 00153 00154 /*static*/ int TiXmlBase::IsAlpha ( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) 00155 { 00156 // This will only work for low-ascii, everything else is assumed to be a valid 00157 // letter. I'm not sure this is the best approach, but it is quite tricky trying 00158 // to figure out alhabetical vs. not across encoding. So take a very 00159 // conservative approach. 00160 00161 // if ( encoding == TIXML_ENCODING_UTF8 ) 00162 // { 00163 if ( anyByte < 127 ) 00164 return isalpha ( anyByte ); 00165 else 00166 return 1; // What else to do? The unicode set is huge...get the english ones right. 00167 00168 // } 00169 // else 00170 // { 00171 // return isalpha( anyByte ); 00172 // } 00173 } 00174 00175 00176 /*static*/ int TiXmlBase::IsAlphaNum ( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) 00177 { 00178 // This will only work for low-ascii, everything else is assumed to be a valid 00179 // letter. I'm not sure this is the best approach, but it is quite tricky trying 00180 // to figure out alhabetical vs. not across encoding. So take a very 00181 // conservative approach. 00182 00183 // if ( encoding == TIXML_ENCODING_UTF8 ) 00184 // { 00185 if ( anyByte < 127 ) 00186 return isalnum ( anyByte ); 00187 else 00188 return 1; // What else to do? The unicode set is huge...get the english ones right. 00189 00190 // } 00191 // else 00192 // { 00193 // return isalnum( anyByte ); 00194 // } 00195 } 00196 00197 00198 class TiXmlParsingData 00199 { 00200 friend class TiXmlDocument; 00201 public: 00202 void Stamp ( const char *now, TiXmlEncoding encoding ); 00203 00204 const TiXmlCursor &Cursor() 00205 { 00206 return cursor; 00207 } 00208 00209 private: 00210 // Only used by the document! 00211 TiXmlParsingData ( const char *start, int _tabsize, int row, int col ) 00212 { 00213 assert ( start ); 00214 stamp = start; 00215 tabsize = _tabsize; 00216 cursor.row = row; 00217 cursor.col = col; 00218 } 00219 00220 TiXmlCursor cursor; 00221 const char *stamp; 00222 int tabsize; 00223 }; 00224 00225 00226 void TiXmlParsingData::Stamp ( const char *now, TiXmlEncoding encoding ) 00227 { 00228 assert ( now ); 00229 00230 // Do nothing if the tabsize is 0. 00231 if ( tabsize < 1 ) 00232 { 00233 return; 00234 } 00235 00236 // Get the current row, column. 00237 int row = cursor.row; 00238 int col = cursor.col; 00239 const char *p = stamp; 00240 assert ( p ); 00241 00242 while ( p < now ) 00243 { 00244 // Treat p as unsigned, so we have a happy compiler. 00245 const unsigned char *pU = (const unsigned char *) p; 00246 00247 // Code contributed by Fletcher Dunn: (modified by lee) 00248 switch (*pU) 00249 { 00250 case 0: 00251 // We *should* never get here, but in case we do, don't 00252 // advance past the terminating null character, ever 00253 return; 00254 00255 case '\r': 00256 // bump down to the next line 00257 ++row; 00258 col = 0; 00259 // Eat the character 00260 ++p; 00261 00262 // Check for \r\n sequence, and treat this as a single character 00263 if (*p == '\n') 00264 { 00265 ++p; 00266 } 00267 00268 break; 00269 00270 case '\n': 00271 // bump down to the next line 00272 ++row; 00273 col = 0; 00274 00275 // Eat the character 00276 ++p; 00277 00278 // Check for \n\r sequence, and treat this as a single 00279 // character. (Yes, this bizarre thing does occur still 00280 // on some arcane platforms...) 00281 if (*p == '\r') 00282 { 00283 ++p; 00284 } 00285 00286 break; 00287 00288 case '\t': 00289 // Eat the character 00290 ++p; 00291 00292 // Skip to next tab stop 00293 col = (col / tabsize + 1) * tabsize; 00294 break; 00295 00296 case TIXML_UTF_LEAD_0: 00297 00298 if ( encoding == TIXML_ENCODING_UTF8 ) 00299 { 00300 if ( * (p + 1) && * (p + 2) ) 00301 { 00302 // In these cases, don't advance the column. These are 00303 // 0-width spaces. 00304 if ( * (pU + 1) == TIXML_UTF_LEAD_1 && * (pU + 2) == TIXML_UTF_LEAD_2 ) 00305 p += 3; 00306 else if ( * (pU + 1) == 0xbfU && * (pU + 2) == 0xbeU ) 00307 p += 3; 00308 else if ( * (pU + 1) == 0xbfU && * (pU + 2) == 0xbfU ) 00309 p += 3; 00310 else 00311 { 00312 p += 3; // A normal character. 00313 ++col; 00314 } 00315 } 00316 } 00317 else 00318 { 00319 ++p; 00320 ++col; 00321 } 00322 00323 break; 00324 00325 default: 00326 00327 if ( encoding == TIXML_ENCODING_UTF8 ) 00328 { 00329 // Eat the 1 to 4 byte utf8 character. 00330 int step = TiXmlBase::utf8ByteTable[* ( (const unsigned char *) p) ]; 00331 00332 if ( step == 0 ) 00333 step = 1; // Error case from bad encoding, but handle gracefully. 00334 00335 p += step; 00336 00337 // Just advance one column, of course. 00338 ++col; 00339 } 00340 else 00341 { 00342 ++p; 00343 ++col; 00344 } 00345 00346 break; 00347 } 00348 } 00349 00350 cursor.row = row; 00351 cursor.col = col; 00352 assert ( cursor.row >= -1 ); 00353 assert ( cursor.col >= -1 ); 00354 stamp = p; 00355 assert ( stamp ); 00356 } 00357 00358 00359 const char *TiXmlBase::SkipWhiteSpace ( const char *p, TiXmlEncoding encoding ) 00360 { 00361 if ( !p || !*p ) 00362 { 00363 return 0; 00364 } 00365 00366 if ( encoding == TIXML_ENCODING_UTF8 ) 00367 { 00368 while ( *p ) 00369 { 00370 const unsigned char *pU = (const unsigned char *) p; 00371 00372 // Skip the stupid Microsoft UTF-8 Byte order marks 00373 if ( * (pU + 0) == TIXML_UTF_LEAD_0 00374 && * (pU + 1) == TIXML_UTF_LEAD_1 00375 && * (pU + 2) == TIXML_UTF_LEAD_2 ) 00376 { 00377 p += 3; 00378 continue; 00379 } 00380 else if (* (pU + 0) == TIXML_UTF_LEAD_0 00381 && * (pU + 1) == 0xbfU 00382 && * (pU + 2) == 0xbeU ) 00383 { 00384 p += 3; 00385 continue; 00386 } 00387 else if (* (pU + 0) == TIXML_UTF_LEAD_0 00388 && * (pU + 1) == 0xbfU 00389 && * (pU + 2) == 0xbfU ) 00390 { 00391 p += 3; 00392 continue; 00393 } 00394 00395 if ( IsWhiteSpace ( *p ) || *p == '\n' || *p == '\r' ) // Still using old rules for white space. 00396 ++p; 00397 else 00398 break; 00399 } 00400 } 00401 else 00402 { 00403 while ( (*p && IsWhiteSpace ( *p ) ) || (*p == '\n') || (*p == '\r' ) ) 00404 ++p; 00405 } 00406 00407 return p; 00408 } 00409 00410 #ifdef TIXML_USE_STL 00411 /*static*/ bool TiXmlBase::StreamWhiteSpace ( std::istream *in, TIXML_STRING *tag ) 00412 { 00413 for ( ;; ) 00414 { 00415 if ( !in->good() ) return false; 00416 00417 int c = in->peek(); 00418 00419 // At this scope, we can't get to a document. So fail silently. 00420 if ( !IsWhiteSpace ( c ) || c <= 0 ) 00421 return true; 00422 00423 *tag += (char) in->get(); 00424 } 00425 } 00426 00427 /*static*/ bool TiXmlBase::StreamTo ( std::istream *in, int character, TIXML_STRING *tag ) 00428 { 00429 //assert( character > 0 && character < 128 ); // else it won't work in utf-8 00430 while ( in->good() ) 00431 { 00432 int c = in->peek(); 00433 00434 if ( c == character ) 00435 return true; 00436 00437 if ( c <= 0 ) // Silent failure: can't get document at this scope 00438 return false; 00439 00440 in->get(); 00441 *tag += (char) c; 00442 } 00443 00444 return false; 00445 } 00446 #endif 00447 00448 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The 00449 // "assign" optimization removes over 10% of the execution time. 00450 // 00451 const char *TiXmlBase::ReadName ( const char *p, TIXML_STRING *name, TiXmlEncoding encoding ) 00452 { 00453 // Oddly, not supported on some comilers, 00454 //name->clear(); 00455 // So use this: 00456 *name = ""; 00457 assert ( p ); 00458 00459 // Names start with letters or underscores. 00460 // Of course, in unicode, tinyxml has no idea what a letter *is*. The 00461 // algorithm is generous. 00462 // 00463 // After that, they can be letters, underscores, numbers, 00464 // hyphens, or colons. (Colons are valid ony for namespaces, 00465 // but tinyxml can't tell namespaces from names.) 00466 if ( p && *p 00467 && ( IsAlpha ( (unsigned char) *p, encoding ) || *p == '_' ) ) 00468 { 00469 const char *start = p; 00470 00471 while ( p && *p 00472 && ( IsAlphaNum ( (unsigned char ) *p, encoding ) 00473 || *p == '_' 00474 || *p == '-' 00475 || *p == '.' 00476 || *p == ':' ) ) 00477 { 00478 //(*name) += *p; // expensive 00479 ++p; 00480 } 00481 00482 if ( p - start > 0 ) 00483 { 00484 name->assign ( start, p - start ); 00485 } 00486 00487 return p; 00488 } 00489 00490 return 0; 00491 } 00492 00493 const char *TiXmlBase::GetEntity ( const char *p, char *value, int *length, TiXmlEncoding encoding ) 00494 { 00495 // Presume an entity, and pull it out. 00496 TIXML_STRING ent; 00497 int i; 00498 *length = 0; 00499 00500 if ( * (p + 1) && * (p + 1) == '#' && * (p + 2) ) 00501 { 00502 unsigned long ucs = 0; 00503 ptrdiff_t delta = 0; 00504 unsigned mult = 1; 00505 00506 if ( * (p + 2) == 'x' ) 00507 { 00508 // Hexadecimal. 00509 if ( !* (p + 3) ) return 0; 00510 00511 const char *q = p + 3; 00512 q = strchr ( q, ';' ); 00513 00514 if ( !q || !*q ) return 0; 00515 00516 delta = q - p; 00517 --q; 00518 00519 while ( *q != 'x' ) 00520 { 00521 if ( *q >= '0' && *q <= '9' ) 00522 ucs += mult * (*q - '0'); 00523 else if ( *q >= 'a' && *q <= 'f' ) 00524 ucs += mult * (*q - 'a' + 10); 00525 else if ( *q >= 'A' && *q <= 'F' ) 00526 ucs += mult * (*q - 'A' + 10 ); 00527 else 00528 return 0; 00529 00530 mult *= 16; 00531 --q; 00532 } 00533 } 00534 else 00535 { 00536 // Decimal. 00537 if ( !* (p + 2) ) return 0; 00538 00539 const char *q = p + 2; 00540 q = strchr ( q, ';' ); 00541 00542 if ( !q || !*q ) return 0; 00543 00544 delta = q - p; 00545 --q; 00546 00547 while ( *q != '#' ) 00548 { 00549 if ( *q >= '0' && *q <= '9' ) 00550 ucs += mult * (*q - '0'); 00551 else 00552 return 0; 00553 00554 mult *= 10; 00555 --q; 00556 } 00557 } 00558 00559 if ( encoding == TIXML_ENCODING_UTF8 ) 00560 { 00561 // convert the UCS to UTF-8 00562 ConvertUTF32ToUTF8 ( ucs, value, length ); 00563 } 00564 else 00565 { 00566 *value = (char) ucs; 00567 *length = 1; 00568 } 00569 00570 return p + delta + 1; 00571 } 00572 00573 // Now try to match it. 00574 for ( i = 0; i < NUM_ENTITY; ++i ) 00575 { 00576 if ( strncmp ( entity[i].str, p, entity[i].strLength ) == 0 ) 00577 { 00578 assert ( strlen ( entity[i].str ) == entity[i].strLength ); 00579 *value = entity[i].chr; 00580 *length = 1; 00581 return ( p + entity[i].strLength ); 00582 } 00583 } 00584 00585 // So it wasn't an entity, its unrecognized, or something like that. 00586 *value = *p; // Don't put back the last one, since we return it! 00587 //*length = 1; // Leave unrecognized entities - this doesn't really work. 00588 // Just writes strange XML. 00589 return p + 1; 00590 } 00591 00592 00593 bool TiXmlBase::StringEqual ( const char *p, 00594 const char *tag, 00595 bool ignoreCase, 00596 TiXmlEncoding encoding ) 00597 { 00598 assert ( p ); 00599 assert ( tag ); 00600 00601 if ( !p || !*p ) 00602 { 00603 assert ( 0 ); 00604 return false; 00605 } 00606 00607 const char *q = p; 00608 00609 if ( ignoreCase ) 00610 { 00611 while ( *q && *tag && ToLower ( *q, encoding ) == ToLower ( *tag, encoding ) ) 00612 { 00613 ++q; 00614 ++tag; 00615 } 00616 00617 if ( *tag == 0 ) 00618 return true; 00619 } 00620 else 00621 { 00622 while ( *q && *tag && *q == *tag ) 00623 { 00624 ++q; 00625 ++tag; 00626 } 00627 00628 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal? 00629 return true; 00630 } 00631 00632 return false; 00633 } 00634 00635 const char *TiXmlBase::ReadText ( const char *p, 00636 TIXML_STRING *text, 00637 bool trimWhiteSpace, 00638 const char *endTag, 00639 bool caseInsensitive, 00640 TiXmlEncoding encoding ) 00641 { 00642 *text = ""; 00643 00644 if ( !trimWhiteSpace // certain tags always keep whitespace 00645 || !condenseWhiteSpace ) // if true, whitespace is always kept 00646 { 00647 // Keep all the white space. 00648 while ( p && *p 00649 && !StringEqual ( p, endTag, caseInsensitive, encoding ) 00650 ) 00651 { 00652 int len; 00653 char cArr[4] = { 0, 0, 0, 0 }; 00654 p = GetChar ( p, cArr, &len, encoding ); 00655 text->append ( cArr, len ); 00656 } 00657 } 00658 else 00659 { 00660 bool whitespace = false; 00661 00662 // Remove leading white space: 00663 p = SkipWhiteSpace ( p, encoding ); 00664 00665 while ( p && *p 00666 && !StringEqual ( p, endTag, caseInsensitive, encoding ) ) 00667 { 00668 if ( *p == '\r' || *p == '\n' ) 00669 { 00670 whitespace = true; 00671 ++p; 00672 } 00673 else if ( IsWhiteSpace ( *p ) ) 00674 { 00675 whitespace = true; 00676 ++p; 00677 } 00678 else 00679 { 00680 // If we've found whitespace, add it before the 00681 // new character. Any whitespace just becomes a space. 00682 if ( whitespace ) 00683 { 00684 (*text) += ' '; 00685 whitespace = false; 00686 } 00687 00688 int len; 00689 char cArr[4] = { 0, 0, 0, 0 }; 00690 p = GetChar ( p, cArr, &len, encoding ); 00691 00692 if ( len == 1 ) 00693 (*text) += cArr[0]; // more efficient 00694 else 00695 text->append ( cArr, len ); 00696 } 00697 } 00698 } 00699 00700 if ( p ) 00701 p += strlen ( endTag ); 00702 00703 return p; 00704 } 00705 00706 #ifdef TIXML_USE_STL 00707 00708 void TiXmlDocument::StreamIn ( std::istream *in, TIXML_STRING *tag ) 00709 { 00710 // The basic issue with a document is that we don't know what we're 00711 // streaming. Read something presumed to be a tag (and hope), then 00712 // identify it, and call the appropriate stream method on the tag. 00713 // 00714 // This "pre-streaming" will never read the closing ">" so the 00715 // sub-tag can orient itself. 00716 00717 if ( !StreamTo ( in, '<', tag ) ) 00718 { 00719 SetError ( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); 00720 return; 00721 } 00722 00723 while ( in->good() ) 00724 { 00725 int tagIndex = (int) tag->length(); 00726 00727 while ( in->good() && in->peek() != '>' ) 00728 { 00729 int c = in->get(); 00730 00731 if ( c <= 0 ) 00732 { 00733 SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 00734 break; 00735 } 00736 00737 (*tag) += (char) c; 00738 } 00739 00740 if ( in->good() ) 00741 { 00742 // We now have something we presume to be a node of 00743 // some sort. Identify it, and call the node to 00744 // continue streaming. 00745 TiXmlNode *node = Identify ( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING ); 00746 00747 if ( node ) 00748 { 00749 node->StreamIn ( in, tag ); 00750 bool isElement = node->ToElement() != 0; 00751 delete node; 00752 node = 0; 00753 00754 // If this is the root element, we're done. Parsing will be 00755 // done by the >> operator. 00756 if ( isElement ) 00757 { 00758 return; 00759 } 00760 } 00761 else 00762 { 00763 SetError ( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); 00764 return; 00765 } 00766 } 00767 } 00768 00769 // We should have returned sooner. 00770 SetError ( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); 00771 } 00772 00773 #endif 00774 00775 const char *TiXmlDocument::Parse ( const char *p, TiXmlParsingData *prevData, TiXmlEncoding encoding ) 00776 { 00777 ClearError(); 00778 00779 // Parse away, at the document level. Since a document 00780 // contains nothing but other tags, most of what happens 00781 // here is skipping white space. 00782 if ( !p || !*p ) 00783 { 00784 SetError ( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); 00785 return 0; 00786 } 00787 00788 // Note that, for a document, this needs to come 00789 // before the while space skip, so that parsing 00790 // starts from the pointer we are given. 00791 location.Clear(); 00792 00793 if ( prevData ) 00794 { 00795 location.row = prevData->cursor.row; 00796 location.col = prevData->cursor.col; 00797 } 00798 else 00799 { 00800 location.row = 0; 00801 location.col = 0; 00802 } 00803 00804 TiXmlParsingData data ( p, TabSize(), location.row, location.col ); 00805 location = data.Cursor(); 00806 00807 if ( encoding == TIXML_ENCODING_UNKNOWN ) 00808 { 00809 // Check for the Microsoft UTF-8 lead bytes. 00810 const unsigned char *pU = (const unsigned char *) p; 00811 00812 if ( * (pU + 0) && * (pU + 0) == TIXML_UTF_LEAD_0 00813 && * (pU + 1) && * (pU + 1) == TIXML_UTF_LEAD_1 00814 && * (pU + 2) && * (pU + 2) == TIXML_UTF_LEAD_2 ) 00815 { 00816 encoding = TIXML_ENCODING_UTF8; 00817 useMicrosoftBOM = true; 00818 } 00819 } 00820 00821 p = SkipWhiteSpace ( p, encoding ); 00822 00823 if ( !p ) 00824 { 00825 SetError ( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); 00826 return 0; 00827 } 00828 00829 while ( p && *p ) 00830 { 00831 TiXmlNode *node = Identify ( p, encoding ); 00832 00833 if ( node ) 00834 { 00835 p = node->Parse ( p, &data, encoding ); 00836 LinkEndChild ( node ); 00837 00838 /* LinkEndChild may potentially free the node. 00839 If this happens we should break to avoid dereferencing it */ 00840 if ( !node ) 00841 break; 00842 } 00843 else 00844 { 00845 break; 00846 } 00847 00848 // Did we get encoding info? 00849 if ( encoding == TIXML_ENCODING_UNKNOWN 00850 && node->ToDeclaration() ) 00851 { 00852 TiXmlDeclaration *dec = node->ToDeclaration(); 00853 const char *enc = dec->Encoding(); 00854 assert ( enc ); 00855 00856 if ( *enc == 0 ) 00857 encoding = TIXML_ENCODING_UTF8; 00858 else if ( StringEqual ( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) ) 00859 encoding = TIXML_ENCODING_UTF8; 00860 else if ( StringEqual ( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) 00861 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice 00862 else 00863 encoding = TIXML_ENCODING_LEGACY; 00864 } 00865 00866 p = SkipWhiteSpace ( p, encoding ); 00867 } 00868 00869 // Was this empty? 00870 if ( !firstChild ) 00871 { 00872 SetError ( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); 00873 return 0; 00874 } 00875 00876 // All is well. 00877 return p; 00878 } 00879 00880 void TiXmlDocument::SetError ( int err, const char *pError, TiXmlParsingData *data, TiXmlEncoding encoding ) 00881 { 00882 // The first error in a chain is more accurate - don't set again! 00883 if ( error ) 00884 return; 00885 00886 assert ( err > 0 && err < TIXML_ERROR_STRING_COUNT ); 00887 error = true; 00888 errorId = err; 00889 errorDesc = errorString[ errorId ]; 00890 00891 errorLocation.Clear(); 00892 00893 if ( pError && data ) 00894 { 00895 data->Stamp ( pError, encoding ); 00896 errorLocation = data->Cursor(); 00897 } 00898 } 00899 00900 00901 TiXmlNode *TiXmlNode::Identify ( const char *p, TiXmlEncoding encoding ) 00902 { 00903 TiXmlNode *returnNode = 0; 00904 00905 p = SkipWhiteSpace ( p, encoding ); 00906 00907 if ( !p || !*p || *p != '<' ) 00908 { 00909 return 0; 00910 } 00911 00912 TiXmlDocument *doc = GetDocument(); 00913 p = SkipWhiteSpace ( p, encoding ); 00914 00915 if ( !p || !*p ) 00916 { 00917 return 0; 00918 } 00919 00920 // What is this thing? 00921 // - Elements start with a letter or underscore, but xml is reserved. 00922 // - Comments: <!-- 00923 // - Decleration: <?xml 00924 // - Everthing else is unknown to tinyxml. 00925 // 00926 00927 const char *xmlHeader = { "<?xml" }; 00928 const char *commentHeader = { "<!--" }; 00929 const char *dtdHeader = { "<!" }; 00930 const char *cdataHeader = { "<![CDATA[" }; 00931 00932 if ( StringEqual ( p, xmlHeader, true, encoding ) ) 00933 { 00934 #ifdef DEBUG_PARSER 00935 TIXML_LOG ( "XML parsing Declaration\n" ); 00936 #endif 00937 returnNode = new TiXmlDeclaration(); 00938 } 00939 else if ( StringEqual ( p, commentHeader, false, encoding ) ) 00940 { 00941 #ifdef DEBUG_PARSER 00942 TIXML_LOG ( "XML parsing Comment\n" ); 00943 #endif 00944 returnNode = new TiXmlComment(); 00945 } 00946 else if ( StringEqual ( p, cdataHeader, false, encoding ) ) 00947 { 00948 #ifdef DEBUG_PARSER 00949 TIXML_LOG ( "XML parsing CDATA\n" ); 00950 #endif 00951 TiXmlText *text = new TiXmlText ( "" ); 00952 text->SetCDATA ( true ); 00953 returnNode = text; 00954 } 00955 else if ( StringEqual ( p, dtdHeader, false, encoding ) ) 00956 { 00957 #ifdef DEBUG_PARSER 00958 TIXML_LOG ( "XML parsing Unknown(1)\n" ); 00959 #endif 00960 returnNode = new TiXmlUnknown(); 00961 } 00962 else if ( IsAlpha ( * (p + 1), encoding ) 00963 || * (p + 1) == '_' ) 00964 { 00965 #ifdef DEBUG_PARSER 00966 TIXML_LOG ( "XML parsing Element\n" ); 00967 #endif 00968 returnNode = new TiXmlElement ( "" ); 00969 } 00970 else 00971 { 00972 #ifdef DEBUG_PARSER 00973 TIXML_LOG ( "XML parsing Unknown(2)\n" ); 00974 #endif 00975 returnNode = new TiXmlUnknown(); 00976 } 00977 00978 if ( returnNode ) 00979 { 00980 // Set the parent, so it can report errors 00981 returnNode->parent = this; 00982 } 00983 else 00984 { 00985 if ( doc ) 00986 doc->SetError ( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN ); 00987 } 00988 00989 return returnNode; 00990 } 00991 00992 #ifdef TIXML_USE_STL 00993 00994 void TiXmlElement::StreamIn (std::istream *in, TIXML_STRING *tag) 00995 { 00996 // We're called with some amount of pre-parsing. That is, some of "this" 00997 // element is in "tag". Go ahead and stream to the closing ">" 00998 while ( in->good() ) 00999 { 01000 int c = in->get(); 01001 01002 if ( c <= 0 ) 01003 { 01004 TiXmlDocument *document = GetDocument(); 01005 01006 if ( document ) 01007 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01008 01009 return; 01010 } 01011 01012 (*tag) += (char) c ; 01013 01014 if ( c == '>' ) 01015 break; 01016 } 01017 01018 if ( tag->length() < 3 ) return; 01019 01020 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag. 01021 // If not, identify and stream. 01022 01023 if ( tag->at ( tag->length() - 1 ) == '>' 01024 && tag->at ( tag->length() - 2 ) == '/' ) 01025 { 01026 // All good! 01027 return; 01028 } 01029 else if ( tag->at ( tag->length() - 1 ) == '>' ) 01030 { 01031 // There is more. Could be: 01032 // text 01033 // cdata text (which looks like another node) 01034 // closing tag 01035 // another node. 01036 for ( ;; ) 01037 { 01038 StreamWhiteSpace ( in, tag ); 01039 01040 // Do we have text? 01041 if ( in->good() && in->peek() != '<' ) 01042 { 01043 // Yep, text. 01044 TiXmlText text ( "" ); 01045 text.StreamIn ( in, tag ); 01046 01047 // What follows text is a closing tag or another node. 01048 // Go around again and figure it out. 01049 continue; 01050 } 01051 01052 // We now have either a closing tag...or another node. 01053 // We should be at a "<", regardless. 01054 if ( !in->good() ) return; 01055 01056 assert ( in->peek() == '<' ); 01057 int tagIndex = (int) tag->length(); 01058 01059 bool closingTag = false; 01060 bool firstCharFound = false; 01061 01062 for ( ;; ) 01063 { 01064 if ( !in->good() ) 01065 return; 01066 01067 int c = in->peek(); 01068 01069 if ( c <= 0 ) 01070 { 01071 TiXmlDocument *document = GetDocument(); 01072 01073 if ( document ) 01074 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01075 01076 return; 01077 } 01078 01079 if ( c == '>' ) 01080 break; 01081 01082 *tag += (char) c; 01083 in->get(); 01084 01085 // Early out if we find the CDATA id. 01086 if ( c == '[' && tag->size() >= 9 ) 01087 { 01088 size_t len = tag->size(); 01089 const char *start = tag->c_str() + len - 9; 01090 01091 if ( strcmp ( start, "<![CDATA[" ) == 0 ) 01092 { 01093 assert ( !closingTag ); 01094 break; 01095 } 01096 } 01097 01098 if ( !firstCharFound && c != '<' && !IsWhiteSpace ( c ) ) 01099 { 01100 firstCharFound = true; 01101 01102 if ( c == '/' ) 01103 closingTag = true; 01104 } 01105 } 01106 01107 // If it was a closing tag, then read in the closing '>' to clean up the input stream. 01108 // If it was not, the streaming will be done by the tag. 01109 if ( closingTag ) 01110 { 01111 if ( !in->good() ) 01112 return; 01113 01114 int c = in->get(); 01115 01116 if ( c <= 0 ) 01117 { 01118 TiXmlDocument *document = GetDocument(); 01119 01120 if ( document ) 01121 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01122 01123 return; 01124 } 01125 01126 assert ( c == '>' ); 01127 *tag += (char) c; 01128 01129 // We are done, once we've found our closing tag. 01130 return; 01131 } 01132 else 01133 { 01134 // If not a closing tag, id it, and stream. 01135 const char *tagloc = tag->c_str() + tagIndex; 01136 TiXmlNode *node = Identify ( tagloc, TIXML_DEFAULT_ENCODING ); 01137 01138 if ( !node ) 01139 return; 01140 01141 node->StreamIn ( in, tag ); 01142 delete node; 01143 node = 0; 01144 01145 // No return: go around from the beginning: text, closing tag, or node. 01146 } 01147 } 01148 } 01149 } 01150 #endif 01151 01152 const char *TiXmlElement::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding ) 01153 { 01154 p = SkipWhiteSpace ( p, encoding ); 01155 TiXmlDocument *document = GetDocument(); 01156 01157 if ( !p || !*p ) 01158 { 01159 if ( document ) document->SetError ( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding ); 01160 01161 return 0; 01162 } 01163 01164 if ( data ) 01165 { 01166 data->Stamp ( p, encoding ); 01167 location = data->Cursor(); 01168 } 01169 01170 if ( *p != '<' ) 01171 { 01172 if ( document ) document->SetError ( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding ); 01173 01174 return 0; 01175 } 01176 01177 p = SkipWhiteSpace ( p + 1, encoding ); 01178 01179 // Read the name. 01180 const char *pErr = p; 01181 01182 p = ReadName ( p, &value, encoding ); 01183 01184 if ( !p || !*p ) 01185 { 01186 if ( document ) document->SetError ( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding ); 01187 01188 return 0; 01189 } 01190 01191 TIXML_STRING endTag ("</"); 01192 endTag += value; 01193 endTag += ">"; 01194 01195 // Check for and read attributes. Also look for an empty 01196 // tag or an end tag. 01197 while ( p && *p ) 01198 { 01199 pErr = p; 01200 p = SkipWhiteSpace ( p, encoding ); 01201 01202 if ( !p || !*p ) 01203 { 01204 if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); 01205 01206 return 0; 01207 } 01208 01209 if ( *p == '/' ) 01210 { 01211 ++p; 01212 01213 // Empty tag. 01214 if ( *p != '>' ) 01215 { 01216 if ( document ) document->SetError ( TIXML_ERROR_PARSING_EMPTY, p, data, encoding ); 01217 01218 return 0; 01219 } 01220 01221 return (p + 1); 01222 } 01223 else if ( *p == '>' ) 01224 { 01225 // Done with attributes (if there were any.) 01226 // Read the value -- which can include other 01227 // elements -- read the end tag, and return. 01228 ++p; 01229 p = ReadValue ( p, data, encoding ); // Note this is an Element method, and will set the error if one happens. 01230 01231 if ( !p || !*p ) 01232 { 01233 // We were looking for the end tag, but found nothing. 01234 // Fix for [ 1663758 ] Failure to report error on bad XML 01235 if ( document ) document->SetError ( TIXML_ERROR_READING_END_TAG, p, data, encoding ); 01236 01237 return 0; 01238 } 01239 01240 // We should find the end tag now 01241 if ( StringEqual ( p, endTag.c_str(), false, encoding ) ) 01242 { 01243 p += endTag.length(); 01244 return p; 01245 } 01246 else 01247 { 01248 if ( document ) document->SetError ( TIXML_ERROR_READING_END_TAG, p, data, encoding ); 01249 01250 return 0; 01251 } 01252 } 01253 else 01254 { 01255 // Try to read an attribute: 01256 TiXmlAttribute *attrib = new TiXmlAttribute(); 01257 01258 if ( !attrib ) 01259 { 01260 if ( document ) document->SetError ( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding ); 01261 01262 return 0; 01263 } 01264 01265 attrib->SetDocument ( document ); 01266 pErr = p; 01267 p = attrib->Parse ( p, data, encoding ); 01268 01269 if ( !p || !*p ) 01270 { 01271 if ( document ) document->SetError ( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding ); 01272 01273 delete attrib; 01274 return 0; 01275 } 01276 01277 // Handle the strange case of double attributes: 01278 #ifdef TIXML_USE_STL 01279 TiXmlAttribute *node = attributeSet.Find ( attrib->NameTStr() ); 01280 #else 01281 TiXmlAttribute *node = attributeSet.Find ( attrib->Name() ); 01282 #endif 01283 01284 if ( node ) 01285 { 01286 node->SetValue ( attrib->Value() ); 01287 delete attrib; 01288 return 0; 01289 } 01290 01291 attributeSet.Add ( attrib ); 01292 } 01293 } 01294 01295 return p; 01296 } 01297 01298 01299 const char *TiXmlElement::ReadValue ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding ) 01300 { 01301 TiXmlDocument *document = GetDocument(); 01302 01303 // Read in text and elements in any order. 01304 const char *pWithWhiteSpace = p; 01305 p = SkipWhiteSpace ( p, encoding ); 01306 01307 while ( p && *p ) 01308 { 01309 if ( *p != '<' ) 01310 { 01311 // Take what we have, make a text element. 01312 TiXmlText *textNode = new TiXmlText ( "" ); 01313 01314 if ( !textNode ) 01315 { 01316 if ( document ) document->SetError ( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding ); 01317 01318 return 0; 01319 } 01320 01321 if ( TiXmlBase::IsWhiteSpaceCondensed() ) 01322 { 01323 p = textNode->Parse ( p, data, encoding ); 01324 } 01325 else 01326 { 01327 // Special case: we want to keep the white space 01328 // so that leading spaces aren't removed. 01329 p = textNode->Parse ( pWithWhiteSpace, data, encoding ); 01330 } 01331 01332 if ( !textNode->Blank() ) 01333 LinkEndChild ( textNode ); 01334 else 01335 delete textNode; 01336 } 01337 else 01338 { 01339 // We hit a '<' 01340 // Have we hit a new element or an end tag? This could also be 01341 // a TiXmlText in the "CDATA" style. 01342 if ( StringEqual ( p, "</", false, encoding ) ) 01343 { 01344 return p; 01345 } 01346 else 01347 { 01348 TiXmlNode *node = Identify ( p, encoding ); 01349 01350 if ( node ) 01351 { 01352 p = node->Parse ( p, data, encoding ); 01353 LinkEndChild ( node ); 01354 } 01355 else 01356 { 01357 return 0; 01358 } 01359 } 01360 } 01361 01362 pWithWhiteSpace = p; 01363 p = SkipWhiteSpace ( p, encoding ); 01364 } 01365 01366 if ( !p ) 01367 { 01368 if ( document ) document->SetError ( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding ); 01369 } 01370 01371 return p; 01372 } 01373 01374 01375 #ifdef TIXML_USE_STL 01376 void TiXmlUnknown::StreamIn ( std::istream *in, TIXML_STRING *tag ) 01377 { 01378 while ( in->good() ) 01379 { 01380 int c = in->get(); 01381 01382 if ( c <= 0 ) 01383 { 01384 TiXmlDocument *document = GetDocument(); 01385 01386 if ( document ) 01387 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01388 01389 return; 01390 } 01391 01392 (*tag) += (char) c; 01393 01394 if ( c == '>' ) 01395 { 01396 // All is well. 01397 return; 01398 } 01399 } 01400 } 01401 #endif 01402 01403 01404 const char *TiXmlUnknown::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding ) 01405 { 01406 TiXmlDocument *document = GetDocument(); 01407 p = SkipWhiteSpace ( p, encoding ); 01408 01409 if ( data ) 01410 { 01411 data->Stamp ( p, encoding ); 01412 location = data->Cursor(); 01413 } 01414 01415 if ( !p || !*p || *p != '<' ) 01416 { 01417 if ( document ) document->SetError ( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding ); 01418 01419 return 0; 01420 } 01421 01422 ++p; 01423 value = ""; 01424 01425 while ( p && *p && *p != '>' ) 01426 { 01427 value += *p; 01428 ++p; 01429 } 01430 01431 if ( !p ) 01432 { 01433 if ( document ) document->SetError ( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding ); 01434 } 01435 01436 if ( *p == '>' ) 01437 return p + 1; 01438 01439 return p; 01440 } 01441 01442 #ifdef TIXML_USE_STL 01443 void TiXmlComment::StreamIn ( std::istream *in, TIXML_STRING *tag ) 01444 { 01445 while ( in->good() ) 01446 { 01447 int c = in->get(); 01448 01449 if ( c <= 0 ) 01450 { 01451 TiXmlDocument *document = GetDocument(); 01452 01453 if ( document ) 01454 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01455 01456 return; 01457 } 01458 01459 (*tag) += (char) c; 01460 01461 if ( c == '>' 01462 && tag->at ( tag->length() - 2 ) == '-' 01463 && tag->at ( tag->length() - 3 ) == '-' ) 01464 { 01465 // All is well. 01466 return; 01467 } 01468 } 01469 } 01470 #endif 01471 01472 01473 const char *TiXmlComment::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding ) 01474 { 01475 TiXmlDocument *document = GetDocument(); 01476 value = ""; 01477 01478 p = SkipWhiteSpace ( p, encoding ); 01479 01480 if ( data ) 01481 { 01482 data->Stamp ( p, encoding ); 01483 location = data->Cursor(); 01484 } 01485 01486 const char *startTag = "<!--"; 01487 01488 const char *endTag = "-->"; 01489 01490 if ( !StringEqual ( p, startTag, false, encoding ) ) 01491 { 01492 document->SetError ( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); 01493 return 0; 01494 } 01495 01496 p += strlen ( startTag ); 01497 01498 // [ 1475201 ] TinyXML parses entities in comments 01499 // Oops - ReadText doesn't work, because we don't want to parse the entities. 01500 // p = ReadText( p, &value, false, endTag, false, encoding ); 01501 // 01502 // from the XML spec: 01503 /* 01504 [Definition: Comments may appear anywhere in a document outside other markup; in addition, 01505 they may appear within the document type declaration at places allowed by the grammar. 01506 They are not part of the document's character data; an XML processor MAY, but need not, 01507 make it possible for an application to retrieve the text of comments. For compatibility, 01508 the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 01509 references MUST NOT be recognized within comments. 01510 01511 An example of a comment: 01512 01513 <!-- declarations for <head> & <body> --> 01514 */ 01515 01516 value = ""; 01517 01518 // Keep all the white space. 01519 while ( p && *p && !StringEqual ( p, endTag, false, encoding ) ) 01520 { 01521 value.append ( p, 1 ); 01522 ++p; 01523 } 01524 01525 if ( p ) 01526 p += strlen ( endTag ); 01527 01528 return p; 01529 } 01530 01531 01532 const char *TiXmlAttribute::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding ) 01533 { 01534 p = SkipWhiteSpace ( p, encoding ); 01535 01536 if ( !p || !*p ) return 0; 01537 01538 // int tabsize = 4; 01539 // if ( document ) 01540 // tabsize = document->TabSize(); 01541 01542 if ( data ) 01543 { 01544 data->Stamp ( p, encoding ); 01545 location = data->Cursor(); 01546 } 01547 01548 // Read the name, the '=' and the value. 01549 const char *pErr = p; 01550 p = ReadName ( p, &name, encoding ); 01551 01552 if ( !p || !*p ) 01553 { 01554 if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); 01555 01556 return 0; 01557 } 01558 01559 p = SkipWhiteSpace ( p, encoding ); 01560 01561 if ( !p || !*p || *p != '=' ) 01562 { 01563 if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); 01564 01565 return 0; 01566 } 01567 01568 ++p; // skip '=' 01569 p = SkipWhiteSpace ( p, encoding ); 01570 01571 if ( !p || !*p ) 01572 { 01573 if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); 01574 01575 return 0; 01576 } 01577 01578 const char *end; 01579 01580 const char SINGLE_QUOTE = '\''; 01581 01582 const char DOUBLE_QUOTE = '\"'; 01583 01584 if ( *p == SINGLE_QUOTE ) 01585 { 01586 ++p; 01587 end = "\'"; // single quote in string 01588 p = ReadText ( p, &value, false, end, false, encoding ); 01589 } 01590 else if ( *p == DOUBLE_QUOTE ) 01591 { 01592 ++p; 01593 end = "\""; // double quote in string 01594 p = ReadText ( p, &value, false, end, false, encoding ); 01595 } 01596 else 01597 { 01598 // All attribute values should be in single or double quotes. 01599 // But this is such a common error that the parser will try 01600 // its best, even without them. 01601 value = ""; 01602 01603 while ( p && *p // existence 01604 && !IsWhiteSpace ( *p ) && *p != '\n' && *p != '\r' // whitespace 01605 && *p != '/' && *p != '>' ) // tag end 01606 { 01607 if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) 01608 { 01609 // [ 1451649 ] Attribute values with trailing quotes not handled correctly 01610 // We did not have an opening quote but seem to have a 01611 // closing one. Give up and throw an error. 01612 if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); 01613 01614 return 0; 01615 } 01616 01617 value += *p; 01618 ++p; 01619 } 01620 } 01621 01622 return p; 01623 } 01624 01625 #ifdef TIXML_USE_STL 01626 void TiXmlText::StreamIn ( std::istream *in, TIXML_STRING *tag ) 01627 { 01628 while ( in->good() ) 01629 { 01630 int c = in->peek(); 01631 01632 if ( !cdata && (c == '<' ) ) 01633 { 01634 return; 01635 } 01636 01637 if ( c <= 0 ) 01638 { 01639 TiXmlDocument *document = GetDocument(); 01640 01641 if ( document ) 01642 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01643 01644 return; 01645 } 01646 01647 (*tag) += (char) c; 01648 in->get(); // "commits" the peek made above 01649 01650 if ( cdata && c == '>' && tag->size() >= 3 ) 01651 { 01652 size_t len = tag->size(); 01653 01654 if ( (*tag) [len-2] == ']' && (*tag) [len-3] == ']' ) 01655 { 01656 // terminator of cdata. 01657 return; 01658 } 01659 } 01660 } 01661 } 01662 #endif 01663 01664 const char *TiXmlText::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding ) 01665 { 01666 value = ""; 01667 TiXmlDocument *document = GetDocument(); 01668 01669 if ( data ) 01670 { 01671 data->Stamp ( p, encoding ); 01672 location = data->Cursor(); 01673 } 01674 01675 const char *const startTag = "<![CDATA["; 01676 01677 const char *const endTag = "]]>"; 01678 01679 if ( cdata || StringEqual ( p, startTag, false, encoding ) ) 01680 { 01681 cdata = true; 01682 01683 if ( !StringEqual ( p, startTag, false, encoding ) ) 01684 { 01685 document->SetError ( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); 01686 return 0; 01687 } 01688 01689 p += strlen ( startTag ); 01690 01691 // Keep all the white space, ignore the encoding, etc. 01692 while ( p && *p 01693 && !StringEqual ( p, endTag, false, encoding ) 01694 ) 01695 { 01696 value += *p; 01697 ++p; 01698 } 01699 01700 TIXML_STRING dummy; 01701 p = ReadText ( p, &dummy, false, endTag, false, encoding ); 01702 return p; 01703 } 01704 else 01705 { 01706 bool ignoreWhite = true; 01707 01708 const char *end = "<"; 01709 p = ReadText ( p, &value, ignoreWhite, end, false, encoding ); 01710 01711 if ( p ) 01712 return p - 1; // don't truncate the '<' 01713 01714 return 0; 01715 } 01716 } 01717 01718 #ifdef TIXML_USE_STL 01719 void TiXmlDeclaration::StreamIn ( std::istream *in, TIXML_STRING *tag ) 01720 { 01721 while ( in->good() ) 01722 { 01723 int c = in->get(); 01724 01725 if ( c <= 0 ) 01726 { 01727 TiXmlDocument *document = GetDocument(); 01728 01729 if ( document ) 01730 document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 01731 01732 return; 01733 } 01734 01735 (*tag) += (char) c; 01736 01737 if ( c == '>' ) 01738 { 01739 // All is well. 01740 return; 01741 } 01742 } 01743 } 01744 #endif 01745 01746 const char *TiXmlDeclaration::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding _encoding ) 01747 { 01748 p = SkipWhiteSpace ( p, _encoding ); 01749 // Find the beginning, find the end, and look for 01750 // the stuff in-between. 01751 TiXmlDocument *document = GetDocument(); 01752 01753 if ( !p || !*p || !StringEqual ( p, "<?xml", true, _encoding ) ) 01754 { 01755 if ( document ) document->SetError ( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding ); 01756 01757 return 0; 01758 } 01759 01760 if ( data ) 01761 { 01762 data->Stamp ( p, _encoding ); 01763 location = data->Cursor(); 01764 } 01765 01766 p += 5; 01767 01768 version = ""; 01769 encoding = ""; 01770 standalone = ""; 01771 01772 while ( p && *p ) 01773 { 01774 if ( *p == '>' ) 01775 { 01776 ++p; 01777 return p; 01778 } 01779 01780 p = SkipWhiteSpace ( p, _encoding ); 01781 01782 if ( StringEqual ( p, "version", true, _encoding ) ) 01783 { 01784 TiXmlAttribute attrib; 01785 p = attrib.Parse ( p, data, _encoding ); 01786 version = attrib.Value(); 01787 } 01788 else if ( StringEqual ( p, "encoding", true, _encoding ) ) 01789 { 01790 TiXmlAttribute attrib; 01791 p = attrib.Parse ( p, data, _encoding ); 01792 encoding = attrib.Value(); 01793 } 01794 else if ( StringEqual ( p, "standalone", true, _encoding ) ) 01795 { 01796 TiXmlAttribute attrib; 01797 p = attrib.Parse ( p, data, _encoding ); 01798 standalone = attrib.Value(); 01799 } 01800 else 01801 { 01802 // Read over whatever it is. 01803 while ( p && *p && *p != '>' && !IsWhiteSpace ( *p ) ) 01804 ++p; 01805 } 01806 } 01807 01808 return 0; 01809 } 01810 01811 bool TiXmlText::Blank() const 01812 { 01813 for ( unsigned i = 0; i < value.length(); i++ ) 01814 if ( !IsWhiteSpace ( value[i] ) ) 01815 return false; 01816 01817 return true; 01818 } 01819