MyGUI  3.2.0
MyGUI_UString.cpp
Go to the documentation of this file.
1 
6 /*
7  This file is part of MyGUI.
8 
9  MyGUI is free software: you can redistribute it and/or modify
10  it under the terms of the GNU Lesser General Public License as published by
11  the Free Software Foundation, either version 3 of the License, or
12  (at your option) any later version.
13 
14  MyGUI is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU Lesser General Public License for more details.
18 
19  You should have received a copy of the GNU Lesser General Public License
20  along with MyGUI. If not, see <http://www.gnu.org/licenses/>.
21 */
22 #include "MyGUI_Precompiled.h"
23 #include "MyGUI_UString.h"
24 
25 namespace MyGUI
26 {
27 
28  //--------------------------------------------------------------------------
30  {
31  mString = 0;
32  }
33  //--------------------------------------------------------------------------
35  {
36  mIter += c;
37  }
38  //--------------------------------------------------------------------------
40  {
41  mIter -= c;
42  }
43  //--------------------------------------------------------------------------
45  {
46  mIter = i.mIter;
47  mString = i.mString;
48  }
49  //--------------------------------------------------------------------------
51  {
52  return mIter == mString->mData.begin();
53  }
54  //--------------------------------------------------------------------------
56  {
57  return mIter == mString->mData.end();
58  }
59  //--------------------------------------------------------------------------
61  {
62  return mIter - mString->mData.begin();
63  }
64  //--------------------------------------------------------------------------
66  {
67  mIter = mString->mData.begin() + index;
68  }
69  //--------------------------------------------------------------------------
71  {
72  size_type current_index = _get_index();
73  return mString->getChar( current_index );
74  }
75  //--------------------------------------------------------------------------
77  {
78  size_type current_index = _get_index();
79  int change = mString->setChar( current_index, uc );
80  _jump_to( current_index );
81  return change;
82  }
83  //--------------------------------------------------------------------------
85  {
86  _seekFwd( 1 ); // move 1 code point forward
87  if ( _test_end() ) return; // exit if we hit the end
88  if ( _utf16_surrogate_follow( mIter[0] ) ) {
89  // landing on a follow code point means we might be part of a bigger character
90  // so we test for that
91  code_point lead_half = 0;
92  //NB: we can't possibly be at the beginning here, so no need to test
93  lead_half = mIter[-1]; // check the previous code point to see if we're part of a surrogate pair
94  if ( _utf16_surrogate_lead( lead_half ) ) {
95  _seekFwd( 1 ); // if so, then advance 1 more code point
96  }
97  }
98  }
99  //--------------------------------------------------------------------------
101  {
102  _seekRev( 1 ); // move 1 code point backwards
103  if ( _test_begin() ) return; // exit if we hit the beginning
104  if ( _utf16_surrogate_follow( mIter[0] ) ) {
105  // landing on a follow code point means we might be part of a bigger character
106  // so we test for that
107  code_point lead_half = 0;
108  lead_half = mIter[-1]; // check the previous character to see if we're part of a surrogate pair
109  if ( _utf16_surrogate_lead( lead_half ) ) {
110  _seekRev( 1 ); // if so, then rewind 1 more code point
111  }
112  }
113  }
114  //--------------------------------------------------------------------------
115  //--------------------------------------------------------------------------
116  //--------------------------------------------------------------------------
117  //--------------------------------------------------------------------------
119  {
120 
121  }
122  //--------------------------------------------------------------------------
124  {
125  _become( i );
126  }
127  //--------------------------------------------------------------------------
129  {
130  _seekFwd( 1 );
131  return *this;
132  }
133  //--------------------------------------------------------------------------
135  {
136  _fwd_iterator tmp( *this );
137  _seekFwd( 1 );
138  return tmp;
139  }
140  //--------------------------------------------------------------------------
142  {
143  _seekRev( 1 );
144  return *this;
145  }
146  //--------------------------------------------------------------------------
148  {
149  _fwd_iterator tmp( *this );
150  _seekRev( 1 );
151  return tmp;
152  }
153  //--------------------------------------------------------------------------
155  {
156  _fwd_iterator tmp( *this );
157  if ( n < 0 )
158  tmp._seekRev( -n );
159  else
160  tmp._seekFwd( n );
161  return tmp;
162  }
163  //--------------------------------------------------------------------------
165  {
166  _fwd_iterator tmp( *this );
167  if ( n < 0 )
168  tmp._seekFwd( -n );
169  else
170  tmp._seekRev( n );
171  return tmp;
172  }
173  //--------------------------------------------------------------------------
175  {
176  if ( n < 0 )
177  _seekRev( -n );
178  else
179  _seekFwd( n );
180  return *this;
181  }
182  //--------------------------------------------------------------------------
184  {
185  if ( n < 0 )
186  _seekFwd( -n );
187  else
188  _seekRev( n );
189  return *this;
190  }
191  //--------------------------------------------------------------------------
193  {
194  return *mIter;
195  }
196  //--------------------------------------------------------------------------
198  {
199  _fwd_iterator tmp( *this );
200  tmp += n;
201  return *tmp;
202  }
203  //--------------------------------------------------------------------------
205  {
206  _moveNext();
207  return *this;
208  }
209  //--------------------------------------------------------------------------
211  {
212  _movePrev();
213  return *this;
214  }
215  //--------------------------------------------------------------------------
217  {
218  return _getCharacter();
219  }
220  //--------------------------------------------------------------------------
222  {
223  return _setCharacter( uc );
224  }
225  //--------------------------------------------------------------------------
226  //--------------------------------------------------------------------------
227  //--------------------------------------------------------------------------
228  //--------------------------------------------------------------------------
230  {
231 
232  }
233  //--------------------------------------------------------------------------
235  {
236  _become( i );
237  }
238  //--------------------------------------------------------------------------
240  {
241  _become( i );
242  }
243  //--------------------------------------------------------------------------
245  {
246  _seekFwd( 1 );
247  return *this;
248  }
249  //--------------------------------------------------------------------------
251  {
252  _const_fwd_iterator tmp( *this );
253  _seekFwd( 1 );
254  return tmp;
255  }
256  //--------------------------------------------------------------------------
258  {
259  _seekRev( 1 );
260  return *this;
261  }
262  //--------------------------------------------------------------------------
264  {
265  _const_fwd_iterator tmp( *this );
266  _seekRev( 1 );
267  return tmp;
268  }
269  //--------------------------------------------------------------------------
271  {
272  _const_fwd_iterator tmp( *this );
273  if ( n < 0 )
274  tmp._seekRev( -n );
275  else
276  tmp._seekFwd( n );
277  return tmp;
278  }
279  //--------------------------------------------------------------------------
281  {
282  _const_fwd_iterator tmp( *this );
283  if ( n < 0 )
284  tmp._seekFwd( -n );
285  else
286  tmp._seekRev( n );
287  return tmp;
288  }
289  //--------------------------------------------------------------------------
291  {
292  if ( n < 0 )
293  _seekRev( -n );
294  else
295  _seekFwd( n );
296  return *this;
297  }
298  //--------------------------------------------------------------------------
300  {
301  if ( n < 0 )
302  _seekFwd( -n );
303  else
304  _seekRev( n );
305  return *this;
306  }
307  //--------------------------------------------------------------------------
309  {
310  return *mIter;
311  }
312  //--------------------------------------------------------------------------
314  {
315  _const_fwd_iterator tmp( *this );
316  tmp += n;
317  return *tmp;
318  }
319  //--------------------------------------------------------------------------
321  {
322  _moveNext();
323  return *this;
324  }
325  //--------------------------------------------------------------------------
327  {
328  _movePrev();
329  return *this;
330  }
331  //--------------------------------------------------------------------------
333  {
334  return _getCharacter();
335  }
336  //--------------------------------------------------------------------------
337  //--------------------------------------------------------------------------
338  //--------------------------------------------------------------------------
339  //--------------------------------------------------------------------------
341  {
342 
343  }
344  //--------------------------------------------------------------------------
346  {
347  _become( i );
348  }
349  //--------------------------------------------------------------------------
351  {
352  _seekRev( 1 );
353  return *this;
354  }
355  //--------------------------------------------------------------------------
357  {
358  _rev_iterator tmp( *this );
359  _seekRev( 1 );
360  return tmp;
361  }
362  //--------------------------------------------------------------------------
364  {
365  _seekFwd( 1 );
366  return *this;
367  }
368  //--------------------------------------------------------------------------
370  {
371  _rev_iterator tmp( *this );
372  _seekFwd( 1 );
373  return tmp;
374  }
375  //--------------------------------------------------------------------------
377  {
378  _rev_iterator tmp( *this );
379  if ( n < 0 )
380  tmp._seekFwd( -n );
381  else
382  tmp._seekRev( n );
383  return tmp;
384  }
385  //--------------------------------------------------------------------------
387  {
388  _rev_iterator tmp( *this );
389  if ( n < 0 )
390  tmp._seekRev( -n );
391  else
392  tmp._seekFwd( n );
393  return tmp;
394  }
395  //--------------------------------------------------------------------------
397  {
398  if ( n < 0 )
399  _seekFwd( -n );
400  else
401  _seekRev( n );
402  return *this;
403  }
404  //--------------------------------------------------------------------------
406  {
407  if ( n < 0 )
408  _seekRev( -n );
409  else
410  _seekFwd( n );
411  return *this;
412  }
413  //--------------------------------------------------------------------------
415  {
416  return mIter[-1];
417  }
418  //--------------------------------------------------------------------------
420  {
421  _rev_iterator tmp( *this );
422  tmp -= n;
423  return *tmp;
424  }
425  //--------------------------------------------------------------------------
426  //--------------------------------------------------------------------------
427  //--------------------------------------------------------------------------
428  //--------------------------------------------------------------------------
430  {
431 
432  }
433  //--------------------------------------------------------------------------
435  {
436  _become( i );
437  }
438  //--------------------------------------------------------------------------
440  {
441  _become( i );
442  }
443  //--------------------------------------------------------------------------
445  {
446  _seekRev( 1 );
447  return *this;
448  }
449  //--------------------------------------------------------------------------
451  {
452  _const_rev_iterator tmp( *this );
453  _seekRev( 1 );
454  return tmp;
455  }
456  //--------------------------------------------------------------------------
458  {
459  _seekFwd( 1 );
460  return *this;
461  }
462  //--------------------------------------------------------------------------
464  {
465  _const_rev_iterator tmp( *this );
466  _seekFwd( 1 );
467  return tmp;
468  }
469  //--------------------------------------------------------------------------
471  {
472  _const_rev_iterator tmp( *this );
473  if ( n < 0 )
474  tmp._seekFwd( -n );
475  else
476  tmp._seekRev( n );
477  return tmp;
478  }
479  //--------------------------------------------------------------------------
481  {
482  _const_rev_iterator tmp( *this );
483  if ( n < 0 )
484  tmp._seekRev( -n );
485  else
486  tmp._seekFwd( n );
487  return tmp;
488  }
489  //--------------------------------------------------------------------------
491  {
492  if ( n < 0 )
493  _seekFwd( -n );
494  else
495  _seekRev( n );
496  return *this;
497  }
498  //--------------------------------------------------------------------------
500  {
501  if ( n < 0 )
502  _seekRev( -n );
503  else
504  _seekFwd( n );
505  return *this;
506  }
507  //--------------------------------------------------------------------------
509  {
510  return mIter[-1];
511  }
512  //--------------------------------------------------------------------------
514  {
515  _const_rev_iterator tmp( *this );
516  tmp -= n;
517  return *tmp;
518  }
519  //--------------------------------------------------------------------------
520  //--------------------------------------------------------------------------
521  //--------------------------------------------------------------------------
522  //--------------------------------------------------------------------------
524  {
525  _init();
526  }
527  //--------------------------------------------------------------------------
528  UString::UString( const UString& copy )
529  {
530  _init();
531  mData = copy.mData;
532  }
533  //--------------------------------------------------------------------------
535  {
536  _init();
537  assign( length, ch );
538  }
539  //--------------------------------------------------------------------------
541  {
542  _init();
543  assign( str );
544  }
545  //--------------------------------------------------------------------------
547  {
548  _init();
549  assign( str, length );
550  }
551  //--------------------------------------------------------------------------
553  {
554  _init();
555  assign( str, index, length );
556  }
557  //--------------------------------------------------------------------------
558 #if MYGUI_IS_NATIVE_WCHAR_T
559  UString::UString( const wchar_t* w_str )
560  {
561  _init();
562  assign( w_str );
563  }
564  //--------------------------------------------------------------------------
565  UString::UString( const wchar_t* w_str, size_type length )
566  {
567  _init();
568  assign( w_str, length );
569  }
570 #endif
571  //--------------------------------------------------------------------------
572  UString::UString( const std::wstring& wstr )
573  {
574  _init();
575  assign( wstr );
576  }
577  //--------------------------------------------------------------------------
578  UString::UString( const char* c_str )
579  {
580  _init();
581  assign( c_str );
582  }
583  //--------------------------------------------------------------------------
585  {
586  _init();
587  assign( c_str, length );
588  }
589  //--------------------------------------------------------------------------
590  UString::UString( const std::string& str )
591  {
592  _init();
593  assign( str );
594  }
595  //--------------------------------------------------------------------------
597  {
598  _cleanBuffer();
599  }
600  //--------------------------------------------------------------------------
602  {
603  return mData.size();
604  }
605  //--------------------------------------------------------------------------
607  {
608  return size();
609  }
610  //--------------------------------------------------------------------------
612  {
613  const_iterator i = begin(), ie = end();
614  size_type c = 0;
615  while ( i != ie ) {
616  i.moveNext();
617  ++c;
618  }
619  return c;
620  }
621  //--------------------------------------------------------------------------
623  {
624  return mData.max_size();
625  }
626  //--------------------------------------------------------------------------
628  {
629  mData.reserve( size );
630  }
631  //--------------------------------------------------------------------------
632  void UString::resize( size_type num, const code_point& val /*= 0 */ )
633  {
634  mData.resize( num, val );
635  }
636  //--------------------------------------------------------------------------
637  void UString::swap( UString& from )
638  {
639  mData.swap( from.mData );
640  }
641  //--------------------------------------------------------------------------
642  bool UString::empty() const
643  {
644  return mData.empty();
645  }
646  //--------------------------------------------------------------------------
648  {
649  return mData.c_str();
650  }
651  //--------------------------------------------------------------------------
653  {
654  return c_str();
655  }
656  //--------------------------------------------------------------------------
658  {
659  return mData.capacity();
660  }
661  //--------------------------------------------------------------------------
663  {
664  mData.clear();
665  }
666  //--------------------------------------------------------------------------
667  UString UString::substr( size_type index, size_type num /*= npos */ ) const
668  {
669  // this could avoid the extra copy if we used a private specialty constructor
670  dstring data = mData.substr( index, num );
671  UString tmp;
672  tmp.mData.swap( data );
673  return tmp;
674  }
675  //--------------------------------------------------------------------------
677  {
678  code_point cp[2];
679  size_t c = _utf32_to_utf16( val, cp );
680  if ( c > 0 ) push_back( cp[0] );
681  if ( c > 1 ) push_back( cp[1] );
682  }
683  //--------------------------------------------------------------------------
684 #if MYGUI_IS_NATIVE_WCHAR_T
685  void UString::push_back( wchar_t val )
686  {
687  // we do this because the Unicode method still preserves UTF-16 code points
688  mData.push_back( static_cast<code_point>( val ) );
689  }
690 #endif
691  //--------------------------------------------------------------------------
693  {
694  mData.push_back( val );
695  }
696 
697  void UString::push_back( char val )
698  {
699  mData.push_back( static_cast<code_point>( val ) );
700  }
701 
703  {
704  const_iterator i, ie = end();
705  for ( i = begin(); i != ie; i.moveNext() ) {
706  if ( i.getCharacter() == ch )
707  return true;
708  }
709  return false;
710  }
711 
712  const std::string& UString::asUTF8() const
713  {
714  _load_buffer_UTF8();
715  return *m_buffer.mStrBuffer;
716  }
717 
718  const char* UString::asUTF8_c_str() const
719  {
720  _load_buffer_UTF8();
721  return m_buffer.mStrBuffer->c_str();
722  }
723 
725  {
726  _load_buffer_UTF32();
727  return *m_buffer.mUTF32StrBuffer;
728  }
729 
731  {
732  _load_buffer_UTF32();
733  return m_buffer.mUTF32StrBuffer->c_str();
734  }
735 
736  const std::wstring& UString::asWStr() const
737  {
738  _load_buffer_WStr();
739  return *m_buffer.mWStrBuffer;
740  }
741 
742  const wchar_t* UString::asWStr_c_str() const
743  {
744  _load_buffer_WStr();
745  return m_buffer.mWStrBuffer->c_str();
746  }
747 
749  {
750  return mData.at( loc );
751  }
752 
754  {
755  return mData.at( loc );
756  }
757 
759  {
760  const code_point* ptr = c_str();
761  unicode_char uc;
762  size_t l = _utf16_char_length( ptr[loc] );
763  code_point cp[2] = { /* blame the code beautifier */
764  0, 0
765  };
766  cp[0] = ptr[loc];
767 
768  if ( l == 2 && ( loc + 1 ) < mData.length() ) {
769  cp[1] = ptr[loc+1];
770  }
771  _utf16_to_utf32( cp, uc );
772  return uc;
773  }
774 
776  {
777  code_point cp[2] = { /* blame the code beautifier */
778  0, 0
779  };
780  size_t l = _utf32_to_utf16( ch, cp );
781  unicode_char existingChar = getChar( loc );
782  size_t existingSize = _utf16_char_length( existingChar );
783  size_t newSize = _utf16_char_length( ch );
784 
785  if ( newSize > existingSize ) {
786  at( loc ) = cp[0];
787  insert( loc + 1, 1, cp[1] );
788  return 1;
789  }
790  if ( newSize < existingSize ) {
791  erase( loc, 1 );
792  at( loc ) = cp[0];
793  return -1;
794  }
795 
796  // newSize == existingSize
797  at( loc ) = cp[0];
798  if ( l == 2 ) at( loc + 1 ) = cp[1];
799  return 0;
800  }
801 
803  {
804  iterator i;
805  i.mIter = mData.begin();
806  i.mString = this;
807  return i;
808  }
809 
811  {
812  const_iterator i;
813  i.mIter = const_cast<UString*>( this )->mData.begin();
814  i.mString = const_cast<UString*>( this );
815  return i;
816  }
817 
819  {
820  iterator i;
821  i.mIter = mData.end();
822  i.mString = this;
823  return i;
824  }
825 
827  {
828  const_iterator i;
829  i.mIter = const_cast<UString*>( this )->mData.end();
830  i.mString = const_cast<UString*>( this );
831  return i;
832  }
833 
835  {
837  i.mIter = mData.end();
838  i.mString = this;
839  return i;
840  }
841 
843  {
845  i.mIter = const_cast<UString*>( this )->mData.end();
846  i.mString = const_cast<UString*>( this );
847  return i;
848  }
849 
851  {
853  i.mIter = mData.begin();
854  i.mString = this;
855  return i;
856  }
857 
859  {
861  i.mIter = const_cast<UString*>( this )->mData.begin();
862  i.mString = const_cast<UString*>( this );
863  return i;
864  }
865 
867  {
868  mData.assign( start.mIter, end.mIter );
869  return *this;
870  }
871 
873  {
874  mData.assign( str.mData );
875  return *this;
876  }
877 
879  {
880  mData.assign( str );
881  return *this;
882  }
883 
885  {
886  mData.assign( str, num );
887  return *this;
888  }
889 
891  {
892  mData.assign( str.mData, index, len );
893  return *this;
894  }
895 
897  {
898  mData.assign( num, ch );
899  return *this;
900  }
901 
902  UString& UString::assign( const std::wstring& wstr )
903  {
904  mData.clear();
905  mData.reserve( wstr.length() ); // best guess bulk allocate
906 #ifdef WCHAR_UTF16 // if we're already working in UTF-16, this is easy
907  code_point tmp;
908  std::wstring::const_iterator i, ie = wstr.end();
909  for ( i = wstr.begin(); i != ie; i++ ) {
910  tmp = static_cast<code_point>( *i );
911  mData.push_back( tmp );
912  }
913 #else // otherwise we do it the safe way (which is still 100% safe to pass UTF-16 through, just slower)
914  code_point cp[3] = {0, 0, 0};
915  unicode_char tmp;
916  std::wstring::const_iterator i, ie = wstr.end();
917  for ( i = wstr.begin(); i != ie; i++ ) {
918  tmp = static_cast<unicode_char>( *i );
919  size_t l = _utf32_to_utf16( tmp, cp );
920  if ( l > 0 ) mData.push_back( cp[0] );
921  if ( l > 1 ) mData.push_back( cp[1] );
922  }
923 #endif
924  return *this;
925  }
926 
927 #if MYGUI_IS_NATIVE_WCHAR_T
928  UString& UString::assign( const wchar_t* w_str )
929  {
930  std::wstring tmp;
931  tmp.assign( w_str );
932  return assign( tmp );
933  }
934 
935  UString& UString::assign( const wchar_t* w_str, size_type num )
936  {
937  std::wstring tmp;
938  tmp.assign( w_str, num );
939  return assign( tmp );
940  }
941 #endif
942 
943  UString& UString::assign( const std::string& str )
944  {
945  size_type len = _verifyUTF8( str );
946  clear(); // empty our contents, if there are any
947  reserve( len ); // best guess bulk capacity growth
948 
949  // This is a 3 step process, converting each byte in the UTF-8 stream to UTF-32,
950  // then converting it to UTF-16, then finally appending the data buffer
951 
952  unicode_char uc; // temporary Unicode character buffer
953  unsigned char utf8buf[7]; // temporary UTF-8 buffer
954  utf8buf[6] = 0;
955  size_t utf8len; // UTF-8 length
956  code_point utf16buff[3]; // temporary UTF-16 buffer
957  utf16buff[2] = 0;
958  size_t utf16len; // UTF-16 length
959 
960  std::string::const_iterator i, ie = str.end();
961  for ( i = str.begin(); i != ie; i++ ) {
962  utf8len = _utf8_char_length( static_cast<unsigned char>( *i ) ); // estimate bytes to load
963  for ( size_t j = 0; j < utf8len; j++ ) { // load the needed UTF-8 bytes
964  utf8buf[j] = ( static_cast<unsigned char>( *( i + j ) ) ); // we don't increment 'i' here just in case the estimate is wrong (shouldn't happen, but we're being careful)
965  }
966  utf8buf[utf8len] = 0; // nul terminate so we throw an exception before running off the end of the buffer
967  utf8len = _utf8_to_utf32( utf8buf, uc ); // do the UTF-8 -> UTF-32 conversion
968  i += utf8len - 1; // we subtract 1 for the increment of the 'for' loop
969 
970  utf16len = _utf32_to_utf16( uc, utf16buff ); // UTF-32 -> UTF-16 conversion
971  append( utf16buff, utf16len ); // append the characters to the string
972  }
973  return *this;
974  }
975 
976  UString& UString::assign( const char* c_str )
977  {
978  std::string tmp( c_str );
979  return assign( tmp );
980  }
981 
982  UString& UString::assign( const char* c_str, size_type num )
983  {
984  std::string tmp;
985  tmp.assign( c_str, num );
986  return assign( tmp );
987  }
988 
990  {
991  mData.append( str.mData );
992  return *this;
993  }
994 
996  {
997  mData.append( str );
998  return *this;
999  }
1000 
1002  {
1003  mData.append( str.mData, index, len );
1004  return *this;
1005  }
1006 
1008  {
1009  mData.append( str, num );
1010  return *this;
1011  }
1012 
1014  {
1015  mData.append( num, ch );
1016  return *this;
1017  }
1018 
1020  {
1021  mData.append( start.mIter, end.mIter );
1022  return *this;
1023  }
1024 
1025 #if MYGUI_IS_NATIVE_WCHAR_T
1026  UString& UString::append( const wchar_t* w_str, size_type num )
1027  {
1028  std::wstring tmp( w_str, num );
1029  return append( tmp );
1030  }
1031 
1032  UString& UString::append( size_type num, wchar_t ch )
1033  {
1034  return append( num, static_cast<unicode_char>( ch ) );
1035  }
1036 #endif
1038  {
1039  UString tmp( c_str, num );
1040  append( tmp );
1041  return *this;
1042  }
1043 
1045  {
1046  append( num, static_cast<code_point>( ch ) );
1047  return *this;
1048  }
1049 
1051  {
1052  code_point cp[2] = {0, 0};
1053  if ( _utf32_to_utf16( ch, cp ) == 2 ) {
1054  for ( size_type i = 0; i < num; i++ ) {
1055  append( 1, cp[0] );
1056  append( 1, cp[1] );
1057  }
1058  } else {
1059  for ( size_type i = 0; i < num; i++ ) {
1060  append( 1, cp[0] );
1061  }
1062  }
1063  return *this;
1064  }
1065 
1067  {
1068  iterator ret;
1069  ret.mIter = mData.insert( i.mIter, ch );
1070  ret.mString = this;
1071  return ret;
1072  }
1073 
1075  {
1076  mData.insert( index, str.mData );
1077  return *this;
1078  }
1079 
1080  UString& UString::insert( size_type index1, const UString& str, size_type index2, size_type num )
1081  {
1082  mData.insert( index1, str.mData, index2, num );
1083  return *this;
1084  }
1085 
1087  {
1088  mData.insert( i.mIter, start.mIter, end.mIter );
1089  }
1090 
1092  {
1093  mData.insert( index, str, num );
1094  return *this;
1095  }
1096 
1097 #if MYGUI_IS_NATIVE_WCHAR_T
1098  UString& UString::insert( size_type index, const wchar_t* w_str, size_type num )
1099  {
1100  UString tmp( w_str, num );
1101  insert( index, tmp );
1102  return *this;
1103  }
1104 #endif
1105 
1106  UString& UString::insert( size_type index, const char* c_str, size_type num )
1107  {
1108  UString tmp( c_str, num );
1109  insert( index, tmp );
1110  return *this;
1111  }
1112 
1114  {
1115  mData.insert( index, num, ch );
1116  return *this;
1117  }
1118 
1119 #if MYGUI_IS_NATIVE_WCHAR_T
1120  UString& UString::insert( size_type index, size_type num, wchar_t ch )
1121  {
1122  insert( index, num, static_cast<unicode_char>( ch ) );
1123  return *this;
1124  }
1125 #endif
1126 
1127  UString& UString::insert( size_type index, size_type num, char ch )
1128  {
1129  insert( index, num, static_cast<code_point>( ch ) );
1130  return *this;
1131  }
1132 
1134  {
1135  code_point cp[3] = {0, 0, 0};
1136  size_t l = _utf32_to_utf16( ch, cp );
1137  if ( l == 1 ) {
1138  return insert( index, num, cp[0] );
1139  }
1140  for ( size_type c = 0; c < num; c++ ) {
1141  // insert in reverse order to preserve ordering after insert
1142  insert( index, 1, cp[1] );
1143  insert( index, 1, cp[0] );
1144  }
1145  return *this;
1146  }
1147 
1148  void UString::insert( iterator i, size_type num, const code_point& ch )
1149  {
1150  mData.insert( i.mIter, num, ch );
1151  }
1152 #if MYGUI_IS_NATIVE_WCHAR_T
1153  void UString::insert( iterator i, size_type num, const wchar_t& ch )
1154  {
1155  insert( i, num, static_cast<unicode_char>( ch ) );
1156  }
1157 #endif
1158 
1159  void UString::insert( iterator i, size_type num, const char& ch )
1160  {
1161  insert( i, num, static_cast<code_point>( ch ) );
1162  }
1163 
1165  {
1166  code_point cp[3] = {0, 0, 0};
1167  size_t l = _utf32_to_utf16( ch, cp );
1168  if ( l == 1 ) {
1169  insert( i, num, cp[0] );
1170  } else {
1171  for ( size_type c = 0; c < num; c++ ) {
1172  // insert in reverse order to preserve ordering after insert
1173  insert( i, 1, cp[1] );
1174  insert( i, 1, cp[0] );
1175  }
1176  }
1177  }
1178 
1180  {
1181  iterator ret;
1182  ret.mIter = mData.erase( loc.mIter );
1183  ret.mString = this;
1184  return ret;
1185  }
1186 
1188  {
1189  iterator ret;
1190  ret.mIter = mData.erase( start.mIter, end.mIter );
1191  ret.mString = this;
1192  return ret;
1193  }
1194 
1195  UString& UString::erase( size_type index /*= 0*/, size_type num /*= npos */ )
1196  {
1197  if ( num == npos )
1198  mData.erase( index );
1199  else
1200  mData.erase( index, num );
1201  return *this;
1202  }
1203 
1204  UString& UString::replace( size_type index1, size_type num1, const UString& str )
1205  {
1206  mData.replace( index1, num1, str.mData, 0, npos );
1207  return *this;
1208  }
1209 
1210  UString& UString::replace( size_type index1, size_type num1, const UString& str, size_type num2 )
1211  {
1212  mData.replace( index1, num1, str.mData, 0, num2 );
1213  return *this;
1214  }
1215 
1216  UString& UString::replace( size_type index1, size_type num1, const UString& str, size_type index2, size_type num2 )
1217  {
1218  mData.replace( index1, num1, str.mData, index2, num2 );
1219  return *this;
1220  }
1221 
1222  UString& UString::replace( iterator start, iterator end, const UString& str, size_type num /*= npos */ )
1223  {
1224  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1225 
1226  size_type index1 = begin() - st;
1227  size_type num1 = end - st;
1228  return replace( index1, num1, str, 0, num );
1229  }
1230 
1232  {
1233  mData.replace( index, num1, num2, ch );
1234  return *this;
1235  }
1236 
1238  {
1239  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1240 
1241  size_type index1 = begin() - st;
1242  size_type num1 = end - st;
1243  return replace( index1, num1, num, ch );
1244  }
1245 
1246  int UString::compare( const UString& str ) const
1247  {
1248  return mData.compare( str.mData );
1249  }
1250 
1251  int UString::compare( const code_point* str ) const
1252  {
1253  return mData.compare( str );
1254  }
1255 
1256  int UString::compare( size_type index, size_type length, const UString& str ) const
1257  {
1258  return mData.compare( index, length, str.mData );
1259  }
1260 
1261  int UString::compare( size_type index, size_type length, const UString& str, size_type index2, size_type length2 ) const
1262  {
1263  return mData.compare( index, length, str.mData, index2, length2 );
1264  }
1265 
1266  int UString::compare( size_type index, size_type length, const code_point* str, size_type length2 ) const
1267  {
1268  return mData.compare( index, length, str, length2 );
1269  }
1270 
1271 #if MYGUI_IS_NATIVE_WCHAR_T
1272  int UString::compare( size_type index, size_type length, const wchar_t* w_str, size_type length2 ) const
1273  {
1274  UString tmp( w_str, length2 );
1275  return compare( index, length, tmp );
1276  }
1277 #endif
1278 
1279  int UString::compare( size_type index, size_type length, const char* c_str, size_type length2 ) const
1280  {
1281  UString tmp( c_str, length2 );
1282  return compare( index, length, tmp );
1283  }
1284 
1285  UString::size_type UString::find( const UString& str, size_type index /*= 0 */ ) const
1286  {
1287  return mData.find( str.c_str(), index );
1288  }
1289 
1291  {
1292  UString tmp( cp_str );
1293  return mData.find( tmp.c_str(), index, length );
1294  }
1295 
1297  {
1298  UString tmp( c_str );
1299  return mData.find( tmp.c_str(), index, length );
1300  }
1301 
1302 #if MYGUI_IS_NATIVE_WCHAR_T
1303  UString::size_type UString::find( const wchar_t* w_str, size_type index, size_type length ) const
1304  {
1305  UString tmp( w_str );
1306  return mData.find( tmp.c_str(), index, length );
1307  }
1308 #endif
1309 
1310  UString::size_type UString::find( char ch, size_type index /*= 0 */ ) const
1311  {
1312  return find( static_cast<code_point>( ch ), index );
1313  }
1314 
1316  {
1317  return mData.find( ch, index );
1318  }
1319 
1320 #if MYGUI_IS_NATIVE_WCHAR_T
1321  UString::size_type UString::find( wchar_t ch, size_type index /*= 0 */ ) const
1322  {
1323  return find( static_cast<unicode_char>( ch ), index );
1324  }
1325 #endif
1326 
1328  {
1329  code_point cp[3] = {0, 0, 0};
1330  size_t l = _utf32_to_utf16( ch, cp );
1331  return find( UString( cp, l ), index );
1332  }
1333 
1334  UString::size_type UString::rfind( const UString& str, size_type index /*= 0 */ ) const
1335  {
1336  return mData.rfind( str.c_str(), index );
1337  }
1338 
1340  {
1341  UString tmp( cp_str );
1342  return mData.rfind( tmp.c_str(), index, num );
1343  }
1344 
1345  UString::size_type UString::rfind( const char* c_str, size_type index, size_type num ) const
1346  {
1347  UString tmp( c_str );
1348  return mData.rfind( tmp.c_str(), index, num );
1349  }
1350 
1351 #if MYGUI_IS_NATIVE_WCHAR_T
1352  UString::size_type UString::rfind( const wchar_t* w_str, size_type index, size_type num ) const
1353  {
1354  UString tmp( w_str );
1355  return mData.rfind( tmp.c_str(), index, num );
1356  }
1357 #endif
1358 
1359  UString::size_type UString::rfind( char ch, size_type index /*= 0 */ ) const
1360  {
1361  return rfind( static_cast<code_point>( ch ), index );
1362  }
1363 
1365  {
1366  return mData.rfind( ch, index );
1367  }
1368 
1369 #if MYGUI_IS_NATIVE_WCHAR_T
1370  UString::size_type UString::rfind( wchar_t ch, size_type index /*= 0 */ ) const
1371  {
1372  return rfind( static_cast<unicode_char>( ch ), index );
1373  }
1374 #endif
1375 
1377  {
1378  code_point cp[3] = {0, 0, 0};
1379  size_t l = _utf32_to_utf16( ch, cp );
1380  return rfind( UString( cp, l ), index );
1381  }
1382 
1383  UString::size_type UString::find_first_of( const UString &str, size_type index /*= 0*/, size_type num /*= npos */ ) const
1384  {
1385  size_type i = 0;
1386  const size_type len = length();
1387  while ( i < num && ( index + i ) < len ) {
1388  unicode_char ch = getChar( index + i );
1389  if ( str.inString( ch ) )
1390  return index + i;
1391  i += _utf16_char_length( ch ); // increment by the Unicode character length
1392  }
1393  return npos;
1394  }
1395 
1397  {
1398  UString tmp;
1399  tmp.assign( 1, ch );
1400  return find_first_of( tmp, index );
1401  }
1402 
1403  UString::size_type UString::find_first_of( char ch, size_type index /*= 0 */ ) const
1404  {
1405  return find_first_of( static_cast<code_point>( ch ), index );
1406  }
1407 
1408 #if MYGUI_IS_NATIVE_WCHAR_T
1409  UString::size_type UString::find_first_of( wchar_t ch, size_type index /*= 0 */ ) const
1410  {
1411  return find_first_of( static_cast<unicode_char>( ch ), index );
1412  }
1413 #endif
1414 
1416  {
1417  code_point cp[3] = {0, 0, 0};
1418  size_t l = _utf32_to_utf16( ch, cp );
1419  return find_first_of( UString( cp, l ), index );
1420  }
1421 
1422  UString::size_type UString::find_first_not_of( const UString& str, size_type index /*= 0*/, size_type num /*= npos */ ) const
1423  {
1424  size_type i = 0;
1425  const size_type len = length();
1426  while ( i < num && ( index + i ) < len ) {
1427  unicode_char ch = getChar( index + i );
1428  if ( !str.inString( ch ) )
1429  return index + i;
1430  i += _utf16_char_length( ch ); // increment by the Unicode character length
1431  }
1432  return npos;
1433  }
1434 
1436  {
1437  UString tmp;
1438  tmp.assign( 1, ch );
1439  return find_first_not_of( tmp, index );
1440  }
1441 
1443  {
1444  return find_first_not_of( static_cast<code_point>( ch ), index );
1445  }
1446 
1447 #if MYGUI_IS_NATIVE_WCHAR_T
1448  UString::size_type UString::find_first_not_of( wchar_t ch, size_type index /*= 0 */ ) const
1449  {
1450  return find_first_not_of( static_cast<unicode_char>( ch ), index );
1451  }
1452 #endif
1453 
1455  {
1456  code_point cp[3] = {0, 0, 0};
1457  size_t l = _utf32_to_utf16( ch, cp );
1458  return find_first_not_of( UString( cp, l ), index );
1459  }
1460 
1461  UString::size_type UString::find_last_of( const UString& str, size_type index /*= npos*/, size_type num /*= npos */ ) const
1462  {
1463  size_type i = 0;
1464  const size_type len = length();
1465  if ( index > len ) index = len - 1;
1466 
1467  while ( i < num && ( index - i ) != npos ) {
1468  size_type j = index - i;
1469  // careful to step full Unicode characters
1470  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) ) {
1471  j = index - ++i;
1472  }
1473  // and back to the usual dull test
1474  unicode_char ch = getChar( j );
1475  if ( str.inString( ch ) )
1476  return j;
1477  i++;
1478  }
1479  return npos;
1480  }
1481 
1483  {
1484  UString tmp;
1485  tmp.assign( 1, ch );
1486  return find_last_of( tmp, index );
1487  }
1488 
1489 #if MYGUI_IS_NATIVE_WCHAR_T
1490  UString::size_type UString::find_last_of( wchar_t ch, size_type index /*= npos */ ) const
1491  {
1492  return find_last_of( static_cast<unicode_char>( ch ), index );
1493  }
1494 #endif
1495 
1497  {
1498  code_point cp[3] = {0, 0, 0};
1499  size_t l = _utf32_to_utf16( ch, cp );
1500  return find_last_of( UString( cp, l ), index );
1501  }
1502 
1503  UString::size_type UString::find_last_not_of( const UString& str, size_type index /*= npos*/, size_type num /*= npos */ ) const
1504  {
1505  size_type i = 0;
1506  const size_type len = length();
1507  if ( index > len ) index = len - 1;
1508 
1509  while ( i < num && ( index - i ) != npos ) {
1510  size_type j = index - i;
1511  // careful to step full Unicode characters
1512  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) ) {
1513  j = index - ++i;
1514  }
1515  // and back to the usual dull test
1516  unicode_char ch = getChar( j );
1517  if ( !str.inString( ch ) )
1518  return j;
1519  i++;
1520  }
1521  return npos;
1522  }
1523 
1525  {
1526  UString tmp;
1527  tmp.assign( 1, ch );
1528  return find_last_not_of( tmp, index );
1529  }
1530 
1531  UString::size_type UString::find_last_not_of( char ch, size_type index /*= npos */ ) const
1532  {
1533  return find_last_not_of( static_cast<code_point>( ch ), index );
1534  }
1535 
1536 #if MYGUI_IS_NATIVE_WCHAR_T
1537  UString::size_type UString::find_last_not_of( wchar_t ch, size_type index /*= npos */ ) const
1538  {
1539  return find_last_not_of( static_cast<unicode_char>( ch ), index );
1540  }
1541 #endif
1542 
1544  {
1545  code_point cp[3] = {0, 0, 0};
1546  size_t l = _utf32_to_utf16( ch, cp );
1547  return find_last_not_of( UString( cp, l ), index );
1548  }
1549 
1550  bool UString::operator<( const UString& right ) const
1551  {
1552  return compare( right ) < 0;
1553  }
1554 
1555  bool UString::operator<=( const UString& right ) const
1556  {
1557  return compare( right ) <= 0;
1558  }
1559 
1561  {
1562  return assign( s );
1563  }
1564 
1566  {
1567  clear();
1568  return append( 1, ch );
1569  }
1570 
1572  {
1573  clear();
1574  return append( 1, ch );
1575  }
1576 
1577 #if MYGUI_IS_NATIVE_WCHAR_T
1578  UString& UString::operator=( wchar_t ch )
1579  {
1580  clear();
1581  return append( 1, ch );
1582  }
1583 #endif
1584 
1586  {
1587  clear();
1588  return append( 1, ch );
1589  }
1590 
1591  bool UString::operator>( const UString& right ) const
1592  {
1593  return compare( right ) > 0;
1594  }
1595 
1596  bool UString::operator>=( const UString& right ) const
1597  {
1598  return compare( right ) >= 0;
1599  }
1600 
1601  bool UString::operator==( const UString& right ) const
1602  {
1603  return compare( right ) == 0;
1604  }
1605 
1606  bool UString::operator!=( const UString& right ) const
1607  {
1608  return !operator==( right );
1609  }
1610 
1612  {
1613  return at( index );
1614  }
1615 
1617  {
1618  return at( index );
1619  }
1620 
1621  UString::operator std::string() const
1622  {
1623  return std::string( asUTF8() );
1624  }
1625 
1627  UString::operator std::wstring() const
1628  {
1629  return std::wstring( asWStr() );
1630  }
1631 
1632 
1634  {
1635  if ( 0xD800 <= cp && cp <= 0xDFFF ) // tests if the cp is within the surrogate pair range
1636  return false; // it matches a surrogate pair signature
1637  return true; // everything else is a standalone code point
1638  }
1639 
1641  {
1642  if ( 0xD800 <= cp && cp <= 0xDBFF ) // tests if the cp is within the 2nd word of a surrogate pair
1643  return true; // it is a 1st word
1644  return false; // it isn't
1645  }
1646 
1648  {
1649  if ( 0xDC00 <= cp && cp <= 0xDFFF ) // tests if the cp is within the 2nd word of a surrogate pair
1650  return true; // it is a 2nd word
1651  return false; // everything else isn't
1652  }
1653 
1655  {
1656  if ( 0xD800 <= cp && cp <= 0xDBFF ) // test if cp is the beginning of a surrogate pair
1657  return 2; // if it is, then we are 2 words long
1658  return 1; // otherwise we are only 1 word long
1659  }
1660 
1662  {
1663  if ( uc > 0xFFFF ) // test if uc is greater than the single word maximum
1664  return 2; // if so, we need a surrogate pair
1665  return 1; // otherwise we can stuff it into a single word
1666  }
1667 
1668  size_t UString::_utf16_to_utf32( const code_point in_cp[2], unicode_char& out_uc )
1669  {
1670  const code_point& cp1 = in_cp[0];
1671  const code_point& cp2 = in_cp[1];
1672  bool wordPair = false;
1673 
1674  // does it look like a surrogate pair?
1675  if ( 0xD800 <= cp1 && cp1 <= 0xDBFF ) {
1676  // looks like one, but does the other half match the algorithm as well?
1677  if ( 0xDC00 <= cp2 && cp2 <= 0xDFFF )
1678  wordPair = true; // yep!
1679  }
1680 
1681  if ( !wordPair ) { // if we aren't a 100% authentic surrogate pair, then just copy the value
1682  out_uc = cp1;
1683  return 1;
1684  }
1685 
1686  unsigned short cU = cp1, cL = cp2; // copy upper and lower words of surrogate pair to writable buffers
1687  cU -= 0xD800; // remove the encoding markers
1688  cL -= 0xDC00;
1689 
1690  out_uc = ( cU & 0x03FF ) << 10; // grab the 10 upper bits and set them in their proper location
1691  out_uc |= ( cL & 0x03FF ); // combine in the lower 10 bits
1692  out_uc += 0x10000; // add back in the value offset
1693 
1694  return 2; // this whole operation takes to words, so that's what we'll return
1695  }
1696 
1697  size_t UString::_utf32_to_utf16( const unicode_char& in_uc, code_point out_cp[2] )
1698  {
1699  if ( in_uc <= 0xFFFF ) { // we blindly preserve sentinel values because our decoder understands them
1700  out_cp[0] = static_cast<code_point>(in_uc);
1701  return 1;
1702  }
1703  unicode_char uc = in_uc; // copy to writable buffer
1704  unsigned short tmp; // single code point buffer
1705  uc -= 0x10000; // subtract value offset
1706 
1707  //process upper word
1708  tmp = static_cast<unsigned short>(( uc >> 10 ) & 0x03FF); // grab the upper 10 bits
1709  tmp += 0xD800; // add encoding offset
1710  out_cp[0] = tmp; // write
1711 
1712  // process lower word
1713  tmp = static_cast<unsigned short>(uc & 0x03FF); // grab the lower 10 bits
1714  tmp += 0xDC00; // add encoding offset
1715  out_cp[1] = tmp; // write
1716 
1717  return 2; // return used word count (2 for surrogate pairs)
1718  }
1719 
1720  bool UString::_utf8_start_char( unsigned char cp )
1721  {
1722  return ( cp & ~_cont_mask ) != _cont;
1723  }
1724 
1725  size_t UString::_utf8_char_length( unsigned char cp )
1726  {
1727  if ( !( cp & 0x80 ) ) return 1;
1728  if (( cp & ~_lead1_mask ) == _lead1 ) return 2;
1729  if (( cp & ~_lead2_mask ) == _lead2 ) return 3;
1730  if (( cp & ~_lead3_mask ) == _lead3 ) return 4;
1731  if (( cp & ~_lead4_mask ) == _lead4 ) return 5;
1732  if (( cp & ~_lead5_mask ) == _lead5 ) return 6;
1733  throw invalid_data( "invalid UTF-8 sequence header value" );
1734  }
1735 
1737  {
1738  /*
1739  7 bit: U-00000000 - U-0000007F: 0xxxxxxx
1740  11 bit: U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
1741  16 bit: U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
1742  21 bit: U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1743  26 bit: U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1744  31 bit: U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1745  */
1746  if ( !( uc & ~0x0000007F ) ) return 1;
1747  if ( !( uc & ~0x000007FF ) ) return 2;
1748  if ( !( uc & ~0x0000FFFF ) ) return 3;
1749  if ( !( uc & ~0x001FFFFF ) ) return 4;
1750  if ( !( uc & ~0x03FFFFFF ) ) return 5;
1751  if ( !( uc & ~0x7FFFFFFF ) ) return 6;
1752  throw invalid_data( "invalid UTF-32 value" );
1753  }
1754 
1755  size_t UString::_utf8_to_utf32( const unsigned char in_cp[6], unicode_char& out_uc )
1756  {
1757  size_t len = _utf8_char_length( in_cp[0] );
1758  if ( len == 1 ) { // if we are only 1 byte long, then just grab it and exit
1759  out_uc = in_cp[0];
1760  return 1;
1761  }
1762 
1763  unicode_char c = 0; // temporary buffer
1764  size_t i = 0;
1765  switch ( len ) { // load header byte
1766  case 6:
1767  c = in_cp[i] & _lead5_mask;
1768  break;
1769  case 5:
1770  c = in_cp[i] & _lead4_mask;
1771  break;
1772  case 4:
1773  c = in_cp[i] & _lead3_mask;
1774  break;
1775  case 3:
1776  c = in_cp[i] & _lead2_mask;
1777  break;
1778  case 2:
1779  c = in_cp[i] & _lead1_mask;
1780  break;
1781  }
1782 
1783  for ( ++i; i < len; i++ ) { // load each continuation byte
1784  if (( in_cp[i] & ~_cont_mask ) != _cont )
1785  throw invalid_data( "bad UTF-8 continuation byte" );
1786  c <<= 6;
1787  c |= ( in_cp[i] & _cont_mask );
1788  }
1789 
1790  out_uc = c; // write the final value and return the used byte length
1791  return len;
1792  }
1793 
1794  size_t UString::_utf32_to_utf8( const unicode_char& in_uc, unsigned char out_cp[6] )
1795  {
1796  size_t len = _utf8_char_length( in_uc ); // predict byte length of sequence
1797  unicode_char c = in_uc; // copy to temp buffer
1798 
1799  //stuff all of the lower bits
1800  for ( size_t i = len - 1; i > 0; i-- ) {
1801  out_cp[i] = static_cast<unsigned char>((( c ) & _cont_mask ) | _cont);
1802  c >>= 6;
1803  }
1804 
1805  //now write the header byte
1806  switch ( len ) {
1807  case 6:
1808  out_cp[0] = static_cast<unsigned char>((( c ) & _lead5_mask ) | _lead5);
1809  break;
1810  case 5:
1811  out_cp[0] = static_cast<unsigned char>((( c ) & _lead4_mask ) | _lead4);
1812  break;
1813  case 4:
1814  out_cp[0] = static_cast<unsigned char>((( c ) & _lead3_mask ) | _lead3);
1815  break;
1816  case 3:
1817  out_cp[0] = static_cast<unsigned char>((( c ) & _lead2_mask ) | _lead2);
1818  break;
1819  case 2:
1820  out_cp[0] = static_cast<unsigned char>((( c ) & _lead1_mask ) | _lead1);
1821  break;
1822  case 1:
1823  default:
1824  out_cp[0] = static_cast<unsigned char>(( c ) & 0x7F);
1825  break;
1826  }
1827 
1828  // return the byte length of the sequence
1829  return len;
1830  }
1831 
1833  {
1834  std::string tmp( reinterpret_cast<const char*>( c_str ) );
1835  return _verifyUTF8( tmp );
1836  }
1837 
1838  UString::size_type UString::_verifyUTF8( const std::string& str )
1839  {
1840  std::string::const_iterator i, ie = str.end();
1841  i = str.begin();
1842  size_type length = 0;
1843 
1844  while ( i != ie ) {
1845  // characters pass until we find an extended sequence
1846  if (( *i ) & 0x80 ) {
1847  unsigned char c = ( *i );
1848  size_t contBytes = 0;
1849 
1850  // get continuation byte count and test for overlong sequences
1851  if (( c & ~_lead1_mask ) == _lead1 ) { // 1 additional byte
1852  if ( c == _lead1 ) throw invalid_data( "overlong UTF-8 sequence" );
1853  contBytes = 1;
1854 
1855  } else if (( c & ~_lead2_mask ) == _lead2 ) { // 2 additional bytes
1856  contBytes = 2;
1857  if ( c == _lead2 ) { // possible overlong UTF-8 sequence
1858  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1859  if (( c & _lead2 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
1860  }
1861 
1862  } else if (( c & ~_lead3_mask ) == _lead3 ) { // 3 additional bytes
1863  contBytes = 3;
1864  if ( c == _lead3 ) { // possible overlong UTF-8 sequence
1865  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1866  if (( c & _lead3 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
1867  }
1868 
1869  } else if (( c & ~_lead4_mask ) == _lead4 ) { // 4 additional bytes
1870  contBytes = 4;
1871  if ( c == _lead4 ) { // possible overlong UTF-8 sequence
1872  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1873  if (( c & _lead4 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
1874  }
1875 
1876  } else if (( c & ~_lead5_mask ) == _lead5 ) { // 5 additional bytes
1877  contBytes = 5;
1878  if ( c == _lead5 ) { // possible overlong UTF-8 sequence
1879  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1880  if (( c & _lead5 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
1881  }
1882  }
1883 
1884  // check remaining continuation bytes for
1885  while ( contBytes-- ) {
1886  c = ( *( ++i ) ); // get next byte in sequence
1887  if (( c & ~_cont_mask ) != _cont )
1888  throw invalid_data( "bad UTF-8 continuation byte" );
1889  }
1890  }
1891  length++;
1892  i++;
1893  }
1894  return length;
1895  }
1896 
1897  void UString::_init()
1898  {
1899  m_buffer.mVoidBuffer = 0;
1900  m_bufferType = bt_none;
1901  m_bufferSize = 0;
1902  }
1903 
1904  void UString::_cleanBuffer() const
1905  {
1906  if ( m_buffer.mVoidBuffer != 0 ) {
1907  switch ( m_bufferType ) {
1908  case bt_string:
1909  delete m_buffer.mStrBuffer;
1910  break;
1911  case bt_wstring:
1912  delete m_buffer.mWStrBuffer;
1913  break;
1914  case bt_utf32string:
1915  delete m_buffer.mUTF32StrBuffer;
1916  break;
1917  case bt_none: // under the worse of circumstances, this is all we can do, and hope it works out
1918  default:
1919  //delete m_buffer.mVoidBuffer;
1920  // delete void* is undefined, don't do that
1921  assert("This should never happen - mVoidBuffer should never contain something if we "
1922  "don't know the type");
1923  break;
1924  }
1925  m_buffer.mVoidBuffer = 0;
1926  m_bufferSize = 0;
1927  m_bufferType = bt_none;
1928  }
1929  }
1930 
1931  void UString::_getBufferStr() const
1932  {
1933  if ( m_bufferType != bt_string ) {
1934  _cleanBuffer();
1935  m_buffer.mStrBuffer = new std::string();
1936  m_bufferType = bt_string;
1937  }
1938  m_buffer.mStrBuffer->clear();
1939  }
1940 
1941  void UString::_getBufferWStr() const
1942  {
1943  if ( m_bufferType != bt_wstring ) {
1944  _cleanBuffer();
1945  m_buffer.mWStrBuffer = new std::wstring();
1946  m_bufferType = bt_wstring;
1947  }
1948  m_buffer.mWStrBuffer->clear();
1949  }
1950 
1951  void UString::_getBufferUTF32Str() const
1952  {
1953  if ( m_bufferType != bt_utf32string ) {
1954  _cleanBuffer();
1955  m_buffer.mUTF32StrBuffer = new utf32string();
1956  m_bufferType = bt_utf32string;
1957  }
1958  m_buffer.mUTF32StrBuffer->clear();
1959  }
1960 
1961  void UString::_load_buffer_UTF8() const
1962  {
1963  _getBufferStr();
1964  std::string& buffer = ( *m_buffer.mStrBuffer );
1965  buffer.reserve( length() );
1966 
1967  unsigned char utf8buf[6];
1968  char* charbuf = ( char* )utf8buf;
1969  unicode_char c;
1970  size_t len;
1971 
1972  const_iterator i, ie = end();
1973  for ( i = begin(); i != ie; i.moveNext() ) {
1974  c = i.getCharacter();
1975  len = _utf32_to_utf8( c, utf8buf );
1976  size_t j = 0;
1977  while ( j < len )
1978  buffer.push_back( charbuf[j++] );
1979  }
1980  }
1981 
1982  void UString::_load_buffer_WStr() const
1983  {
1984  _getBufferWStr();
1985  std::wstring& buffer = ( *m_buffer.mWStrBuffer );
1986  buffer.reserve( length() ); // may over reserve, but should be close enough
1987 #ifdef WCHAR_UTF16 // wchar_t matches UTF-16
1988  const_iterator i, ie = end();
1989  for ( i = begin(); i != ie; ++i ) {
1990  buffer.push_back(( wchar_t )( *i ) );
1991  }
1992 #else // wchar_t fits UTF-32
1993  unicode_char c;
1994  const_iterator i, ie = end();
1995  for ( i = begin(); i != ie; i.moveNext() ) {
1996  c = i.getCharacter();
1997  buffer.push_back(( wchar_t )c );
1998  }
1999 #endif
2000  }
2001 
2002  void UString::_load_buffer_UTF32() const
2003  {
2004  _getBufferUTF32Str();
2005  utf32string& buffer = ( *m_buffer.mUTF32StrBuffer );
2006  buffer.reserve( length() ); // may over reserve, but should be close enough
2007 
2008  unicode_char c;
2009 
2010  const_iterator i, ie = end();
2011  for ( i = begin(); i != ie; i.moveNext() ) {
2012  c = i.getCharacter();
2013  buffer.push_back( c );
2014  }
2015  }
2016 
2017 } // namespace MyGUI
code_point & operator[](size_type index)
code point dereference operator
std::basic_string< code_point > dstring
base iterator class for UString
_const_rev_iterator operator-(difference_type n)
subtraction operator
_const_fwd_iterator & operator--()
pre-decrement
size_type capacity() const
returns the number of elements that the string can hold before it will need to allocate more space ...
const value_type & operator*() const
dereference operator
reverse_iterator rend()
returns a reverse iterator just past the beginning of the string
void resize(size_type num, const code_point &val=0)
changes the size of the string to size, filling in any new area with val
UString & append(const UString &str)
appends str on to the end of the current string
_rev_iterator & operator--()
pre-decrement
unicode_char getChar(size_type loc) const
returns the data point loc evaluated as a UTF-32 value
const char * asUTF8_c_str() const
returns the current string in UTF-8 form as a nul-terminated char array
int _setCharacter(unicode_char uc)
bool operator>(const UString &right) const
greater than operator
const code_point * c_str() const
returns a pointer to the first character in the current string
static size_t _utf32_to_utf8(const unicode_char &in_uc, unsigned char out_cp[6])
writes the given UTF-32 uc_in to the buffer location out_cp using UTF-8 encoding, returns the number ...
iterator erase(iterator loc)
removes the code point pointed to by loc, returning an iterator to the next character ...
size_type find_last_of(const UString &str, size_type index=npos, size_type num=npos) const
returns the index of the last character within the current string that matches any character in str...
UString()
default constructor, creates an empty string
bool inString(unicode_char ch) const
returns true if the given Unicode character ch is in this string
static size_t _utf8_char_length(unsigned char cp)
estimates the number of UTF-8 code points in the sequence starting with cp
_const_fwd_iterator operator+(difference_type n)
addition operator
_const_fwd_iterator & operator+=(difference_type n)
addition assignment operator
_const_rev_iterator & operator+=(difference_type n)
addition assignment operator
const value_type & operator*() const
dereference operator
const utf32string & asUTF32() const
returns the current string in UTF-32 form within a utf32string
_rev_iterator operator-(difference_type n)
subtraction operator
_const_fwd_iterator operator-(difference_type n)
subtraction operator
_fwd_iterator & operator+=(difference_type n)
addition assignment operator
const std::wstring & asWStr() const
returns the current string in the native form of std::wstring
size_t size_type
size type used to indicate string size and character positions within the string
This exception is used when invalid data streams are encountered.
_const_rev_iterator & operator++()
pre-increment
size_type find(const UString &str, size_type index=0) const
returns the index of the first occurrence of str within the current string, starting at index; return...
int setChar(size_type loc, unicode_char ch)
sets the value of the character at loc to the Unicode value ch (UTF-32)
_const_fwd_iterator const_iterator
const iterator
static const size_type npos
the usual constant representing: not found, no limit, etc
const forward iterator for UString
_fwd_iterator & operator-=(difference_type n)
subtraction assignment operator
bool operator<(const UString &right) const
less than operator
_rev_iterator & operator-=(difference_type n)
subtraction assignment operator
value_type & operator*() const
dereference operator
_const_rev_iterator operator+(difference_type n)
addition operator
static size_t _utf32_to_utf16(const unicode_char &in_uc, code_point out_cp[2])
writes the given UTF-32 uc_in to the buffer location out_cp using UTF-16 encoding, returns the number of code points used to encode the input (always 1 or 2)
unicode_char getCharacter() const
Returns the Unicode value of the character at the current position (decodes surrogate pairs if needed...
void push_back(unicode_char val)
appends val to the end of the string
int compare(const UString &str) const
compare str to the current string
std::basic_string< unicode_char > utf32string
string type used for returning UTF-32 formatted data
void clear()
deletes all of the elements in the string
const reverse iterator for UString
size_type find_last_not_of(const UString &str, size_type index=npos, size_type num=npos) const
returns the index of the last character within the current string that does not match any character i...
iterator begin()
returns an iterator to the first element of the string
_fwd_iterator iterator
iterator
_const_fwd_iterator & movePrev()
rewinds to the previous Unicode character, honoring surrogate pairs in the UTF-16 stream ...
reverse_iterator rbegin()
returns a reverse iterator to the last element of the string
_const_fwd_iterator & operator++()
pre-increment
bool operator==(const UString &right) const
equality operator
uint16 code_point
a single UTF-16 code point
_fwd_iterator operator+(difference_type n)
addition operator
_const_rev_iterator & operator-=(difference_type n)
subtraction assignment operator
const unicode_char * asUTF32_c_str() const
returns the current string in UTF-32 form as a nul-terminated unicode_char array
static bool _utf8_start_char(unsigned char cp)
returns true if cp is the beginning of a UTF-8 sequence
static bool _utf16_surrogate_lead(code_point cp)
returns true if cp matches the signature of a surrogate pair lead character
unicode_char _getCharacter() const
UString substr(size_type index, size_type num=npos) const
returns a substring of the current string, starting at index, and num characters long.
size_type max_size() const
returns the maximum number of UTF-16 code points that the string can hold
void swap(UString &from)
exchanges the elements of the current string with those of from
static size_t _utf16_to_utf32(const code_point in_cp[2], unicode_char &out_uc)
converts the given UTF-16 character buffer in_cp to a single UTF-32 Unicode character out_uc...
_fwd_iterator operator-(difference_type n)
subtraction operator
_rev_iterator operator+(difference_type n)
addition operator
size_type length() const
Returns the number of code points in the current string.
bool operator>=(const UString &right) const
greater than or equal operator
size_type rfind(const UString &str, size_type index=0) const
returns the location of the first occurrence of str in the current string, doing a reverse search fro...
code_point value_type
value type typedef for use in iterators
value_type & operator[](difference_type n) const
dereference at offset operator
void reserve(size_type size)
sets the capacity of the string to at least size code points
const value_type & operator[](difference_type n) const
dereference at offset operator
_fwd_iterator & movePrev()
rewinds to the previous Unicode character, honoring surrogate pairs in the UTF-16 stream ...
bool operator<=(const UString &right) const
less than or equal operator
forward iterator for UString
uint32 unicode_char
a single 32-bit Unicode character
static bool _utf16_surrogate_follow(code_point cp)
returns true if cp matches the signature of a surrogate pair following character
size_type find_first_not_of(const UString &str, size_type index=0, size_type num=npos) const
returns the index of the first character within the current string that does not match any character ...
~UString()
destructor
unicode_char getCharacter() const
Returns the Unicode value of the character at the current position (decodes surrogate pairs if needed...
static size_t _utf16_char_length(code_point cp)
estimates the number of UTF-16 code points in the sequence starting with cp
code_point & at(size_type loc)
returns a reference to the element in the string at index loc
static bool _utf16_independent_char(code_point cp)
returns true if cp does not match the signature for the lead or follow code point of a surrogate pair...
_const_rev_iterator & operator--()
pre-decrement
void _become(const _base_iterator &i)
_fwd_iterator & operator--()
pre-decrement
_rev_iterator & operator++()
pre-increment
_fwd_iterator & moveNext()
advances to the next Unicode character, honoring surrogate pairs in the UTF-16 stream ...
forward iterator for UString
static size_type _verifyUTF8(const unsigned char *c_str)
verifies a UTF-8 stream, returning the total number of Unicode characters found
UString & assign(iterator start, iterator end)
gives the current string the values from start to end
int setCharacter(unicode_char uc)
Sets the Unicode value of the character at the current position (adding a surrogate pair if needed); ...
_const_fwd_iterator & moveNext()
advances to the next Unicode character, honoring surrogate pairs in the UTF-16 stream ...
size_type length_Characters() const
Returns the number of Unicode characters in the string.
const std::string & asUTF8() const
returns the current string in UTF-8 form within a std::string
A UTF-16 string with implicit conversion to/from std::string and std::wstring.
_rev_iterator & operator+=(difference_type n)
addition assignment operator
static size_t _utf8_to_utf32(const unsigned char in_cp[6], unicode_char &out_uc)
converts the given UTF-8 character buffer to a single UTF-32 Unicode character, returns the number of...
value_type & operator[](difference_type n) const
dereference at offset operator
iterator insert(iterator i, const code_point &ch)
inserts ch before the code point denoted by i
const code_point * data() const
returns a pointer to the first character in the current string
UString & operator=(const UString &s)
assignment operator, implicitly casts all compatible types
iterator end()
returns an iterator just past the end of the string
UString & replace(size_type index1, size_type num1, const UString &str)
replaces up to num1 code points of the current string (starting at index1) with str ...
size_type size() const
Returns the number of code points in the current string.
size_type find_first_of(const UString &str, size_type index=0, size_type num=npos) const
Returns the index of the first character within the current string that matches any character in str...
value_type & operator*() const
dereference operator
const value_type & operator[](difference_type n) const
dereference at offset operator
bool operator!=(const UString &right) const
inequality operator
float len(float x, float y)
bool empty() const
returns true if the string has no elements, false otherwise
_const_fwd_iterator & operator-=(difference_type n)
subtraction assignment operator
_fwd_iterator & operator++()
pre-increment
void _jump_to(size_type index)
const wchar_t * asWStr_c_str() const
returns the current string in the native form of a nul-terminated wchar_t array