src/cguittfont/irrUString.h

   1 /*
   2    Basic Unicode string class for Irrlicht.
   3    Copyright (c) 2009-2011 John Norman
   4
   5    This software is provided 'as-is', without any express or implied
   6    warranty. In no event will the authors be held liable for any
   7    damages arising from the use of this software.
   8
   9    Permission is granted to anyone to use this software for any
  10    purpose, including commercial applications, and to alter it and
  11    redistribute it freely, subject to the following restrictions:
  12
  13    1. The origin of this software must not be misrepresented; you
  14       must not claim that you wrote the original software. If you use
  15       this software in a product, an acknowledgment in the product
  16       documentation would be appreciated but is not required.
  17
  18    2. Altered source versions must be plainly marked as such, and
  19       must not be misrepresented as being the original software.
  20
  21    3. This notice may not be removed or altered from any source
  22       distribution.
  23
  24    The original version of this class can be located at:
  25    http://irrlicht.suckerfreegames.com/
  26
  27    John Norman
  28    john@suckerfreegames.com
  29 */
  30
  31 #ifndef __IRR_USTRING_H_INCLUDED__
  32 #define __IRR_USTRING_H_INCLUDED__
  33
  34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
  35 #       define USTRING_CPP0X
  36 #       if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
  37 #               define USTRING_CPP0X_NEWLITERALS
  38 #       endif
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <stdlib.h>
  44 #ifdef _WIN32
  45 #define __BYTE_ORDER 0
  46 #define __LITTLE_ENDIAN 0
  47 #define __BIG_ENDIAN 1
  48 #else
  49 #include <endian.h>
  50 #endif
  51
  52 #ifdef USTRING_CPP0X
  53 #       include <utility>
  54 #endif
  55
  56 #ifndef USTRING_NO_STL
  57 #       include <string>
  58 #       include <iterator>
  59 #       include <ostream>
  60 #endif
  61
  62 #include "irrTypes.h"
  63 #include "irrAllocator.h"
  64 #include "irrArray.h"
  65 #include "irrMath.h"
  66 #include "irrString.h"
  67 #include "path.h"
  68
  69 //! UTF-16 surrogate start values.
  70 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
  71 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
  72
  73 //! Is a UTF-16 code point a surrogate?
  74 #define UTF16_IS_SURROGATE(c)           (((c) & 0xF800) == 0xD800)
  75 #define UTF16_IS_SURROGATE_HI(c)        (((c) & 0xFC00) == 0xD800)
  76 #define UTF16_IS_SURROGATE_LO(c)        (((c) & 0xFC00) == 0xDC00)
  77
  78
  79 namespace irr
  80 {
  81
  82         // Define our character types.
  83 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
  84         typedef char32_t uchar32_t;
  85         typedef char16_t uchar16_t;
  86         typedef char uchar8_t;
  87 #else
  88         typedef u32 uchar32_t;
  89         typedef u16 uchar16_t;
  90         typedef u8 uchar8_t;
  91 #endif
  92
  93 namespace core
  94 {
  95
  96 namespace unicode
  97 {
  98
  99 //! The unicode replacement character.  Used to replace invalid characters.
 100 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
 101
 102 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
 103 //! \param high The high value of the pair.
 104 //! \param low The low value of the pair.
 105 //! \return The UTF-32 character expressed by the surrogate pair.
 106 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
 107 {
 108         // Convert the surrogate pair into a single UTF-32 character.
 109         uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
 110         uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
 111         return (wu << 16) | x;
 112 }
 113
 114 //! Swaps the endianness of a 16-bit value.
 115 //! \return The new value.
 116 inline uchar16_t swapEndian16(const uchar16_t& c)
 117 {
 118         return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
 119 }
 120
 121 //! Swaps the endianness of a 32-bit value.
 122 //! \return The new value.
 123 inline uchar32_t swapEndian32(const uchar32_t& c)
 124 {
 125         return  ((c >> 24) & 0x000000FF) |
 126                         ((c >> 8)  & 0x0000FF00) |
 127                         ((c << 8)  & 0x00FF0000) |
 128                         ((c << 24) & 0xFF000000);
 129 }
 130
 131 //! The Unicode byte order mark.
 132 const u16 BOM = 0xFEFF;
 133
 134 //! The size of the Unicode byte order mark in terms of the Unicode character size.
 135 const u8 BOM_UTF8_LEN = 3;
 136 const u8 BOM_UTF16_LEN = 1;
 137 const u8 BOM_UTF32_LEN = 1;
 138
 139 //! Unicode byte order marks for file operations.
 140 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
 141 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
 142 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
 143 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
 144 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
 145
 146 //! The size in bytes of the Unicode byte marks for file operations.
 147 const u8 BOM_ENCODE_UTF8_LEN = 3;
 148 const u8 BOM_ENCODE_UTF16_LEN = 2;
 149 const u8 BOM_ENCODE_UTF32_LEN = 4;
 150
 151 //! Unicode encoding type.
 152 enum EUTF_ENCODE
 153 {
 154         EUTFE_NONE              = 0,
 155         EUTFE_UTF8,
 156         EUTFE_UTF16,
 157         EUTFE_UTF16_LE,
 158         EUTFE_UTF16_BE,
 159         EUTFE_UTF32,
 160         EUTFE_UTF32_LE,
 161         EUTFE_UTF32_BE
 162 };
 163
 164 //! Unicode endianness.
 165 enum EUTF_ENDIAN
 166 {
 167         EUTFEE_NATIVE   = 0,
 168         EUTFEE_LITTLE,
 169         EUTFEE_BIG
 170 };
 171
 172 //! Returns the specified unicode byte order mark in a byte array.
 173 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
 174 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
 175                 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
 176 //! \return An array that contains a byte order mark.
 177 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
 178 {
 179 #define COPY_ARRAY(source, size) \
 180         memcpy(ret.pointer(), source, size); \
 181         ret.set_used(size)
 182
 183         core::array<u8> ret(4);
 184         switch (mode)
 185         {
 186                 case EUTFE_UTF8:
 187                         COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
 188                         break;
 189                 case EUTFE_UTF16:
 190                         #ifdef __BIG_ENDIAN__
 191                                 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 192                         #else
 193                                 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 194                         #endif
 195                         break;
 196                 case EUTFE_UTF16_BE:
 197                         COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 198                         break;
 199                 case EUTFE_UTF16_LE:
 200                         COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 201                         break;
 202                 case EUTFE_UTF32:
 203                         #ifdef __BIG_ENDIAN__
 204                                 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 205                         #else
 206                                 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 207                         #endif
 208                         break;
 209                 case EUTFE_UTF32_BE:
 210                         COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 211                         break;
 212                 case EUTFE_UTF32_LE:
 213                         COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 214                         break;
 215                 case EUTFE_NONE:
 216                         // TODO sapier: fixed warning only,
 217                         // don't know if something needs to be done here
 218                         break;
 219         }
 220         return ret;
 221
 222 #undef COPY_ARRAY
 223 }
 224
 225 //! Detects if the given data stream starts with a unicode BOM.
 226 //! \param data The data stream to check.
 227 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
 228 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
 229 {
 230         if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
 231         if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
 232         if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
 233         if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
 234         if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
 235         return EUTFE_NONE;
 236 }
 237
 238 } // end namespace unicode
 239
 240
 241 //! UTF-16 string class.
 242 template <typename TAlloc = irrAllocator<uchar16_t> >
 243 class ustring16
 244 {
 245 public:
 246
 247         ///------------------///
 248         /// iterator classes ///
 249         ///------------------///
 250
 251         //! Access an element in a unicode string, allowing one to change it.
 252         class _ustring16_iterator_access
 253         {
 254                 public:
 255                         _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
 256
 257                         //! Allow the class to be interpreted as a single UTF-32 character.
 258                         operator uchar32_t() const
 259                         {
 260                                 return _get();
 261                         }
 262
 263                         //! Allow one to change the character in the unicode string.
 264                         //! \param c The new character to use.
 265                         //! \return Myself.
 266                         _ustring16_iterator_access& operator=(const uchar32_t c)
 267                         {
 268                                 _set(c);
 269                                 return *this;
 270                         }
 271
 272                         //! Increments the value by 1.
 273                         //! \return Myself.
 274                         _ustring16_iterator_access& operator++()
 275                         {
 276                                 _set(_get() + 1);
 277                                 return *this;
 278                         }
 279
 280                         //! Increments the value by 1, returning the old value.
 281                         //! \return A unicode character.
 282                         uchar32_t operator++(int)
 283                         {
 284                                 uchar32_t old = _get();
 285                                 _set(old + 1);
 286                                 return old;
 287                         }
 288
 289                         //! Decrements the value by 1.
 290                         //! \return Myself.
 291                         _ustring16_iterator_access& operator--()
 292                         {
 293                                 _set(_get() - 1);
 294                                 return *this;
 295                         }
 296
 297                         //! Decrements the value by 1, returning the old value.
 298                         //! \return A unicode character.
 299                         uchar32_t operator--(int)
 300                         {
 301                                 uchar32_t old = _get();
 302                                 _set(old - 1);
 303                                 return old;
 304                         }
 305
 306                         //! Adds to the value by a specified amount.
 307                         //! \param val The amount to add to this character.
 308                         //! \return Myself.
 309                         _ustring16_iterator_access& operator+=(int val)
 310                         {
 311                                 _set(_get() + val);
 312                                 return *this;
 313                         }
 314
 315                         //! Subtracts from the value by a specified amount.
 316                         //! \param val The amount to subtract from this character.
 317                         //! \return Myself.
 318                         _ustring16_iterator_access& operator-=(int val)
 319                         {
 320                                 _set(_get() - val);
 321                                 return *this;
 322                         }
 323
 324                         //! Multiples the value by a specified amount.
 325                         //! \param val The amount to multiply this character by.
 326                         //! \return Myself.
 327                         _ustring16_iterator_access& operator*=(int val)
 328                         {
 329                                 _set(_get() * val);
 330                                 return *this;
 331                         }
 332
 333                         //! Divides the value by a specified amount.
 334                         //! \param val The amount to divide this character by.
 335                         //! \return Myself.
 336                         _ustring16_iterator_access& operator/=(int val)
 337                         {
 338                                 _set(_get() / val);
 339                                 return *this;
 340                         }
 341
 342                         //! Modulos the value by a specified amount.
 343                         //! \param val The amount to modulo this character by.
 344                         //! \return Myself.
 345                         _ustring16_iterator_access& operator%=(int val)
 346                         {
 347                                 _set(_get() % val);
 348                                 return *this;
 349                         }
 350
 351                         //! Adds to the value by a specified amount.
 352                         //! \param val The amount to add to this character.
 353                         //! \return A unicode character.
 354                         uchar32_t operator+(int val) const
 355                         {
 356                                 return _get() + val;
 357                         }
 358
 359                         //! Subtracts from the value by a specified amount.
 360                         //! \param val The amount to subtract from this character.
 361                         //! \return A unicode character.
 362                         uchar32_t operator-(int val) const
 363                         {
 364                                 return _get() - val;
 365                         }
 366
 367                         //! Multiplies the value by a specified amount.
 368                         //! \param val The amount to multiply this character by.
 369                         //! \return A unicode character.
 370                         uchar32_t operator*(int val) const
 371                         {
 372                                 return _get() * val;
 373                         }
 374
 375                         //! Divides the value by a specified amount.
 376                         //! \param val The amount to divide this character by.
 377                         //! \return A unicode character.
 378                         uchar32_t operator/(int val) const
 379                         {
 380                                 return _get() / val;
 381                         }
 382
 383                         //! Modulos the value by a specified amount.
 384                         //! \param val The amount to modulo this character by.
 385                         //! \return A unicode character.
 386                         uchar32_t operator%(int val) const
 387                         {
 388                                 return _get() % val;
 389                         }
 390
 391                 private:
 392                         //! Gets a uchar32_t from our current position.
 393                         uchar32_t _get() const
 394                         {
 395                                 const uchar16_t* a = ref->c_str();
 396                                 if (!UTF16_IS_SURROGATE(a[pos]))
 397                                         return static_cast<uchar32_t>(a[pos]);
 398                                 else
 399                                 {
 400                                         if (pos + 1 >= ref->size_raw())
 401                                                 return 0;
 402
 403                                         return unicode::toUTF32(a[pos], a[pos + 1]);
 404                                 }
 405                         }
 406
 407                         //! Sets a uchar32_t at our current position.
 408                         void _set(uchar32_t c)
 409                         {
 410                                 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
 411                                 const uchar16_t* a = ref2->c_str();
 412                                 if (c > 0xFFFF)
 413                                 {
 414                                         // c will be multibyte, so split it up into the high and low surrogate pairs.
 415                                         uchar16_t x = static_cast<uchar16_t>(c);
 416                                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
 417                                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
 418
 419                                         // If the previous position was a surrogate pair, just replace them.  Else, insert the low pair.
 420                                         if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
 421                                                 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
 422                                         else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
 423
 424                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 425                                 }
 426                                 else
 427                                 {
 428                                         // c will be a single byte.
 429                                         uchar16_t vh = static_cast<uchar16_t>(c);
 430
 431                                         // If the previous position was a surrogate pair, remove the extra byte.
 432                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 433                                                 ref2->erase_raw(static_cast<u32>(pos) + 1);
 434
 435                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 436                                 }
 437                         }
 438
 439                         const ustring16<TAlloc>* ref;
 440                         u32 pos;
 441         };
 442         typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
 443
 444
 445         //! Iterator to iterate through a UTF-16 string.
 446 #ifndef USTRING_NO_STL
 447         class _ustring16_const_iterator : public std::iterator<
 448                 std::bidirectional_iterator_tag,        // iterator_category
 449                 access,                                                         // value_type
 450                 ptrdiff_t,                                                      // difference_type
 451                 const access,                                           // pointer
 452                 const access                                            // reference
 453         >
 454 #else
 455         class _ustring16_const_iterator
 456 #endif
 457         {
 458                 public:
 459                         typedef _ustring16_const_iterator _Iter;
 460                         typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
 461                         typedef const access const_pointer;
 462                         typedef const access const_reference;
 463
 464 #ifndef USTRING_NO_STL
 465                         typedef typename _Base::value_type value_type;
 466                         typedef typename _Base::difference_type difference_type;
 467                         typedef typename _Base::difference_type distance_type;
 468                         typedef typename _Base::pointer pointer;
 469                         typedef const_reference reference;
 470 #else
 471                         typedef access value_type;
 472                         typedef u32 difference_type;
 473                         typedef u32 distance_type;
 474                         typedef const_pointer pointer;
 475                         typedef const_reference reference;
 476 #endif
 477
 478                         //! Constructors.
 479                         _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
 480                         _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
 481                         _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
 482                         {
 483                                 if (ref->size_raw() == 0 || p == 0)
 484                                         return;
 485
 486                                 // Go to the appropriate position.
 487                                 u32 i = p;
 488                                 u32 sr = ref->size_raw();
 489                                 const uchar16_t* a = ref->c_str();
 490                                 while (i != 0 && pos < sr)
 491                                 {
 492                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 493                                                 pos += 2;
 494                                         else ++pos;
 495                                         --i;
 496                                 }
 497                         }
 498
 499                         //! Test for equalness.
 500                         bool operator==(const _Iter& iter) const
 501                         {
 502                                 if (ref == iter.ref && pos == iter.pos)
 503                                         return true;
 504                                 return false;
 505                         }
 506
 507                         //! Test for unequalness.
 508                         bool operator!=(const _Iter& iter) const
 509                         {
 510                                 if (ref != iter.ref || pos != iter.pos)
 511                                         return true;
 512                                 return false;
 513                         }
 514
 515                         //! Switch to the next full character in the string.
 516                         _Iter& operator++()
 517                         {       // ++iterator
 518                                 if (pos == ref->size_raw()) return *this;
 519                                 const uchar16_t* a = ref->c_str();
 520                                 if (UTF16_IS_SURROGATE_HI(a[pos]))
 521                                         pos += 2;                       // TODO: check for valid low surrogate?
 522                                 else ++pos;
 523                                 if (pos > ref->size_raw()) pos = ref->size_raw();
 524                                 return *this;
 525                         }
 526
 527                         //! Switch to the next full character in the string, returning the previous position.
 528                         _Iter operator++(int)
 529                         {       // iterator++
 530                                 _Iter _tmp(*this);
 531                                 ++*this;
 532                                 return _tmp;
 533                         }
 534
 535                         //! Switch to the previous full character in the string.
 536                         _Iter& operator--()
 537                         {       // --iterator
 538                                 if (pos == 0) return *this;
 539                                 const uchar16_t* a = ref->c_str();
 540                                 --pos;
 541                                 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0)  // low surrogate, go back one more.
 542                                         --pos;
 543                                 return *this;
 544                         }
 545
 546                         //! Switch to the previous full character in the string, returning the previous position.
 547                         _Iter operator--(int)
 548                         {       // iterator--
 549                                 _Iter _tmp(*this);
 550                                 --*this;
 551                                 return _tmp;
 552                         }
 553
 554                         //! Advance a specified number of full characters in the string.
 555                         //! \return Myself.
 556                         _Iter& operator+=(const difference_type v)
 557                         {
 558                                 if (v == 0) return *this;
 559                                 if (v < 0) return operator-=(v * -1);
 560
 561                                 if (pos >= ref->size_raw())
 562                                         return *this;
 563
 564                                 // Go to the appropriate position.
 565                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 566                                 u32 i = (u32)v;
 567                                 u32 sr = ref->size_raw();
 568                                 const uchar16_t* a = ref->c_str();
 569                                 while (i != 0 && pos < sr)
 570                                 {
 571                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 572                                                 pos += 2;
 573                                         else ++pos;
 574                                         --i;
 575                                 }
 576                                 if (pos > sr)
 577                                         pos = sr;
 578
 579                                 return *this;
 580                         }
 581
 582                         //! Go back a specified number of full characters in the string.
 583                         //! \return Myself.
 584                         _Iter& operator-=(const difference_type v)
 585                         {
 586                                 if (v == 0) return *this;
 587                                 if (v > 0) return operator+=(v * -1);
 588
 589                                 if (pos == 0)
 590                                         return *this;
 591
 592                                 // Go to the appropriate position.
 593                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 594                                 u32 i = (u32)v;
 595                                 const uchar16_t* a = ref->c_str();
 596                                 while (i != 0 && pos != 0)
 597                                 {
 598                                         --pos;
 599                                         if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
 600                                                 --pos;
 601                                         --i;
 602                                 }
 603
 604                                 return *this;
 605                         }
 606
 607                         //! Return a new iterator that is a variable number of full characters forward from the current position.
 608                         _Iter operator+(const difference_type v) const
 609                         {
 610                                 _Iter ret(*this);
 611                                 ret += v;
 612                                 return ret;
 613                         }
 614
 615                         //! Return a new iterator that is a variable number of full characters backward from the current position.
 616                         _Iter operator-(const difference_type v) const
 617                         {
 618                                 _Iter ret(*this);
 619                                 ret -= v;
 620                                 return ret;
 621                         }
 622
 623                         //! Returns the distance between two iterators.
 624                         difference_type operator-(const _Iter& iter) const
 625                         {
 626                                 // Make sure we reference the same object!
 627                                 if (ref != iter.ref)
 628                                         return difference_type();
 629
 630                                 _Iter i = iter;
 631                                 difference_type ret;
 632
 633                                 // Walk up.
 634                                 if (pos > i.pos)
 635                                 {
 636                                         while (pos > i.pos)
 637                                         {
 638                                                 ++i;
 639                                                 ++ret;
 640                                         }
 641                                         return ret;
 642                                 }
 643
 644                                 // Walk down.
 645                                 while (pos < i.pos)
 646                                 {
 647                                         --i;
 648                                         --ret;
 649                                 }
 650                                 return ret;
 651                         }
 652
 653                         //! Accesses the full character at the iterator's position.
 654                         const_reference operator*() const
 655                         {
 656                                 if (pos >= ref->size_raw())
 657                                 {
 658                                         const uchar16_t* a = ref->c_str();
 659                                         u32 p = ref->size_raw();
 660                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 661                                                 --p;
 662                                         reference ret(ref, p);
 663                                         return ret;
 664                                 }
 665                                 const_reference ret(ref, pos);
 666                                 return ret;
 667                         }
 668
 669                         //! Accesses the full character at the iterator's position.
 670                         reference operator*()
 671                         {
 672                                 if (pos >= ref->size_raw())
 673                                 {
 674                                         const uchar16_t* a = ref->c_str();
 675                                         u32 p = ref->size_raw();
 676                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 677                                                 --p;
 678                                         reference ret(ref, p);
 679                                         return ret;
 680                                 }
 681                                 reference ret(ref, pos);
 682                                 return ret;
 683                         }
 684
 685                         //! Accesses the full character at the iterator's position.
 686                         const_pointer operator->() const
 687                         {
 688                                 return operator*();
 689                         }
 690
 691                         //! Accesses the full character at the iterator's position.
 692                         pointer operator->()
 693                         {
 694                                 return operator*();
 695                         }
 696
 697                         //! Is the iterator at the start of the string?
 698                         bool atStart() const
 699                         {
 700                                 return pos == 0;
 701                         }
 702
 703                         //! Is the iterator at the end of the string?
 704                         bool atEnd() const
 705                         {
 706                                 const uchar16_t* a = ref->c_str();
 707                                 if (UTF16_IS_SURROGATE(a[pos]))
 708                                         return (pos + 1) >= ref->size_raw();
 709                                 else return pos >= ref->size_raw();
 710                         }
 711
 712                         //! Moves the iterator to the start of the string.
 713                         void toStart()
 714                         {
 715                                 pos = 0;
 716                         }
 717
 718                         //! Moves the iterator to the end of the string.
 719                         void toEnd()
 720                         {
 721                                 pos = ref->size_raw();
 722                         }
 723
 724                         //! Returns the iterator's position.
 725                         //! \return The iterator's position.
 726                         u32 getPos() const
 727                         {
 728                                 return pos;
 729                         }
 730
 731                 protected:
 732                         const ustring16<TAlloc>* ref;
 733                         u32 pos;
 734         };
 735
 736         //! Iterator to iterate through a UTF-16 string.
 737         class _ustring16_iterator : public _ustring16_const_iterator
 738         {
 739                 public:
 740                         typedef _ustring16_iterator _Iter;
 741                         typedef _ustring16_const_iterator _Base;
 742                         typedef typename _Base::const_pointer const_pointer;
 743                         typedef typename _Base::const_reference const_reference;
 744
 745
 746                         typedef typename _Base::value_type value_type;
 747                         typedef typename _Base::difference_type difference_type;
 748                         typedef typename _Base::distance_type distance_type;
 749                         typedef access pointer;
 750                         typedef access reference;
 751
 752                         using _Base::pos;
 753                         using _Base::ref;
 754
 755                         //! Constructors.
 756                         _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
 757                         _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
 758                         _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
 759
 760                         //! Accesses the full character at the iterator's position.
 761                         reference operator*() const
 762                         {
 763                                 if (pos >= ref->size_raw())
 764                                 {
 765                                         const uchar16_t* a = ref->c_str();
 766                                         u32 p = ref->size_raw();
 767                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 768                                                 --p;
 769                                         reference ret(ref, p);
 770                                         return ret;
 771                                 }
 772                                 reference ret(ref, pos);
 773                                 return ret;
 774                         }
 775
 776                         //! Accesses the full character at the iterator's position.
 777                         reference operator*()
 778                         {
 779                                 if (pos >= ref->size_raw())
 780                                 {
 781                                         const uchar16_t* a = ref->c_str();
 782                                         u32 p = ref->size_raw();
 783                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 784                                                 --p;
 785                                         reference ret(ref, p);
 786                                         return ret;
 787                                 }
 788                                 reference ret(ref, pos);
 789                                 return ret;
 790                         }
 791
 792                         //! Accesses the full character at the iterator's position.
 793                         pointer operator->() const
 794                         {
 795                                 return operator*();
 796                         }
 797
 798                         //! Accesses the full character at the iterator's position.
 799                         pointer operator->()
 800                         {
 801                                 return operator*();
 802                         }
 803         };
 804
 805         typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
 806         typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
 807
 808         ///----------------------///
 809         /// end iterator classes ///
 810         ///----------------------///
 811
 812         //! Default constructor
 813         ustring16()
 814         : array(0), allocated(1), used(0)
 815         {
 816 #if __BYTE_ORDER == __BIG_ENDIAN
 817                 encoding = unicode::EUTFE_UTF16_BE;
 818 #else
 819                 encoding = unicode::EUTFE_UTF16_LE;
 820 #endif
 821                 array = allocator.allocate(1); // new u16[1];
 822                 array[0] = 0x0;
 823         }
 824
 825
 826         //! Constructor
 827         ustring16(const ustring16<TAlloc>& other)
 828         : array(0), allocated(0), used(0)
 829         {
 830 #if __BYTE_ORDER == __BIG_ENDIAN
 831                 encoding = unicode::EUTFE_UTF16_BE;
 832 #else
 833                 encoding = unicode::EUTFE_UTF16_LE;
 834 #endif
 835                 *this = other;
 836         }
 837
 838
 839         //! Constructor from other string types
 840         template <class B, class A>
 841         ustring16(const string<B, A>& other)
 842         : array(0), allocated(0), used(0)
 843         {
 844 #if __BYTE_ORDER == __BIG_ENDIAN
 845                 encoding = unicode::EUTFE_UTF16_BE;
 846 #else
 847                 encoding = unicode::EUTFE_UTF16_LE;
 848 #endif
 849                 *this = other;
 850         }
 851
 852
 853 #ifndef USTRING_NO_STL
 854         //! Constructor from std::string
 855         template <class B, class A, typename Alloc>
 856         ustring16(const std::basic_string<B, A, Alloc>& other)
 857         : array(0), allocated(0), used(0)
 858         {
 859 #if __BYTE_ORDER == __BIG_ENDIAN
 860                 encoding = unicode::EUTFE_UTF16_BE;
 861 #else
 862                 encoding = unicode::EUTFE_UTF16_LE;
 863 #endif
 864                 *this = other.c_str();
 865         }
 866
 867
 868         //! Constructor from iterator.
 869         template <typename Itr>
 870         ustring16(Itr first, Itr last)
 871         : array(0), allocated(0), used(0)
 872         {
 873 #if __BYTE_ORDER == __BIG_ENDIAN
 874                 encoding = unicode::EUTFE_UTF16_BE;
 875 #else
 876                 encoding = unicode::EUTFE_UTF16_LE;
 877 #endif
 878                 reserve(std::distance(first, last));
 879                 array[used] = 0;
 880
 881                 for (; first != last; ++first)
 882                         append((uchar32_t)*first);
 883         }
 884 #endif
 885
 886
 887 #ifndef USTRING_CPP0X_NEWLITERALS
 888         //! Constructor for copying a character string from a pointer.
 889         ustring16(const char* const c)
 890         : array(0), allocated(0), used(0)
 891         {
 892 #if __BYTE_ORDER == __BIG_ENDIAN
 893                 encoding = unicode::EUTFE_UTF16_BE;
 894 #else
 895                 encoding = unicode::EUTFE_UTF16_LE;
 896 #endif
 897
 898                 loadDataStream(c, strlen(c));
 899                 //append((uchar8_t*)c);
 900         }
 901
 902
 903         //! Constructor for copying a character string from a pointer with a given length.
 904         ustring16(const char* const c, u32 length)
 905         : array(0), allocated(0), used(0)
 906         {
 907 #if __BYTE_ORDER == __BIG_ENDIAN
 908                 encoding = unicode::EUTFE_UTF16_BE;
 909 #else
 910                 encoding = unicode::EUTFE_UTF16_LE;
 911 #endif
 912
 913                 loadDataStream(c, length);
 914         }
 915 #endif
 916
 917
 918         //! Constructor for copying a UTF-8 string from a pointer.
 919         ustring16(const uchar8_t* const c)
 920         : array(0), allocated(0), used(0)
 921         {
 922 #if __BYTE_ORDER == __BIG_ENDIAN
 923                 encoding = unicode::EUTFE_UTF16_BE;
 924 #else
 925                 encoding = unicode::EUTFE_UTF16_LE;
 926 #endif
 927
 928                 append(c);
 929         }
 930
 931
 932         //! Constructor for copying a UTF-8 string from a single char.
 933         ustring16(const char c)
 934         : array(0), allocated(0), used(0)
 935         {
 936 #if __BYTE_ORDER == __BIG_ENDIAN
 937                 encoding = unicode::EUTFE_UTF16_BE;
 938 #else
 939                 encoding = unicode::EUTFE_UTF16_LE;
 940 #endif
 941
 942                 append((uchar32_t)c);
 943         }
 944
 945
 946         //! Constructor for copying a UTF-8 string from a pointer with a given length.
 947         ustring16(const uchar8_t* const c, u32 length)
 948         : array(0), allocated(0), used(0)
 949         {
 950 #if __BYTE_ORDER == __BIG_ENDIAN
 951                 encoding = unicode::EUTFE_UTF16_BE;
 952 #else
 953                 encoding = unicode::EUTFE_UTF16_LE;
 954 #endif
 955
 956                 append(c, length);
 957         }
 958
 959
 960         //! Constructor for copying a UTF-16 string from a pointer.
 961         ustring16(const uchar16_t* const c)
 962         : array(0), allocated(0), used(0)
 963         {
 964 #if __BYTE_ORDER == __BIG_ENDIAN
 965                 encoding = unicode::EUTFE_UTF16_BE;
 966 #else
 967                 encoding = unicode::EUTFE_UTF16_LE;
 968 #endif
 969
 970                 append(c);
 971         }
 972
 973
 974         //! Constructor for copying a UTF-16 string from a pointer with a given length
 975         ustring16(const uchar16_t* const c, u32 length)
 976         : array(0), allocated(0), used(0)
 977         {
 978 #if __BYTE_ORDER == __BIG_ENDIAN
 979                 encoding = unicode::EUTFE_UTF16_BE;
 980 #else
 981                 encoding = unicode::EUTFE_UTF16_LE;
 982 #endif
 983
 984                 append(c, length);
 985         }
 986
 987
 988         //! Constructor for copying a UTF-32 string from a pointer.
 989         ustring16(const uchar32_t* const c)
 990         : array(0), allocated(0), used(0)
 991         {
 992 #if __BYTE_ORDER == __BIG_ENDIAN
 993                 encoding = unicode::EUTFE_UTF16_BE;
 994 #else
 995                 encoding = unicode::EUTFE_UTF16_LE;
 996 #endif
 997
 998                 append(c);
 999         }
1000
1001
1002         //! Constructor for copying a UTF-32 from a pointer with a given length.
1003         ustring16(const uchar32_t* const c, u32 length)
1004         : array(0), allocated(0), used(0)
1005         {
1006 #if __BYTE_ORDER == __BIG_ENDIAN
1007                 encoding = unicode::EUTFE_UTF16_BE;
1008 #else
1009                 encoding = unicode::EUTFE_UTF16_LE;
1010 #endif
1011
1012                 append(c, length);
1013         }
1014
1015
1016         //! Constructor for copying a wchar_t string from a pointer.
1017         ustring16(const wchar_t* const c)
1018         : array(0), allocated(0), used(0)
1019         {
1020 #if __BYTE_ORDER == __BIG_ENDIAN
1021                 encoding = unicode::EUTFE_UTF16_BE;
1022 #else
1023                 encoding = unicode::EUTFE_UTF16_LE;
1024 #endif
1025
1026                 if (sizeof(wchar_t) == 4)
1027                         append(reinterpret_cast<const uchar32_t* const>(c));
1028                 else if (sizeof(wchar_t) == 2)
1029                         append(reinterpret_cast<const uchar16_t* const>(c));
1030                 else if (sizeof(wchar_t) == 1)
1031                         append(reinterpret_cast<const uchar8_t* const>(c));
1032         }
1033
1034
1035         //! Constructor for copying a wchar_t string from a pointer with a given length.
1036         ustring16(const wchar_t* const c, u32 length)
1037         : array(0), allocated(0), used(0)
1038         {
1039 #if __BYTE_ORDER == __BIG_ENDIAN
1040                 encoding = unicode::EUTFE_UTF16_BE;
1041 #else
1042                 encoding = unicode::EUTFE_UTF16_LE;
1043 #endif
1044
1045                 if (sizeof(wchar_t) == 4)
1046                         append(reinterpret_cast<const uchar32_t* const>(c), length);
1047                 else if (sizeof(wchar_t) == 2)
1048                         append(reinterpret_cast<const uchar16_t* const>(c), length);
1049                 else if (sizeof(wchar_t) == 1)
1050                         append(reinterpret_cast<const uchar8_t* const>(c), length);
1051         }
1052
1053
1054 #ifdef USTRING_CPP0X
1055         //! Constructor for moving a ustring16
1056         ustring16(ustring16<TAlloc>&& other)
1057         : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1058         {
1059                 //std::cout << "MOVE constructor" << std::endl;
1060                 other.array = 0;
1061                 other.allocated = 0;
1062                 other.used = 0;
1063         }
1064 #endif
1065
1066
1067         //! Destructor
1068         ~ustring16()
1069         {
1070                 allocator.deallocate(array); // delete [] array;
1071         }
1072
1073
1074         //! Assignment operator
1075         ustring16& operator=(const ustring16<TAlloc>& other)
1076         {
1077                 if (this == &other)
1078                         return *this;
1079
1080                 used = other.size_raw();
1081                 if (used >= allocated)
1082                 {
1083                         allocator.deallocate(array); // delete [] array;
1084                         allocated = used + 1;
1085                         array = allocator.allocate(used + 1); //new u16[used];
1086                 }
1087
1088                 const uchar16_t* p = other.c_str();
1089                 for (u32 i=0; i<=used; ++i, ++p)
1090                         array[i] = *p;
1091
1092                 array[used] = 0;
1093
1094                 // Validate our new UTF-16 string.
1095                 validate();
1096
1097                 return *this;
1098         }
1099
1100
1101 #ifdef USTRING_CPP0X
1102         //! Move assignment operator
1103         ustring16& operator=(ustring16<TAlloc>&& other)
1104         {
1105                 if (this != &other)
1106                 {
1107                         //std::cout << "MOVE operator=" << std::endl;
1108                         allocator.deallocate(array);
1109
1110                         array = other.array;
1111                         allocated = other.allocated;
1112                         encoding = other.encoding;
1113                         used = other.used;
1114                         other.array = 0;
1115                         other.used = 0;
1116                 }
1117                 return *this;
1118         }
1119 #endif
1120
1121
1122         //! Assignment operator for other string types
1123         template <class B, class A>
1124         ustring16<TAlloc>& operator=(const string<B, A>& other)
1125         {
1126                 *this = other.c_str();
1127                 return *this;
1128         }
1129
1130
1131         //! Assignment operator for UTF-8 strings
1132         ustring16<TAlloc>& operator=(const uchar8_t* const c)
1133         {
1134                 if (!array)
1135                 {
1136                         array = allocator.allocate(1); //new u16[1];
1137                         allocated = 1;
1138                 }
1139                 used = 0;
1140                 array[used] = 0x0;
1141                 if (!c) return *this;
1142
1143                 //! Append our string now.
1144                 append(c);
1145                 return *this;
1146         }
1147
1148
1149         //! Assignment operator for UTF-16 strings
1150         ustring16<TAlloc>& operator=(const uchar16_t* const c)
1151         {
1152                 if (!array)
1153                 {
1154                         array = allocator.allocate(1); //new u16[1];
1155                         allocated = 1;
1156                 }
1157                 used = 0;
1158                 array[used] = 0x0;
1159                 if (!c) return *this;
1160
1161                 //! Append our string now.
1162                 append(c);
1163                 return *this;
1164         }
1165
1166
1167         //! Assignment operator for UTF-32 strings
1168         ustring16<TAlloc>& operator=(const uchar32_t* const c)
1169         {
1170                 if (!array)
1171                 {
1172                         array = allocator.allocate(1); //new u16[1];
1173                         allocated = 1;
1174                 }
1175                 used = 0;
1176                 array[used] = 0x0;
1177                 if (!c) return *this;
1178
1179                 //! Append our string now.
1180                 append(c);
1181                 return *this;
1182         }
1183
1184
1185         //! Assignment operator for wchar_t strings.
1186         /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1187                 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1188                 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1189         ustring16<TAlloc>& operator=(const wchar_t* const c)
1190         {
1191                 if (sizeof(wchar_t) == 4)
1192                         *this = reinterpret_cast<const uchar32_t* const>(c);
1193                 else if (sizeof(wchar_t) == 2)
1194                         *this = reinterpret_cast<const uchar16_t* const>(c);
1195                 else if (sizeof(wchar_t) == 1)
1196                         *this = reinterpret_cast<const uchar8_t* const>(c);
1197
1198                 return *this;
1199         }
1200
1201
1202         //! Assignment operator for other strings.
1203         /** Note that this assumes that a correct unicode string is stored in the string. **/
1204         template <class B>
1205         ustring16<TAlloc>& operator=(const B* const c)
1206         {
1207                 if (sizeof(B) == 4)
1208                         *this = reinterpret_cast<const uchar32_t* const>(c);
1209                 else if (sizeof(B) == 2)
1210                         *this = reinterpret_cast<const uchar16_t* const>(c);
1211                 else if (sizeof(B) == 1)
1212                         *this = reinterpret_cast<const uchar8_t* const>(c);
1213
1214                 return *this;
1215         }
1216
1217
1218         //! Direct access operator
1219         access operator [](const u32 index)
1220         {
1221                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1222                 iterator iter(*this, index);
1223                 return iter.operator*();
1224         }
1225
1226
1227         //! Direct access operator
1228         const access operator [](const u32 index) const
1229         {
1230                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1231                 const_iterator iter(*this, index);
1232                 return iter.operator*();
1233         }
1234
1235
1236         //! Equality operator
1237         bool operator ==(const uchar16_t* const str) const
1238         {
1239                 if (!str)
1240                         return false;
1241
1242                 u32 i;
1243                 for(i=0; array[i] && str[i]; ++i)
1244                         if (array[i] != str[i])
1245                                 return false;
1246
1247                 return !array[i] && !str[i];
1248         }
1249
1250
1251         //! Equality operator
1252         bool operator ==(const ustring16<TAlloc>& other) const
1253         {
1254                 for(u32 i=0; array[i] && other.array[i]; ++i)
1255                         if (array[i] != other.array[i])
1256                                 return false;
1257
1258                 return used == other.used;
1259         }
1260
1261
1262         //! Is smaller comparator
1263         bool operator <(const ustring16<TAlloc>& other) const
1264         {
1265                 for(u32 i=0; array[i] && other.array[i]; ++i)
1266                 {
1267                         s32 diff = array[i] - other.array[i];
1268                         if ( diff )
1269                                 return diff < 0;
1270                 }
1271
1272                 return used < other.used;
1273         }
1274
1275
1276         //! Inequality operator
1277         bool operator !=(const uchar16_t* const str) const
1278         {
1279                 return !(*this == str);
1280         }
1281
1282
1283         //! Inequality operator
1284         bool operator !=(const ustring16<TAlloc>& other) const
1285         {
1286                 return !(*this == other);
1287         }
1288
1289
1290         //! Returns the length of a ustring16 in full characters.
1291         //! \return Length of a ustring16 in full characters.
1292         u32 size() const
1293         {
1294                 const_iterator i(*this, 0);
1295                 u32 pos = 0;
1296                 while (!i.atEnd())
1297                 {
1298                         ++i;
1299                         ++pos;
1300                 }
1301                 return pos;
1302         }
1303
1304
1305         //! Informs if the ustring is empty or not.
1306         //! \return True if the ustring is empty, false if not.
1307         bool empty() const
1308         {
1309                 return (size_raw() == 0);
1310         }
1311
1312
1313         //! Returns a pointer to the raw UTF-16 string data.
1314         //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1315         const uchar16_t* c_str() const
1316         {
1317                 return array;
1318         }
1319
1320
1321         //! Compares the first n characters of this string with another.
1322         //! \param other Other string to compare to.
1323         //! \param n Number of characters to compare.
1324         //! \return True if the n first characters of both strings are equal.
1325         bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1326         {
1327                 u32 i;
1328                 const uchar16_t* oa = other.c_str();
1329                 for(i=0; array[i] && oa[i] && i < n; ++i)
1330                         if (array[i] != oa[i])
1331                                 return false;
1332
1333                 // if one (or both) of the strings was smaller then they
1334                 // are only equal if they have the same length
1335                 return (i == n) || (used == other.used);
1336         }
1337
1338
1339         //! Compares the first n characters of this string with another.
1340         //! \param str Other string to compare to.
1341         //! \param n Number of characters to compare.
1342         //! \return True if the n first characters of both strings are equal.
1343         bool equalsn(const uchar16_t* const str, u32 n) const
1344         {
1345                 if (!str)
1346                         return false;
1347                 u32 i;
1348                 for(i=0; array[i] && str[i] && i < n; ++i)
1349                         if (array[i] != str[i])
1350                                 return false;
1351
1352                 // if one (or both) of the strings was smaller then they
1353                 // are only equal if they have the same length
1354                 return (i == n) || (array[i] == 0 && str[i] == 0);
1355         }
1356
1357
1358         //! Appends a character to this ustring16
1359         //! \param character The character to append.
1360         //! \return A reference to our current string.
1361         ustring16<TAlloc>& append(uchar32_t character)
1362         {
1363                 if (used + 2 >= allocated)
1364                         reallocate(used + 2);
1365
1366                 if (character > 0xFFFF)
1367                 {
1368                         used += 2;
1369
1370                         // character will be multibyte, so split it up into a surrogate pair.
1371                         uchar16_t x = static_cast<uchar16_t>(character);
1372                         uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1373                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1374                         array[used-2] = vh;
1375                         array[used-1] = vl;
1376                 }
1377                 else
1378                 {
1379                         ++used;
1380                         array[used-1] = character;
1381                 }
1382                 array[used] = 0;
1383
1384                 return *this;
1385         }
1386
1387
1388         //! Appends a UTF-8 string to this ustring16
1389         //! \param other The UTF-8 string to append.
1390         //! \param length The length of the string to append.
1391         //! \return A reference to our current string.
1392         ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1393         {
1394                 if (!other)
1395                         return *this;
1396
1397                 // Determine if the string is long enough for a BOM.
1398                 u32 len = 0;
1399                 const uchar8_t* p = other;
1400                 do
1401                 {
1402                         ++len;
1403                 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1404
1405                 // Check for BOM.
1406                 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1407                 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1408                 {
1409                         if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1410                                 c_bom = unicode::EUTFE_UTF8;
1411                 }
1412
1413                 // If a BOM was found, don't include it in the string.
1414                 const uchar8_t* c2 = other;
1415                 if (c_bom != unicode::EUTFE_NONE)
1416                 {
1417                         c2 = other + unicode::BOM_UTF8_LEN;
1418                         length -= unicode::BOM_UTF8_LEN;
1419                 }
1420
1421                 // Calculate the size of the string to read in.
1422                 len = 0;
1423                 p = c2;
1424                 do
1425                 {
1426                         ++len;
1427                 } while(*p++ && len < length);
1428                 if (len > length)
1429                         len = length;
1430
1431                 // If we need to grow the array, do it now.
1432                 if (used + len >= allocated)
1433                         reallocate(used + (len * 2));
1434                 u32 start = used;
1435
1436                 // Convert UTF-8 to UTF-16.
1437                 u32 pos = start;
1438                 for (u32 l = 0; l<len;)
1439                 {
1440                         ++used;
1441                         if (((c2[l] >> 6) & 0x03) == 0x02)
1442                         {       // Invalid continuation byte.
1443                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1444                                 ++l;
1445                         }
1446                         else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1447                         {       // Invalid byte - overlong encoding.
1448                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1449                                 ++l;
1450                         }
1451                         else if ((c2[l] & 0xF8) == 0xF0)
1452                         {       // 4 bytes UTF-8, 2 bytes UTF-16.
1453                                 // Check for a full string.
1454                                 if ((l + 3) >= len)
1455                                 {
1456                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1457                                         l += 3;
1458                                         break;
1459                                 }
1460
1461                                 // Validate.
1462                                 bool valid = true;
1463                                 u8 l2 = 0;
1464                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1465                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1466                                 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1467                                 if (!valid)
1468                                 {
1469                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1470                                         l += l2;
1471                                         continue;
1472                                 }
1473
1474                                 // Decode.
1475                                 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1476                                 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1477                                 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1478                                 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1479
1480                                 // Split v up into a surrogate pair.
1481                                 uchar16_t x = static_cast<uchar16_t>(v);
1482                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1483                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1484
1485                                 array[pos++] = vh;
1486                                 array[pos++] = vl;
1487                                 l += 4;
1488                                 ++used;         // Using two shorts this time, so increase used by 1.
1489                         }
1490                         else if ((c2[l] & 0xF0) == 0xE0)
1491                         {       // 3 bytes UTF-8, 1 byte UTF-16.
1492                                 // Check for a full string.
1493                                 if ((l + 2) >= len)
1494                                 {
1495                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1496                                         l += 2;
1497                                         break;
1498                                 }
1499
1500                                 // Validate.
1501                                 bool valid = true;
1502                                 u8 l2 = 0;
1503                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1504                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1505                                 if (!valid)
1506                                 {
1507                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1508                                         l += l2;
1509                                         continue;
1510                                 }
1511
1512                                 // Decode.
1513                                 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1514                                 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1515                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1516                                 array[pos++] = ch;
1517                                 l += 3;
1518                         }
1519                         else if ((c2[l] & 0xE0) == 0xC0)
1520                         {       // 2 bytes UTF-8, 1 byte UTF-16.
1521                                 // Check for a full string.
1522                                 if ((l + 1) >= len)
1523                                 {
1524                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1525                                         l += 1;
1526                                         break;
1527                                 }
1528
1529                                 // Validate.
1530                                 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1531                                 {
1532                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1533                                         ++l;
1534                                         continue;
1535                                 }
1536
1537                                 // Decode.
1538                                 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1539                                 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1540                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1541                                 array[pos++] = ch;
1542                                 l += 2;
1543                         }
1544                         else
1545                         {       // 1 byte UTF-8, 1 byte UTF-16.
1546                                 // Validate.
1547                                 if (c2[l] > 0x7F)
1548                                 {       // Values above 0xF4 are restricted and aren't used.  By now, anything above 0x7F is invalid.
1549                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1550                                 }
1551                                 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1552                                 ++l;
1553                         }
1554                 }
1555                 array[used] = 0;
1556
1557                 // Validate our new UTF-16 string.
1558                 validate();
1559
1560                 return *this;
1561         }
1562
1563
1564         //! Appends a UTF-16 string to this ustring16
1565         //! \param other The UTF-16 string to append.
1566         //! \param length The length of the string to append.
1567         //! \return A reference to our current string.
1568         ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1569         {
1570                 if (!other)
1571                         return *this;
1572
1573                 // Determine if the string is long enough for a BOM.
1574                 u32 len = 0;
1575                 const uchar16_t* p = other;
1576                 do
1577                 {
1578                         ++len;
1579                 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1580
1581                 // Check for the BOM to determine the string's endianness.
1582                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1583                 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1584                         c_end = unicode::EUTFEE_LITTLE;
1585                 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1586                         c_end = unicode::EUTFEE_BIG;
1587
1588                 // If a BOM was found, don't include it in the string.
1589                 const uchar16_t* c2 = other;
1590                 if (c_end != unicode::EUTFEE_NATIVE)
1591                 {
1592                         c2 = other + unicode::BOM_UTF16_LEN;
1593                         length -= unicode::BOM_UTF16_LEN;
1594                 }
1595
1596                 // Calculate the size of the string to read in.
1597                 len = 0;
1598                 p = c2;
1599                 do
1600                 {
1601                         ++len;
1602                 } while(*p++ && len < length);
1603                 if (len > length)
1604                         len = length;
1605
1606                 // If we need to grow the size of the array, do it now.
1607                 if (used + len >= allocated)
1608                         reallocate(used + (len * 2));
1609                 u32 start = used;
1610                 used += len;
1611
1612                 // Copy the string now.
1613                 unicode::EUTF_ENDIAN m_end = getEndianness();
1614                 for (u32 l = start; l < start + len; ++l)
1615                 {
1616                         array[l] = (uchar16_t)c2[l];
1617                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1618                                 array[l] = unicode::swapEndian16(array[l]);
1619                 }
1620
1621                 array[used] = 0;
1622
1623                 // Validate our new UTF-16 string.
1624                 validate();
1625                 return *this;
1626         }
1627
1628
1629         //! Appends a UTF-32 string to this ustring16
1630         //! \param other The UTF-32 string to append.
1631         //! \param length The length of the string to append.
1632         //! \return A reference to our current string.
1633         ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1634         {
1635                 if (!other)
1636                         return *this;
1637
1638                 // Check for the BOM to determine the string's endianness.
1639                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1640                 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1641                         c_end = unicode::EUTFEE_LITTLE;
1642                 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1643                         c_end = unicode::EUTFEE_BIG;
1644
1645                 // If a BOM was found, don't include it in the string.
1646                 const uchar32_t* c2 = other;
1647                 if (c_end != unicode::EUTFEE_NATIVE)
1648                 {
1649                         c2 = other + unicode::BOM_UTF32_LEN;
1650                         length -= unicode::BOM_UTF32_LEN;
1651                 }
1652
1653                 // Calculate the size of the string to read in.
1654                 u32 len = 0;
1655                 const uchar32_t* p = c2;
1656                 do
1657                 {
1658                         ++len;
1659                 } while(*p++ && len < length);
1660                 if (len > length)
1661                         len = length;
1662
1663                 // If we need to grow the size of the array, do it now.
1664                 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1665                 if (used + (len * 2) >= allocated)
1666                         reallocate(used + ((len * 2) * 2));
1667                 u32 start = used;
1668
1669                 // Convert UTF-32 to UTF-16.
1670                 unicode::EUTF_ENDIAN m_end = getEndianness();
1671                 u32 pos = start;
1672                 for (u32 l = 0; l<len; ++l)
1673                 {
1674                         ++used;
1675
1676                         uchar32_t ch = c2[l];
1677                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1678                                 ch = unicode::swapEndian32(ch);
1679
1680                         if (ch > 0xFFFF)
1681                         {
1682                                 // Split ch up into a surrogate pair as it is over 16 bits long.
1683                                 uchar16_t x = static_cast<uchar16_t>(ch);
1684                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1685                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1686                                 array[pos++] = vh;
1687                                 array[pos++] = vl;
1688                                 ++used;         // Using two shorts, so increased used again.
1689                         }
1690                         else if (ch >= 0xD800 && ch <= 0xDFFF)
1691                         {
1692                                 // Between possible UTF-16 surrogates (invalid!)
1693                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1694                         }
1695                         else array[pos++] = static_cast<uchar16_t>(ch);
1696                 }
1697                 array[used] = 0;
1698
1699                 // Validate our new UTF-16 string.
1700                 validate();
1701
1702                 return *this;
1703         }
1704
1705
1706         //! Appends a ustring16 to this ustring16
1707         //! \param other The string to append to this one.
1708         //! \return A reference to our current string.
1709         ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1710         {
1711                 const uchar16_t* oa = other.c_str();
1712
1713                 u32 len = other.size_raw();
1714
1715                 if (used + len >= allocated)
1716                         reallocate(used + len);
1717
1718                 for (u32 l=0; l<len; ++l)
1719                         array[used+l] = oa[l];
1720
1721                 used += len;
1722                 array[used] = 0;
1723
1724                 return *this;
1725         }
1726
1727
1728         //! Appends a certain amount of characters of a ustring16 to this ustring16.
1729         //! \param other The string to append to this one.
1730         //! \param length How many characters of the other string to add to this one.
1731         //! \return A reference to our current string.
1732         ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1733         {
1734                 if (other.size() == 0)
1735                         return *this;
1736
1737                 if (other.size() < length)
1738                 {
1739                         append(other);
1740                         return *this;
1741                 }
1742
1743                 if (used + length * 2 >= allocated)
1744                         reallocate(used + length * 2);
1745
1746                 const_iterator iter(other, 0);
1747                 u32 l = length;
1748                 while (!iter.atEnd() && l)
1749                 {
1750                         uchar32_t c = *iter;
1751                         append(c);
1752                         ++iter;
1753                         --l;
1754                 }
1755
1756                 return *this;
1757         }
1758
1759
1760         //! Reserves some memory.
1761         //! \param count The amount of characters to reserve.
1762         void reserve(u32 count)
1763         {
1764                 if (count < allocated)
1765                         return;
1766
1767                 reallocate(count);
1768         }
1769
1770
1771         //! Finds first occurrence of character.
1772         //! \param c The character to search for.
1773         //! \return Position where the character has been found, or -1 if not found.
1774         s32 findFirst(uchar32_t c) const
1775         {
1776                 const_iterator i(*this, 0);
1777
1778                 s32 pos = 0;
1779                 while (!i.atEnd())
1780                 {
1781                         uchar32_t t = *i;
1782                         if (c == t)
1783                                 return pos;
1784                         ++pos;
1785                         ++i;
1786                 }
1787
1788                 return -1;
1789         }
1790
1791         //! Finds first occurrence of a character of a list.
1792         //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1793         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1794         //! \return Position where one of the characters has been found, or -1 if not found.
1795         s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1796         {
1797                 if (!c || !count)
1798                         return -1;
1799
1800                 const_iterator i(*this, 0);
1801
1802                 s32 pos = 0;
1803                 while (!i.atEnd())
1804                 {
1805                         uchar32_t t = *i;
1806                         for (u32 j=0; j<count; ++j)
1807                                 if (t == c[j])
1808                                         return pos;
1809                         ++pos;
1810                         ++i;
1811                 }
1812
1813                 return -1;
1814         }
1815
1816
1817         //! Finds first position of a character not in a given list.
1818         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1819         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1820         //! \return Position where the character has been found, or -1 if not found.
1821         s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1822         {
1823                 if (!c || !count)
1824                         return -1;
1825
1826                 const_iterator i(*this, 0);
1827
1828                 s32 pos = 0;
1829                 while (!i.atEnd())
1830                 {
1831                         uchar32_t t = *i;
1832                         u32 j;
1833                         for (j=0; j<count; ++j)
1834                                 if (t == c[j])
1835                                         break;
1836
1837                         if (j==count)
1838                                 return pos;
1839                         ++pos;
1840                         ++i;
1841                 }
1842
1843                 return -1;
1844         }
1845
1846         //! Finds last position of a character not in a given list.
1847         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1848         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1849         //! \return Position where the character has been found, or -1 if not found.
1850         s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1851         {
1852                 if (!c || !count)
1853                         return -1;
1854
1855                 const_iterator i(end());
1856                 --i;
1857
1858                 s32 pos = size() - 1;
1859                 while (!i.atStart())
1860                 {
1861                         uchar32_t t = *i;
1862                         u32 j;
1863                         for (j=0; j<count; ++j)
1864                                 if (t == c[j])
1865                                         break;
1866
1867                         if (j==count)
1868                                 return pos;
1869                         --pos;
1870                         --i;
1871                 }
1872
1873                 return -1;
1874         }
1875
1876         //! Finds next occurrence of character.
1877         //! \param c The character to search for.
1878         //! \param startPos The position in the string to start searching.
1879         //! \return Position where the character has been found, or -1 if not found.
1880         s32 findNext(uchar32_t c, u32 startPos) const
1881         {
1882                 const_iterator i(*this, startPos);
1883
1884                 s32 pos = startPos;
1885                 while (!i.atEnd())
1886                 {
1887                         uchar32_t t = *i;
1888                         if (t == c)
1889                                 return pos;
1890                         ++pos;
1891                         ++i;
1892                 }
1893
1894                 return -1;
1895         }
1896
1897
1898         //! Finds last occurrence of character.
1899         //! \param c The character to search for.
1900         //! \param start The start position of the reverse search ( default = -1, on end ).
1901         //! \return Position where the character has been found, or -1 if not found.
1902         s32 findLast(uchar32_t c, s32 start = -1) const
1903         {
1904                 u32 s = size();
1905                 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1906
1907                 const_iterator i(*this, start);
1908                 u32 pos = start;
1909                 while (!i.atStart())
1910                 {
1911                         uchar32_t t = *i;
1912                         if (t == c)
1913                                 return pos;
1914                         --pos;
1915                         --i;
1916                 }
1917
1918                 return -1;
1919         }
1920
1921         //! Finds last occurrence of a character in a list.
1922         //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1923         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1924         //! \return Position where one of the characters has been found, or -1 if not found.
1925         s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1926         {
1927                 if (!c || !count)
1928                         return -1;
1929
1930                 const_iterator i(end());
1931                 --i;
1932
1933                 s32 pos = size();
1934                 while (!i.atStart())
1935                 {
1936                         uchar32_t t = *i;
1937                         for (u32 j=0; j<count; ++j)
1938                                 if (t == c[j])
1939                                         return pos;
1940                         --pos;
1941                         --i;
1942                 }
1943
1944                 return -1;
1945         }
1946
1947
1948         //! Finds another ustring16 in this ustring16.
1949         //! \param str The string to find.
1950         //! \param start The start position of the search.
1951         //! \return Positions where the ustring16 has been found, or -1 if not found.
1952         s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1953         {
1954                 u32 my_size = size();
1955                 u32 their_size = str.size();
1956
1957                 if (their_size == 0 || my_size - start < their_size)
1958                         return -1;
1959
1960                 const_iterator i(*this, start);
1961
1962                 s32 pos = start;
1963                 while (!i.atEnd())
1964                 {
1965                         const_iterator i2(i);
1966                         const_iterator j(str, 0);
1967                         uchar32_t t1 = (uchar32_t)*i2;
1968                         uchar32_t t2 = (uchar32_t)*j;
1969                         while (t1 == t2)
1970                         {
1971                                 ++i2;
1972                                 ++j;
1973                                 if (j.atEnd())
1974                                         return pos;
1975                                 t1 = (uchar32_t)*i2;
1976                                 t2 = (uchar32_t)*j;
1977                         }
1978                         ++i;
1979                         ++pos;
1980                 }
1981
1982                 return -1;
1983         }
1984
1985
1986         //! Finds another ustring16 in this ustring16.
1987         //! \param str The string to find.
1988         //! \param start The start position of the search.
1989         //! \return Positions where the string has been found, or -1 if not found.
1990         s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1991         {
1992                 const uchar16_t* data = str.c_str();
1993                 if (data && *data)
1994                 {
1995                         u32 len = 0;
1996
1997                         while (data[len])
1998                                 ++len;
1999
2000                         if (len > used)
2001                                 return -1;
2002
2003                         for (u32 i=start; i<=used-len; ++i)
2004                         {
2005                                 u32 j=0;
2006
2007                                 while(data[j] && array[i+j] == data[j])
2008                                         ++j;
2009
2010                                 if (!data[j])
2011                                         return i;
2012                         }
2013                 }
2014
2015                 return -1;
2016         }
2017
2018
2019         //! Returns a substring.
2020         //! \param begin: Start of substring.
2021         //! \param length: Length of substring.
2022         //! \return A reference to our current string.
2023         ustring16<TAlloc> subString(u32 begin, s32 length) const
2024         {
2025                 u32 len = size();
2026                 // if start after ustring16
2027                 // or no proper substring length
2028                 if ((length <= 0) || (begin>=len))
2029                         return ustring16<TAlloc>("");
2030                 // clamp length to maximal value
2031                 if ((length+begin) > len)
2032                         length = len-begin;
2033
2034                 ustring16<TAlloc> o;
2035                 o.reserve((length+1) * 2);
2036
2037                 const_iterator i(*this, begin);
2038                 while (!i.atEnd() && length)
2039                 {
2040                         o.append(*i);
2041                         ++i;
2042                         --length;
2043                 }
2044
2045                 return o;
2046         }
2047
2048
2049         //! Appends a character to this ustring16.
2050         //! \param c Character to append.
2051         //! \return A reference to our current string.
2052         ustring16<TAlloc>& operator += (char c)
2053         {
2054                 append((uchar32_t)c);
2055                 return *this;
2056         }
2057
2058
2059         //! Appends a character to this ustring16.
2060         //! \param c Character to append.
2061         //! \return A reference to our current string.
2062         ustring16<TAlloc>& operator += (uchar32_t c)
2063         {
2064                 append(c);
2065                 return *this;
2066         }
2067
2068
2069         //! Appends a number to this ustring16.
2070         //! \param c Number to append.
2071         //! \return A reference to our current string.
2072         ustring16<TAlloc>& operator += (short c)
2073         {
2074                 append(core::stringc(c));
2075                 return *this;
2076         }
2077
2078
2079         //! Appends a number to this ustring16.
2080         //! \param c Number to append.
2081         //! \return A reference to our current string.
2082         ustring16<TAlloc>& operator += (unsigned short c)
2083         {
2084                 append(core::stringc(c));
2085                 return *this;
2086         }
2087
2088
2089 #ifdef USTRING_CPP0X_NEWLITERALS
2090         //! Appends a number to this ustring16.
2091         //! \param c Number to append.
2092         //! \return A reference to our current string.
2093         ustring16<TAlloc>& operator += (int c)
2094         {
2095                 append(core::stringc(c));
2096                 return *this;
2097         }
2098
2099
2100         //! Appends a number to this ustring16.
2101         //! \param c Number to append.
2102         //! \return A reference to our current string.
2103         ustring16<TAlloc>& operator += (unsigned int c)
2104         {
2105                 append(core::stringc(c));
2106                 return *this;
2107         }
2108 #endif
2109
2110
2111         //! Appends a number to this ustring16.
2112         //! \param c Number to append.
2113         //! \return A reference to our current string.
2114         ustring16<TAlloc>& operator += (long c)
2115         {
2116                 append(core::stringc(c));
2117                 return *this;
2118         }
2119
2120
2121         //! Appends a number to this ustring16.
2122         //! \param c Number to append.
2123         //! \return A reference to our current string.
2124         ustring16<TAlloc>& operator += (unsigned long c)
2125         {
2126                 append(core::stringc(c));
2127                 return *this;
2128         }
2129
2130
2131         //! Appends a number to this ustring16.
2132         //! \param c Number to append.
2133         //! \return A reference to our current string.
2134         ustring16<TAlloc>& operator += (double c)
2135         {
2136                 append(core::stringc(c));
2137                 return *this;
2138         }
2139
2140
2141         //! Appends a char ustring16 to this ustring16.
2142         //! \param c Char ustring16 to append.
2143         //! \return A reference to our current string.
2144         ustring16<TAlloc>& operator += (const uchar16_t* const c)
2145         {
2146                 append(c);
2147                 return *this;
2148         }
2149
2150
2151         //! Appends a ustring16 to this ustring16.
2152         //! \param other ustring16 to append.
2153         //! \return A reference to our current string.
2154         ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2155         {
2156                 append(other);
2157                 return *this;
2158         }
2159
2160
2161         //! Replaces all characters of a given type with another one.
2162         //! \param toReplace Character to replace.
2163         //! \param replaceWith Character replacing the old one.
2164         //! \return A reference to our current string.
2165         ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2166         {
2167                 iterator i(*this, 0);
2168                 while (!i.atEnd())
2169                 {
2170                         typename ustring16<TAlloc>::access a = *i;
2171                         if ((uchar32_t)a == toReplace)
2172                                 a = replaceWith;
2173                         ++i;
2174                 }
2175                 return *this;
2176         }
2177
2178
2179         //! Replaces all instances of a string with another one.
2180         //! \param toReplace The string to replace.
2181         //! \param replaceWith The string replacing the old one.
2182         //! \return A reference to our current string.
2183         ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2184         {
2185                 if (toReplace.size() == 0)
2186                         return *this;
2187
2188                 const uchar16_t* other = toReplace.c_str();
2189                 const uchar16_t* replace = replaceWith.c_str();
2190                 const u32 other_size = toReplace.size_raw();
2191                 const u32 replace_size = replaceWith.size_raw();
2192
2193                 // Determine the delta.  The algorithm will change depending on the delta.
2194                 s32 delta = replace_size - other_size;
2195
2196                 // A character for character replace.  The string will not shrink or grow.
2197                 if (delta == 0)
2198                 {
2199                         s32 pos = 0;
2200                         while ((pos = find_raw(other, pos)) != -1)
2201                         {
2202                                 for (u32 i = 0; i < replace_size; ++i)
2203                                         array[pos + i] = replace[i];
2204                                 ++pos;
2205                         }
2206                         return *this;
2207                 }
2208
2209                 // We are going to be removing some characters.  The string will shrink.
2210                 if (delta < 0)
2211                 {
2212                         u32 i = 0;
2213                         for (u32 pos = 0; pos <= used; ++i, ++pos)
2214                         {
2215                                 // Is this potentially a match?
2216                                 if (array[pos] == *other)
2217                                 {
2218                                         // Check to see if we have a match.
2219                                         u32 j;
2220                                         for (j = 0; j < other_size; ++j)
2221                                         {
2222                                                 if (array[pos + j] != other[j])
2223                                                         break;
2224                                         }
2225
2226                                         // If we have a match, replace characters.
2227                                         if (j == other_size)
2228                                         {
2229                                                 for (j = 0; j < replace_size; ++j)
2230                                                         array[i + j] = replace[j];
2231                                                 i += replace_size - 1;
2232                                                 pos += other_size - 1;
2233                                                 continue;
2234                                         }
2235                                 }
2236
2237                                 // No match found, just copy characters.
2238                                 array[i - 1] = array[pos];
2239                         }
2240                         array[i] = 0;
2241                         used = i;
2242
2243                         return *this;
2244                 }
2245
2246                 // We are going to be adding characters, so the string size will increase.
2247                 // Count the number of times toReplace exists in the string so we can allocate the new size.
2248                 u32 find_count = 0;
2249                 s32 pos = 0;
2250                 while ((pos = find_raw(other, pos)) != -1)
2251                 {
2252                         ++find_count;
2253                         ++pos;
2254                 }
2255
2256                 // Re-allocate the string now, if needed.
2257                 u32 len = delta * find_count;
2258                 if (used + len >= allocated)
2259                         reallocate(used + len);
2260
2261                 // Start replacing.
2262                 pos = 0;
2263                 while ((pos = find_raw(other, pos)) != -1)
2264                 {
2265                         uchar16_t* start = array + pos + other_size - 1;
2266                         uchar16_t* ptr   = array + used;
2267                         uchar16_t* end   = array + used + delta;
2268
2269                         // Shift characters to make room for the string.
2270                         while (ptr != start)
2271                         {
2272                                 *end = *ptr;
2273                                 --ptr;
2274                                 --end;
2275                         }
2276
2277                         // Add the new string now.
2278                         for (u32 i = 0; i < replace_size; ++i)
2279                                 array[pos + i] = replace[i];
2280
2281                         pos += replace_size;
2282                         used += delta;
2283                 }
2284
2285                 // Terminate the string and return ourself.
2286                 array[used] = 0;
2287                 return *this;
2288         }
2289
2290
2291         //! Removes characters from a ustring16..
2292         //! \param c The character to remove.
2293         //! \return A reference to our current string.
2294         ustring16<TAlloc>& remove(uchar32_t c)
2295         {
2296                 u32 pos = 0;
2297                 u32 found = 0;
2298                 u32 len = (c > 0xFFFF ? 2 : 1);         // Remove characters equal to the size of c as a UTF-16 character.
2299                 for (u32 i=0; i<=used; ++i)
2300                 {
2301                         uchar32_t uc32 = 0;
2302                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2303                                 uc32 |= array[i];
2304                         else if (i + 1 <= used)
2305                         {
2306                                 // Convert the surrogate pair into a single UTF-32 character.
2307                                 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2308                         }
2309                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2310
2311                         if (uc32 == c)
2312                         {
2313                                 found += len;
2314                                 continue;
2315                         }
2316
2317                         array[pos++] = array[i];
2318                         if (len2 == 2)
2319                                 array[pos++] = array[++i];
2320                 }
2321                 used -= found;
2322                 array[used] = 0;
2323                 return *this;
2324         }
2325
2326
2327         //! Removes a ustring16 from the ustring16.
2328         //! \param toRemove The string to remove.
2329         //! \return A reference to our current string.
2330         ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2331         {
2332                 u32 size = toRemove.size_raw();
2333                 if (size == 0) return *this;
2334
2335                 const uchar16_t* tra = toRemove.c_str();
2336                 u32 pos = 0;
2337                 u32 found = 0;
2338                 for (u32 i=0; i<=used; ++i)
2339                 {
2340                         u32 j = 0;
2341                         while (j < size)
2342                         {
2343                                 if (array[i + j] != tra[j])
2344                                         break;
2345                                 ++j;
2346                         }
2347                         if (j == size)
2348                         {
2349                                 found += size;
2350                                 i += size - 1;
2351                                 continue;
2352                         }
2353
2354                         array[pos++] = array[i];
2355                 }
2356                 used -= found;
2357                 array[used] = 0;
2358                 return *this;
2359         }
2360
2361
2362         //! Removes characters from the ustring16.
2363         //! \param characters The characters to remove.
2364         //! \return A reference to our current string.
2365         ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2366         {
2367                 if (characters.size_raw() == 0)
2368                         return *this;
2369
2370                 u32 pos = 0;
2371                 u32 found = 0;
2372                 const_iterator iter(characters);
2373                 for (u32 i=0; i<=used; ++i)
2374                 {
2375                         uchar32_t uc32 = 0;
2376                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2377                                 uc32 |= array[i];
2378                         else if (i + 1 <= used)
2379                         {
2380                                 // Convert the surrogate pair into a single UTF-32 character.
2381                                 uc32 = unicode::toUTF32(array[i], array[i+1]);
2382                         }
2383                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2384
2385                         bool cont = false;
2386                         iter.toStart();
2387                         while (!iter.atEnd())
2388                         {
2389                                 uchar32_t c = *iter;
2390                                 if (uc32 == c)
2391                                 {
2392                                         found += (c > 0xFFFF ? 2 : 1);          // Remove characters equal to the size of c as a UTF-16 character.
2393                                         ++i;
2394                                         cont = true;
2395                                         break;
2396                                 }
2397                                 ++iter;
2398                         }
2399                         if (cont) continue;
2400
2401                         array[pos++] = array[i];
2402                         if (len2 == 2)
2403                                 array[pos++] = array[++i];
2404                 }
2405                 used -= found;
2406                 array[used] = 0;
2407                 return *this;
2408         }
2409
2410
2411         //! Trims the ustring16.
2412         //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2413         //! \param whitespace The characters that are to be considered as whitespace.
2414         //! \return A reference to our current string.
2415         ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2416         {
2417                 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2418
2419                 // find start and end of the substring without the specified characters
2420                 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2421                 if (begin == -1)
2422                         return (*this="");
2423
2424                 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2425
2426                 return (*this = subString(begin, (end +1) - begin));
2427         }
2428
2429
2430         //! Erases a character from the ustring16.
2431         //! May be slow, because all elements following after the erased element have to be copied.
2432         //! \param index Index of element to be erased.
2433         //! \return A reference to our current string.
2434         ustring16<TAlloc>& erase(u32 index)
2435         {
2436                 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2437
2438                 iterator i(*this, index);
2439
2440                 uchar32_t t = *i;
2441                 u32 len = (t > 0xFFFF ? 2 : 1);
2442
2443                 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2444                         array[j - len] = array[j];
2445
2446                 used -= len;
2447                 array[used] = 0;
2448
2449                 return *this;
2450         }
2451
2452
2453         //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2454         //! \return A reference to our current string.
2455         ustring16<TAlloc>& validate()
2456         {
2457                 // Validate all unicode characters.
2458                 for (u32 i=0; i<allocated; ++i)
2459                 {
2460                         // Terminate on existing null.
2461                         if (array[i] == 0)
2462                         {
2463                                 used = i;
2464                                 return *this;
2465                         }
2466                         if (UTF16_IS_SURROGATE(array[i]))
2467                         {
2468                                 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2469                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2470                                 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2471                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2472                                 ++i;
2473                         }
2474                         if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2475                                 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2476                 }
2477
2478                 // terminate
2479                 used = 0;
2480                 if (allocated > 0)
2481                 {
2482                         used = allocated - 1;
2483                         array[used] = 0;
2484                 }
2485                 return *this;
2486         }
2487
2488
2489         //! Gets the last char of the ustring16, or 0.
2490         //! \return The last char of the ustring16, or 0.
2491         uchar32_t lastChar() const
2492         {
2493                 if (used < 1)
2494                         return 0;
2495
2496                 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2497                 {
2498                         // Make sure we have a paired surrogate.
2499                         if (used < 2)
2500                                 return 0;
2501
2502                         // Check for an invalid surrogate.
2503                         if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2504                                 return 0;
2505
2506                         // Convert the surrogate pair into a single UTF-32 character.
2507                         return unicode::toUTF32(array[used-2], array[used-1]);
2508                 }
2509                 else
2510                 {
2511                         return array[used-1];
2512                 }
2513         }
2514
2515
2516         //! Split the ustring16 into parts.
2517         /** This method will split a ustring16 at certain delimiter characters
2518         into the container passed in as reference. The type of the container
2519         has to be given as template parameter. It must provide a push_back and
2520         a size method.
2521         \param ret The result container
2522         \param c C-style ustring16 of delimiter characters
2523         \param count Number of delimiter characters
2524         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2525         container. If two delimiters occur without a character in between, an
2526         empty substring would be placed in the result. If this flag is set,
2527         only non-empty strings are stored.
2528         \param keepSeparators Flag which allows to add the separator to the
2529         result ustring16. If this flag is true, the concatenation of the
2530         substrings results in the original ustring16. Otherwise, only the
2531         characters between the delimiters are returned.
2532         \return The number of resulting substrings
2533         */
2534         template<class container>
2535         u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2536         {
2537                 if (!c)
2538                         return 0;
2539
2540                 const_iterator i(*this);
2541                 const u32 oldSize=ret.size();
2542                 u32 pos = 0;
2543                 u32 lastpos = 0;
2544                 u32 lastpospos = 0;
2545                 bool lastWasSeparator = false;
2546                 while (!i.atEnd())
2547                 {
2548                         uchar32_t ch = *i;
2549                         bool foundSeparator = false;
2550                         for (u32 j=0; j<count; ++j)
2551                         {
2552                                 if (ch == c[j])
2553                                 {
2554                                         if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2555                                                         !lastWasSeparator)
2556                                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2557                                         foundSeparator = true;
2558                                         lastpos = (keepSeparators ? pos : pos + 1);
2559                                         lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2560                                         break;
2561                                 }
2562                         }
2563                         lastWasSeparator = foundSeparator;
2564                         ++pos;
2565                         ++i;
2566                 }
2567                 u32 s = size() + 1;
2568                 if (s > lastpos)
2569                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2570                 return ret.size()-oldSize;
2571         }
2572
2573
2574         //! Split the ustring16 into parts.
2575         /** This method will split a ustring16 at certain delimiter characters
2576         into the container passed in as reference. The type of the container
2577         has to be given as template parameter. It must provide a push_back and
2578         a size method.
2579         \param ret The result container
2580         \param c A unicode string of delimiter characters
2581         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2582         container. If two delimiters occur without a character in between, an
2583         empty substring would be placed in the result. If this flag is set,
2584         only non-empty strings are stored.
2585         \param keepSeparators Flag which allows to add the separator to the
2586         result ustring16. If this flag is true, the concatenation of the
2587         substrings results in the original ustring16. Otherwise, only the
2588         characters between the delimiters are returned.
2589         \return The number of resulting substrings
2590         */
2591         template<class container>
2592         u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2593         {
2594                 core::array<uchar32_t> v = c.toUTF32();
2595                 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2596         }
2597
2598
2599         //! Gets the size of the allocated memory buffer for the string.
2600         //! \return The size of the allocated memory buffer.
2601         u32 capacity() const
2602         {
2603                 return allocated;
2604         }
2605
2606
2607         //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2608         //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2609         u32 size_raw() const
2610         {
2611                 return used;
2612         }
2613
2614
2615         //! Inserts a character into the string.
2616         //! \param c The character to insert.
2617         //! \param pos The position to insert the character.
2618         //! \return A reference to our current string.
2619         ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2620         {
2621                 u8 len = (c > 0xFFFF ? 2 : 1);
2622
2623                 if (used + len >= allocated)
2624                         reallocate(used + len);
2625
2626                 used += len;
2627
2628                 iterator iter(*this, pos);
2629                 for (u32 i = used - 2; i > iter.getPos(); --i)
2630                         array[i] = array[i - len];
2631
2632                 if (c > 0xFFFF)
2633                 {
2634                         // c will be multibyte, so split it up into a surrogate pair.
2635                         uchar16_t x = static_cast<uchar16_t>(c);
2636                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2637                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2638                         array[iter.getPos()] = vh;
2639                         array[iter.getPos()+1] = vl;
2640                 }
2641                 else
2642                 {
2643                         array[iter.getPos()] = static_cast<uchar16_t>(c);
2644                 }
2645                 array[used] = 0;
2646                 return *this;
2647         }
2648
2649
2650         //! Inserts a string into the string.
2651         //! \param c The string to insert.
2652         //! \param pos The position to insert the string.
2653         //! \return A reference to our current string.
2654         ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2655         {
2656                 u32 len = c.size_raw();
2657                 if (len == 0) return *this;
2658
2659                 if (used + len >= allocated)
2660                         reallocate(used + len);
2661
2662                 used += len;
2663
2664                 iterator iter(*this, pos);
2665                 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2666                         array[i] = array[i - len];
2667
2668                 const uchar16_t* s = c.c_str();
2669                 for (u32 i = 0; i < len; ++i)
2670                 {
2671                         array[pos++] = *s;
2672                         ++s;
2673                 }
2674
2675                 array[used] = 0;
2676                 return *this;
2677         }
2678
2679
2680         //! Inserts a character into the string.
2681         //! \param c The character to insert.
2682         //! \param pos The position to insert the character.
2683         //! \return A reference to our current string.
2684         ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2685         {
2686                 if (used + 1 >= allocated)
2687                         reallocate(used + 1);
2688
2689                 ++used;
2690
2691                 for (u32 i = used - 1; i > pos; --i)
2692                         array[i] = array[i - 1];
2693
2694                 array[pos] = c;
2695                 array[used] = 0;
2696                 return *this;
2697         }
2698
2699
2700         //! Removes a character from string.
2701         //! \param pos Position of the character to remove.
2702         //! \return A reference to our current string.
2703         ustring16<TAlloc>& erase_raw(u32 pos)
2704         {
2705                 for (u32 i=pos; i<=used; ++i)
2706                 {
2707                         array[i] = array[i + 1];
2708                 }
2709                 --used;
2710                 array[used] = 0;
2711                 return *this;
2712         }
2713
2714
2715         //! Replaces a character in the string.
2716         //! \param c The new character.
2717         //! \param pos The position of the character to replace.
2718         //! \return A reference to our current string.
2719         ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2720         {
2721                 array[pos] = c;
2722                 return *this;
2723         }
2724
2725
2726         //! Returns an iterator to the beginning of the string.
2727         //! \return An iterator to the beginning of the string.
2728         iterator begin()
2729         {
2730                 iterator i(*this, 0);
2731                 return i;
2732         }
2733
2734
2735         //! Returns an iterator to the beginning of the string.
2736         //! \return An iterator to the beginning of the string.
2737         const_iterator begin() const
2738         {
2739                 const_iterator i(*this, 0);
2740                 return i;
2741         }
2742
2743
2744         //! Returns an iterator to the beginning of the string.
2745         //! \return An iterator to the beginning of the string.
2746         const_iterator cbegin() const
2747         {
2748                 const_iterator i(*this, 0);
2749                 return i;
2750         }
2751
2752
2753         //! Returns an iterator to the end of the string.
2754         //! \return An iterator to the end of the string.
2755         iterator end()
2756         {
2757                 iterator i(*this, 0);
2758                 i.toEnd();
2759                 return i;
2760         }
2761
2762
2763         //! Returns an iterator to the end of the string.
2764         //! \return An iterator to the end of the string.
2765         const_iterator end() const
2766         {
2767                 const_iterator i(*this, 0);
2768                 i.toEnd();
2769                 return i;
2770         }
2771
2772
2773         //! Returns an iterator to the end of the string.
2774         //! \return An iterator to the end of the string.
2775         const_iterator cend() const
2776         {
2777                 const_iterator i(*this, 0);
2778                 i.toEnd();
2779                 return i;
2780         }
2781
2782
2783         //! Converts the string to a UTF-8 encoded string.
2784         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2785         //! \return A string containing the UTF-8 encoded string.
2786         core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2787         {
2788                 core::string<uchar8_t> ret;
2789                 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2790                 const_iterator iter(*this, 0);
2791
2792                 // Add the byte order mark if the user wants it.
2793                 if (addBOM)
2794                 {
2795                         ret.append(unicode::BOM_ENCODE_UTF8[0]);
2796                         ret.append(unicode::BOM_ENCODE_UTF8[1]);
2797                         ret.append(unicode::BOM_ENCODE_UTF8[2]);
2798                 }
2799
2800                 while (!iter.atEnd())
2801                 {
2802                         uchar32_t c = *iter;
2803                         if (c > 0xFFFF)
2804                         {       // 4 bytes
2805                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2806                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2807                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2808                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2809                                 ret.append(b1);
2810                                 ret.append(b2);
2811                                 ret.append(b3);
2812                                 ret.append(b4);
2813                         }
2814                         else if (c > 0x7FF)
2815                         {       // 3 bytes
2816                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2817                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2818                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2819                                 ret.append(b1);
2820                                 ret.append(b2);
2821                                 ret.append(b3);
2822                         }
2823                         else if (c > 0x7F)
2824                         {       // 2 bytes
2825                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2826                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2827                                 ret.append(b1);
2828                                 ret.append(b2);
2829                         }
2830                         else
2831                         {       // 1 byte
2832                                 ret.append(static_cast<uchar8_t>(c));
2833                         }
2834                         ++iter;
2835                 }
2836                 return ret;
2837         }
2838
2839
2840         //! Converts the string to a UTF-8 encoded string array.
2841         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2842         //! \return An array containing the UTF-8 encoded string.
2843         core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2844         {
2845                 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2846                 const_iterator iter(*this, 0);
2847
2848                 // Add the byte order mark if the user wants it.
2849                 if (addBOM)
2850                 {
2851                         ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2852                         ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2853                         ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2854                 }
2855
2856                 while (!iter.atEnd())
2857                 {
2858                         uchar32_t c = *iter;
2859                         if (c > 0xFFFF)
2860                         {       // 4 bytes
2861                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2862                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2863                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2864                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2865                                 ret.push_back(b1);
2866                                 ret.push_back(b2);
2867                                 ret.push_back(b3);
2868                                 ret.push_back(b4);
2869                         }
2870                         else if (c > 0x7FF)
2871                         {       // 3 bytes
2872                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2873                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2874                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2875                                 ret.push_back(b1);
2876                                 ret.push_back(b2);
2877                                 ret.push_back(b3);
2878                         }
2879                         else if (c > 0x7F)
2880                         {       // 2 bytes
2881                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2882                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2883                                 ret.push_back(b1);
2884                                 ret.push_back(b2);
2885                         }
2886                         else
2887                         {       // 1 byte
2888                                 ret.push_back(static_cast<uchar8_t>(c));
2889                         }
2890                         ++iter;
2891                 }
2892                 ret.push_back(0);
2893                 return ret;
2894         }
2895
2896
#ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
//! Converts the string to a UTF-16 encoded string.
//! \param endian The desired endianness of the string.
//! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
//! \return A string containing the UTF-16 encoded string.
core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
{
        core::string<char16_t> ret;
        ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);

        // Append the BOM instead of writing it through ret[0]: the string is
        // still empty at this point, so an indexed write would not be counted
        // in its length and would be overwritten by the append below.
        if (addBOM)
                ret.append(static_cast<char16_t>(unicode::BOM));

        ret.append(array);

        // BOM and text were stored in native byte order.  If the other byte
        // order was requested, swap every code unit, BOM included, so the
        // whole result is consistently in the requested order.
        if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
        {
                for (u32 i = 0; i < ret.size(); ++i)
                        ret[i] = unicode::swapEndian16(ret[i]);
        }
        return ret;
}
#endif
2936
2937
2938         //! Converts the string to a UTF-16 encoded string array.
2939         //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2940         //! \param endian The desired endianness of the string.
2941         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2942         //! \return An array containing the UTF-16 encoded string.
2943         core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2944         {
2945                 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2946                 uchar16_t* ptr = ret.pointer();
2947
2948                 // Add the BOM if specified.
2949                 if (addBOM)
2950                 {
2951                         if (endian == unicode::EUTFEE_NATIVE)
2952                                 *ptr = unicode::BOM;
2953                         else if (endian == unicode::EUTFEE_LITTLE)
2954                         {
2955                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2956                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2957                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2958                         }
2959                         else
2960                         {
2961                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2962                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2963                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2964                         }
2965                         ++ptr;
2966                 }
2967
2968                 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2969                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2970                 {
2971                         for (u32 i = 0; i <= used; ++i)
2972                                 ptr[i] = unicode::swapEndian16(ptr[i]);
2973                 }
2974                 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2975                 ret.push_back(0);
2976                 return ret;
2977         }
2978
2979
2980 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
2981         //! Converts the string to a UTF-32 encoded string.
2982         //! \param endian The desired endianness of the string.
2983         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2984         //! \return A string containing the UTF-32 encoded string.
2985         core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2986         {
2987                 core::string<char32_t> ret;
2988                 ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
2989                 const_iterator iter(*this, 0);
2990
2991                 // Add the BOM if specified.
2992                 if (addBOM)
2993                 {
2994                         if (endian == unicode::EUTFEE_NATIVE)
2995                                 ret.append(unicode::BOM);
2996                         else
2997                         {
2998                                 union
2999                                 {
3000                                         uchar32_t full;
3001                                         u8 chunk[4];
3002                                 } t;
3003
3004                                 if (endian == unicode::EUTFEE_LITTLE)
3005                                 {
3006                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3007                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3008                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3009                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3010                                 }
3011                                 else
3012                                 {
3013                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3014                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3015                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3016                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3017                                 }
3018                                 ret.append(t.full);
3019                         }
3020                 }
3021
3022                 while (!iter.atEnd())
3023                 {
3024                         uchar32_t c = *iter;
3025                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3026                                 c = unicode::swapEndian32(c);
3027                         ret.append(c);
3028                         ++iter;
3029                 }
3030                 return ret;
3031         }
3032 #endif
3033
3034
3035         //! Converts the string to a UTF-32 encoded string array.
3036         //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
3037         //! \param endian The desired endianness of the string.
3038         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3039         //! \return An array containing the UTF-32 encoded string.
3040         core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3041         {
3042                 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
3043                 const_iterator iter(*this, 0);
3044
3045                 // Add the BOM if specified.
3046                 if (addBOM)
3047                 {
3048                         if (endian == unicode::EUTFEE_NATIVE)
3049                                 ret.push_back(unicode::BOM);
3050                         else
3051                         {
3052                                 union
3053                                 {
3054                                         uchar32_t full;
3055                                         u8 chunk[4];
3056                                 } t;
3057
3058                                 if (endian == unicode::EUTFEE_LITTLE)
3059                                 {
3060                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3061                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3062                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3063                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3064                                 }
3065                                 else
3066                                 {
3067                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3068                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3069                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3070                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3071                                 }
3072                                 ret.push_back(t.full);
3073                         }
3074                 }
3075                 ret.push_back(0);
3076
3077                 while (!iter.atEnd())
3078                 {
3079                         uchar32_t c = *iter;
3080                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3081                                 c = unicode::swapEndian32(c);
3082                         ret.push_back(c);
3083                         ++iter;
3084                 }
3085                 return ret;
3086         }
3087
3088
3089         //! Converts the string to a wchar_t encoded string.
3090         /** The size of a wchar_t changes depending on the platform.  This function will store a
3091         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3092         //! \param endian The desired endianness of the string.
3093         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3094         //! \return A string containing the wchar_t encoded string.
3095         core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3096         {
3097                 if (sizeof(wchar_t) == 4)
3098                 {
3099                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3100                         core::stringw ret(a.pointer());
3101                         return ret;
3102                 }
3103                 else if (sizeof(wchar_t) == 2)
3104                 {
3105                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3106                         {
3107                                 core::stringw ret(array);
3108                                 return ret;
3109                         }
3110                         else
3111                         {
3112                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3113                                 core::stringw ret(a.pointer());
3114                                 return ret;
3115                         }
3116                 }
3117                 else if (sizeof(wchar_t) == 1)
3118                 {
3119                         core::array<uchar8_t> a(toUTF8(addBOM));
3120                         core::stringw ret(a.pointer());
3121                         return ret;
3122                 }
3123
3124                 // Shouldn't happen.
3125                 return core::stringw();
3126         }
3127
3128
3129         //! Converts the string to a wchar_t encoded string array.
3130         /** The size of a wchar_t changes depending on the platform.  This function will store a
3131         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3132         //! \param endian The desired endianness of the string.
3133         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3134         //! \return An array containing the wchar_t encoded string.
3135         core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3136         {
3137                 if (sizeof(wchar_t) == 4)
3138                 {
3139                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3140                         core::array<wchar_t> ret(a.size());
3141                         ret.set_used(a.size());
3142                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
3143                         return ret;
3144                 }
3145                 if (sizeof(wchar_t) == 2)
3146                 {
3147                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3148                         {
3149                                 core::array<wchar_t> ret(used);
3150                                 ret.set_used(used);
3151                                 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
3152                                 return ret;
3153                         }
3154                         else
3155                         {
3156                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3157                                 core::array<wchar_t> ret(a.size());
3158                                 ret.set_used(a.size());
3159                                 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
3160                                 return ret;
3161                         }
3162                 }
3163                 if (sizeof(wchar_t) == 1)
3164                 {
3165                         core::array<uchar8_t> a(toUTF8(addBOM));
3166                         core::array<wchar_t> ret(a.size());
3167                         ret.set_used(a.size());
3168                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3169                         return ret;
3170                 }
3171
3172                 // Shouldn't happen.
3173                 return core::array<wchar_t>();
3174         }
3175
3176         //! Converts the string to a properly encoded io::path string.
3177         //! \param endian The desired endianness of the string.
3178         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3179         //! \return An io::path string containing the properly encoded string.
3180         io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3181         {
3182 #if defined(_IRR_WCHAR_FILESYSTEM)
3183                 return toWCHAR_s(endian, addBOM);
3184 #else
3185                 return toUTF8_s(addBOM);
3186 #endif
3187         }
3188
3189         //! Loads an unknown stream of data.
3190         //! Will attempt to determine if the stream is unicode data.  Useful for loading from files.
3191         //! \param data The data stream to load from.
3192         //! \param data_size The length of the data string.
3193         //! \return A reference to our current string.
3194         ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3195         {
3196                 // Clear our string.
3197                 *this = "";
3198                 if (!data)
3199                         return *this;
3200
3201                 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3202                 switch (e)
3203                 {
3204                         default:
3205                         case unicode::EUTFE_UTF8:
3206                                 append((uchar8_t*)data, data_size);
3207                                 break;
3208
3209                         case unicode::EUTFE_UTF16:
3210                         case unicode::EUTFE_UTF16_BE:
3211                         case unicode::EUTFE_UTF16_LE:
3212                                 append((uchar16_t*)data, data_size / 2);
3213                                 break;
3214
3215                         case unicode::EUTFE_UTF32:
3216                         case unicode::EUTFE_UTF32_BE:
3217                         case unicode::EUTFE_UTF32_LE:
3218                                 append((uchar32_t*)data, data_size / 4);
3219                                 break;
3220                 }
3221
3222                 return *this;
3223         }
3224
3225         //! Gets the encoding of the Unicode string this class contains.
3226         //! \return An enum describing the current encoding of this string.
3227         const unicode::EUTF_ENCODE getEncoding() const
3228         {
3229                 return encoding;
3230         }
3231
3232         //! Gets the endianness of the Unicode string this class contains.
3233         //! \return An enum describing the endianness of this string.
3234         const unicode::EUTF_ENDIAN getEndianness() const
3235         {
3236                 if (encoding == unicode::EUTFE_UTF16_LE ||
3237                         encoding == unicode::EUTFE_UTF32_LE)
3238                         return unicode::EUTFEE_LITTLE;
3239                 else return unicode::EUTFEE_BIG;
3240         }
3241
3242 private:
3243
3244         //! Reallocate the string, making it bigger or smaller.
3245         //! \param new_size The new size of the string.
3246         void reallocate(u32 new_size)
3247         {
3248                 uchar16_t* old_array = array;
3249
3250                 array = allocator.allocate(new_size + 1); //new u16[new_size];
3251                 allocated = new_size + 1;
3252                 if (old_array == 0) return;
3253
3254                 u32 amount = used < new_size ? used : new_size;
3255                 for (u32 i=0; i<=amount; ++i)
3256                         array[i] = old_array[i];
3257
3258                 if (allocated <= used)
3259                         used = allocated - 1;
3260
3261                 array[used] = 0;
3262
3263                 allocator.deallocate(old_array); // delete [] old_array;
3264         }
3265
        //--- member variables

        //! Storage buffer obtained from the allocator; reallocate() writes a 0 at array[used] after copying.
        uchar16_t* array;
        //! Encoding tag returned by getEncoding() and consulted by getEndianness().
        unicode::EUTF_ENCODE encoding;
        //! Number of elements allocated for array (reallocate() sets this to new_size + 1).
        u32 allocated;
        //! Number of uchar16_t elements in use, not counting the terminating 0.
        u32 used;
        //! Allocator used to allocate and deallocate array.
        TAlloc allocator;
        //irrAllocator<uchar16_t> allocator;
3274 };
3275
//! Convenience name for ustring16 instantiated with irrAllocator<uchar16_t>.
typedef ustring16<irrAllocator<uchar16_t> > ustring;
3277
3278
3279 //! Appends two ustring16s.
3280 template <typename TAlloc>
3281 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3282 {
3283         ustring16<TAlloc> ret(left);
3284         ret += right;
3285         return ret;
3286 }
3287
3288
3289 //! Appends a ustring16 and a null-terminated unicode string.
3290 template <typename TAlloc, class B>
3291 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3292 {
3293         ustring16<TAlloc> ret(left);
3294         ret += right;
3295         return ret;
3296 }
3297
3298
3299 //! Appends a ustring16 and a null-terminated unicode string.
3300 template <class B, typename TAlloc>
3301 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3302 {
3303         ustring16<TAlloc> ret(left);
3304         ret += right;
3305         return ret;
3306 }
3307
3308
3309 //! Appends a ustring16 and an Irrlicht string.
3310 template <typename TAlloc, typename B, typename BAlloc>
3311 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3312 {
3313         ustring16<TAlloc> ret(left);
3314         ret += right;
3315         return ret;
3316 }
3317
3318
3319 //! Appends a ustring16 and an Irrlicht string.
3320 template <typename TAlloc, typename B, typename BAlloc>
3321 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
3322 {
3323         ustring16<TAlloc> ret(left);
3324         ret += right;
3325         return ret;
3326 }
3327
3328
3329 //! Appends a ustring16 and a std::basic_string.
3330 template <typename TAlloc, typename B, typename A, typename BAlloc>
3331 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3332 {
3333         ustring16<TAlloc> ret(left);
3334         ret += right;
3335         return ret;
3336 }
3337
3338
3339 //! Appends a ustring16 and a std::basic_string.
3340 template <typename TAlloc, typename B, typename A, typename BAlloc>
3341 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3342 {
3343         ustring16<TAlloc> ret(left);
3344         ret += right;
3345         return ret;
3346 }
3347
3348
3349 //! Appends a ustring16 and a char.
3350 template <typename TAlloc>
3351 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3352 {
3353         ustring16<TAlloc> ret(left);
3354         ret += right;
3355         return ret;
3356 }
3357
3358
3359 //! Appends a ustring16 and a char.
3360 template <typename TAlloc>
3361 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3362 {
3363         ustring16<TAlloc> ret(left);
3364         ret += right;
3365         return ret;
3366 }
3367
3368
3369 #ifdef USTRING_CPP0X_NEWLITERALS
3370 //! Appends a ustring16 and a uchar32_t.
3371 template <typename TAlloc>
3372 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
3373 {
3374         ustring16<TAlloc> ret(left);
3375         ret += right;
3376         return ret;
3377 }
3378
3379
3380 //! Appends a ustring16 and a uchar32_t.
3381 template <typename TAlloc>
3382 inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
3383 {
3384         ustring16<TAlloc> ret(left);
3385         ret += right;
3386         return ret;
3387 }
3388 #endif
3389
3390
3391 //! Appends a ustring16 and a short.
3392 template <typename TAlloc>
3393 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3394 {
3395         ustring16<TAlloc> ret(left);
3396         ret += core::stringc(right);
3397         return ret;
3398 }
3399
3400
3401 //! Appends a ustring16 and a short.
3402 template <typename TAlloc>
3403 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3404 {
3405         ustring16<TAlloc> ret((core::stringc(left)));
3406         ret += right;
3407         return ret;
3408 }
3409
3410
3411 //! Appends a ustring16 and an unsigned short.
3412 template <typename TAlloc>
3413 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3414 {
3415         ustring16<TAlloc> ret(left);
3416         ret += core::stringc(right);
3417         return ret;
3418 }
3419
3420
3421 //! Appends a ustring16 and an unsigned short.
3422 template <typename TAlloc>
3423 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3424 {
3425         ustring16<TAlloc> ret((core::stringc(left)));
3426         ret += right;
3427         return ret;
3428 }
3429
3430
3431 //! Appends a ustring16 and an int.
3432 template <typename TAlloc>
3433 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3434 {
3435         ustring16<TAlloc> ret(left);
3436         ret += core::stringc(right);
3437         return ret;
3438 }
3439
3440
3441 //! Appends a ustring16 and an int.
3442 template <typename TAlloc>
3443 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3444 {
3445         ustring16<TAlloc> ret((core::stringc(left)));
3446         ret += right;
3447         return ret;
3448 }
3449
3450
3451 //! Appends a ustring16 and an unsigned int.
3452 template <typename TAlloc>
3453 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3454 {
3455         ustring16<TAlloc> ret(left);
3456         ret += core::stringc(right);
3457         return ret;
3458 }
3459
3460
3461 //! Appends a ustring16 and an unsigned int.
3462 template <typename TAlloc>
3463 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3464 {
3465         ustring16<TAlloc> ret((core::stringc(left)));
3466         ret += right;
3467         return ret;
3468 }
3469
3470
3471 //! Appends a ustring16 and a long.
3472 template <typename TAlloc>
3473 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3474 {
3475         ustring16<TAlloc> ret(left);
3476         ret += core::stringc(right);
3477         return ret;
3478 }
3479
3480
3481 //! Appends a ustring16 and a long.
3482 template <typename TAlloc>
3483 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3484 {
3485         ustring16<TAlloc> ret((core::stringc(left)));
3486         ret += right;
3487         return ret;
3488 }
3489
3490
3491 //! Appends a ustring16 and an unsigned long.
3492 template <typename TAlloc>
3493 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3494 {
3495         ustring16<TAlloc> ret(left);
3496         ret += core::stringc(right);
3497         return ret;
3498 }
3499
3500
3501 //! Appends a ustring16 and an unsigned long.
3502 template <typename TAlloc>
3503 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3504 {
3505         ustring16<TAlloc> ret((core::stringc(left)));
3506         ret += right;
3507         return ret;
3508 }
3509
3510
3511 //! Appends a ustring16 and a float.
3512 template <typename TAlloc>
3513 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3514 {
3515         ustring16<TAlloc> ret(left);
3516         ret += core::stringc(right);
3517         return ret;
3518 }
3519
3520
3521 //! Appends a ustring16 and a float.
3522 template <typename TAlloc>
3523 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3524 {
3525         ustring16<TAlloc> ret((core::stringc(left)));
3526         ret += right;
3527         return ret;
3528 }
3529
3530
3531 //! Appends a ustring16 and a double.
3532 template <typename TAlloc>
3533 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3534 {
3535         ustring16<TAlloc> ret(left);
3536         ret += core::stringc(right);
3537         return ret;
3538 }
3539
3540
3541 //! Appends a ustring16 and a double.
3542 template <typename TAlloc>
3543 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3544 {
3545         ustring16<TAlloc> ret((core::stringc(left)));
3546         ret += right;
3547         return ret;
3548 }
3549
3550
3551 #ifdef USTRING_CPP0X
3552 //! Appends two ustring16s.
3553 template <typename TAlloc>
3554 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3555 {
3556         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3557         right.insert(left, 0);
3558         return std::move(right);
3559 }
3560
3561
3562 //! Appends two ustring16s.
3563 template <typename TAlloc>
3564 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3565 {
3566         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3567         left.append(right);
3568         return std::move(left);
3569 }
3570
3571
3572 //! Appends two ustring16s.
3573 template <typename TAlloc>
3574 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3575 {
3576         //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3577         if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3578                 (right.capacity() - right.size_raw() < left.size_raw()))
3579         {
3580                 left.append(right);
3581                 return std::move(left);
3582         }
3583         else
3584         {
3585                 right.insert(left, 0);
3586                 return std::move(right);
3587         }
3588 }
3589
3590
3591 //! Appends a ustring16 and a null-terminated unicode string.
3592 template <typename TAlloc, class B>
3593 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3594 {
3595         //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3596         left.append(right);
3597         return std::move(left);
3598 }
3599
3600
3601 //! Appends a ustring16 and a null-terminated unicode string.
3602 template <class B, typename TAlloc>
3603 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3604 {
3605         //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3606         right.insert(left, 0);
3607         return std::move(right);
3608 }
3609
3610
3611 //! Appends a ustring16 and an Irrlicht string.
3612 template <typename TAlloc, typename B, typename BAlloc>
3613 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3614 {
3615         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3616         right.insert(left, 0);
3617         return std::move(right);
3618 }
3619
3620
3621 //! Appends a ustring16 and an Irrlicht string.
3622 template <typename TAlloc, typename B, typename BAlloc>
3623 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3624 {
3625         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3626         left.append(right);
3627         return std::move(left);
3628 }
3629
3630
3631 //! Appends a ustring16 and a std::basic_string.
3632 template <typename TAlloc, typename B, typename A, typename BAlloc>
3633 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3634 {
3635         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3636         right.insert(core::ustring16<TAlloc>(left), 0);
3637         return std::move(right);
3638 }
3639
3640
3641 //! Appends a ustring16 and a std::basic_string.
3642 template <typename TAlloc, typename B, typename A, typename BAlloc>
3643 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3644 {
3645         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3646         left.append(right);
3647         return std::move(left);
3648 }
3649
3650
3651 //! Appends a ustring16 and a char.
3652 template <typename TAlloc>
3653 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3654 {
3655         left.append((uchar32_t)right);
3656         return std::move(left);
3657 }
3658
3659
3660 //! Appends a ustring16 and a char.
3661 template <typename TAlloc>
3662 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3663 {
3664         right.insert((uchar32_t)left, 0);
3665         return std::move(right);
3666 }
3667
3668
3669 #ifdef USTRING_CPP0X_NEWLITERALS
3670 //! Appends a ustring16 and a uchar32_t.
3671 template <typename TAlloc>
3672 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
3673 {
3674         left.append(right);
3675         return std::move(left);
3676 }
3677
3678
3679 //! Appends a ustring16 and a uchar32_t.
3680 template <typename TAlloc>
3681 inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
3682 {
3683         right.insert(left, 0);
3684         return std::move(right);
3685 }
3686 #endif
3687
3688
3689 //! Appends a ustring16 and a short.
3690 template <typename TAlloc>
3691 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3692 {
3693         left.append(core::stringc(right));
3694         return std::move(left);
3695 }
3696
3697
3698 //! Appends a ustring16 and a short.
3699 template <typename TAlloc>
3700 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3701 {
3702         right.insert(core::stringc(left), 0);
3703         return std::move(right);
3704 }
3705
3706
3707 //! Appends a ustring16 and an unsigned short.
3708 template <typename TAlloc>
3709 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3710 {
3711         left.append(core::stringc(right));
3712         return std::move(left);
3713 }
3714
3715
3716 //! Appends a ustring16 and an unsigned short.
3717 template <typename TAlloc>
3718 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3719 {
3720         right.insert(core::stringc(left), 0);
3721         return std::move(right);
3722 }
3723
3724
3725 //! Appends a ustring16 and an int.
3726 template <typename TAlloc>
3727 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3728 {
3729         left.append(core::stringc(right));
3730         return std::move(left);
3731 }
3732
3733
3734 //! Appends a ustring16 and an int.
3735 template <typename TAlloc>
3736 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3737 {
3738         right.insert(core::stringc(left), 0);
3739         return std::move(right);
3740 }
3741
3742
3743 //! Appends a ustring16 and an unsigned int.
3744 template <typename TAlloc>
3745 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3746 {
3747         left.append(core::stringc(right));
3748         return std::move(left);
3749 }
3750
3751
3752 //! Appends a ustring16 and an unsigned int.
3753 template <typename TAlloc>
3754 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3755 {
3756         right.insert(core::stringc(left), 0);
3757         return std::move(right);
3758 }
3759
3760
3761 //! Appends a ustring16 and a long.
3762 template <typename TAlloc>
3763 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3764 {
3765         left.append(core::stringc(right));
3766         return std::move(left);
3767 }
3768
3769
3770 //! Appends a ustring16 and a long.
3771 template <typename TAlloc>
3772 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3773 {
3774         right.insert(core::stringc(left), 0);
3775         return std::move(right);
3776 }
3777
3778
3779 //! Appends a ustring16 and an unsigned long.
3780 template <typename TAlloc>
3781 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3782 {
3783         left.append(core::stringc(right));
3784         return std::move(left);
3785 }
3786
3787
3788 //! Appends a ustring16 and an unsigned long.
3789 template <typename TAlloc>
3790 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3791 {
3792         right.insert(core::stringc(left), 0);
3793         return std::move(right);
3794 }
3795
3796
3797 //! Appends a ustring16 and a float.
3798 template <typename TAlloc>
3799 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3800 {
3801         left.append(core::stringc(right));
3802         return std::move(left);
3803 }
3804
3805
3806 //! Appends a ustring16 and a float.
3807 template <typename TAlloc>
3808 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3809 {
3810         right.insert(core::stringc(left), 0);
3811         return std::move(right);
3812 }
3813
3814
3815 //! Appends a ustring16 and a double.
3816 template <typename TAlloc>
3817 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3818 {
3819         left.append(core::stringc(right));
3820         return std::move(left);
3821 }
3822
3823
3824 //! Appends a ustring16 and a double.
3825 template <typename TAlloc>
3826 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3827 {
3828         right.insert(core::stringc(left), 0);
3829         return std::move(right);
3830 }
3831 #endif
3832
3833
3834 #ifndef USTRING_NO_STL
3835 //! Writes a ustring16 to an ostream.
3836 template <typename TAlloc>
3837 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3838 {
3839         out << in.toUTF8_s().c_str();
3840         return out;
3841 }
3842
3843 //! Writes a ustring16 to a wostream.
3844 template <typename TAlloc>
3845 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3846 {
3847         out << in.toWCHAR_s().c_str();
3848         return out;
3849 }
3850 #endif
3851
3852
3853 #ifndef USTRING_NO_STL
3854
3855 namespace unicode
3856 {
3857
3858 //! Hashing algorithm for hashing a ustring.  Used for things like unordered_maps.
3859 //! Algorithm taken from std::hash<std::string>.
3860 class hash : public std::unary_function<core::ustring, size_t>
3861 {
3862         public:
3863                 size_t operator()(const core::ustring& s) const
3864                 {
3865                         size_t ret = 2166136261U;
3866                         size_t index = 0;
3867                         size_t stride = 1 + s.size_raw() / 10;
3868
3869                         core::ustring::const_iterator i = s.begin();
3870                         while (i != s.end())
3871                         {
3872                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
3873                                 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3874                                 index += stride;
3875                                 i += stride;
3876                         }
3877                         return (ret);
3878                 }
3879 };
3880
3881 } // end namespace unicode
3882
3883 #endif
3884
3885 } // end namespace core
3886 } // end namespace irr
3887
3888 #endif