src/cguittfont/irrUString.h

   1 /*
   2    Basic Unicode string class for Irrlicht.
   3    Copyright (c) 2009-2011 John Norman
   4
   5    This software is provided 'as-is', without any express or implied
   6    warranty. In no event will the authors be held liable for any
   7    damages arising from the use of this software.
   8
   9    Permission is granted to anyone to use this software for any
  10    purpose, including commercial applications, and to alter it and
  11    redistribute it freely, subject to the following restrictions:
  12
  13    1. The origin of this software must not be misrepresented; you
  14       must not claim that you wrote the original software. If you use
  15       this software in a product, an acknowledgment in the product
  16       documentation would be appreciated but is not required.
  17
  18    2. Altered source versions must be plainly marked as such, and
  19       must not be misrepresented as being the original software.
  20
  21    3. This notice may not be removed or altered from any source
  22       distribution.
  23
  24    The original version of this class can be located at:
  25    http://irrlicht.suckerfreegames.com/
  26
  27    John Norman
  28    john@suckerfreegames.com
  29 */
  30
  31 #ifndef __IRR_USTRING_H_INCLUDED__
  32 #define __IRR_USTRING_H_INCLUDED__
  33
  34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
  35 #       define USTRING_CPP0X
  36 #       if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
  37 #               define USTRING_CPP0X_NEWLITERALS
  38 #       endif
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <stdlib.h>
  44
  45 #ifdef USTRING_CPP0X
  46 #       include <utility>
  47 #endif
  48
  49 #ifndef USTRING_NO_STL
  50 #       include <string>
  51 #       include <iterator>
  52 #       include <ostream>
  53 #endif
  54
  55 #include "irrTypes.h"
  56 #include "irrAllocator.h"
  57 #include "irrArray.h"
  58 #include "irrMath.h"
  59 #include "irrString.h"
  60 #include "path.h"
  61
  62 //! UTF-16 surrogate start values.
  63 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
  64 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
  65
  66 //! Is a UTF-16 code point a surrogate?
  67 #define UTF16_IS_SURROGATE(c)           (((c) & 0xF800) == 0xD800)
  68 #define UTF16_IS_SURROGATE_HI(c)        (((c) & 0xFC00) == 0xD800)
  69 #define UTF16_IS_SURROGATE_LO(c)        (((c) & 0xFC00) == 0xDC00)
  70
  71
  72 namespace irr
  73 {
  74
  75         // Define our character types.
  76 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
  77         typedef char32_t uchar32_t;
  78         typedef char16_t uchar16_t;
  79         typedef char uchar8_t;
  80 #else
  81         typedef u32 uchar32_t;
  82         typedef u16 uchar16_t;
  83         typedef u8 uchar8_t;
  84 #endif
  85
  86 namespace core
  87 {
  88
  89 namespace unicode
  90 {
  91
  92 //! The unicode replacement character.  Used to replace invalid characters.
  93 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
  94
  95 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
  96 //! \param high The high value of the pair.
  97 //! \param low The low value of the pair.
  98 //! \return The UTF-32 character expressed by the surrogate pair.
  99 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
 100 {
 101         // Convert the surrogate pair into a single UTF-32 character.
 102         uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
 103         uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
 104         return (wu << 16) | x;
 105 }
 106
 107 //! Swaps the endianness of a 16-bit value.
 108 //! \return The new value.
 109 inline uchar16_t swapEndian16(const uchar16_t& c)
 110 {
 111         return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
 112 }
 113
 114 //! Swaps the endianness of a 32-bit value.
 115 //! \return The new value.
 116 inline uchar32_t swapEndian32(const uchar32_t& c)
 117 {
 118         return  ((c >> 24) & 0x000000FF) |
 119                         ((c >> 8)  & 0x0000FF00) |
 120                         ((c << 8)  & 0x00FF0000) |
 121                         ((c << 24) & 0xFF000000);
 122 }
 123
 124 //! The Unicode byte order mark.
 125 const u16 BOM = 0xFEFF;
 126
 127 //! The size of the Unicode byte order mark in terms of the Unicode character size.
 128 const u8 BOM_UTF8_LEN = 3;
 129 const u8 BOM_UTF16_LEN = 1;
 130 const u8 BOM_UTF32_LEN = 1;
 131
 132 //! Unicode byte order marks for file operations.
 133 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
 134 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
 135 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
 136 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
 137 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
 138
 139 //! The size in bytes of the Unicode byte marks for file operations.
 140 const u8 BOM_ENCODE_UTF8_LEN = 3;
 141 const u8 BOM_ENCODE_UTF16_LEN = 2;
 142 const u8 BOM_ENCODE_UTF32_LEN = 4;
 143
 144 //! Unicode encoding type.
 145 enum EUTF_ENCODE
 146 {
 147         EUTFE_NONE              = 0,
 148         EUTFE_UTF8,
 149         EUTFE_UTF16,
 150         EUTFE_UTF16_LE,
 151         EUTFE_UTF16_BE,
 152         EUTFE_UTF32,
 153         EUTFE_UTF32_LE,
 154         EUTFE_UTF32_BE
 155 };
 156
 157 //! Unicode endianness.
 158 enum EUTF_ENDIAN
 159 {
 160         EUTFEE_NATIVE   = 0,
 161         EUTFEE_LITTLE,
 162         EUTFEE_BIG
 163 };
 164
 165 //! Returns the specified unicode byte order mark in a byte array.
 166 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
 167 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
 168                 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
 169 //! \return An array that contains a byte order mark.
 170 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
 171 {
 172 #define COPY_ARRAY(source, size) \
 173         memcpy(ret.pointer(), source, size); \
 174         ret.set_used(size)
 175
 176         core::array<u8> ret(4);
 177         switch (mode)
 178         {
 179                 case EUTFE_UTF8:
 180                         COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
 181                         break;
 182                 case EUTFE_UTF16:
 183                         #ifdef __BIG_ENDIAN__
 184                                 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 185                         #else
 186                                 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 187                         #endif
 188                         break;
 189                 case EUTFE_UTF16_BE:
 190                         COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 191                         break;
 192                 case EUTFE_UTF16_LE:
 193                         COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 194                         break;
 195                 case EUTFE_UTF32:
 196                         #ifdef __BIG_ENDIAN__
 197                                 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 198                         #else
 199                                 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 200                         #endif
 201                         break;
 202                 case EUTFE_UTF32_BE:
 203                         COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 204                         break;
 205                 case EUTFE_UTF32_LE:
 206                         COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 207                         break;
 208         }
 209         return ret;
 210
 211 #undef COPY_ARRAY
 212 }
 213
 214 //! Detects if the given data stream starts with a unicode BOM.
 215 //! \param data The data stream to check.
 216 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
 217 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
 218 {
 219         if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
 220         if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
 221         if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
 222         if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
 223         if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
 224         return EUTFE_NONE;
 225 }
 226
 227 } // end namespace unicode
 228
 229
 230 //! UTF-16 string class.
 231 template <typename TAlloc = irrAllocator<uchar16_t> >
 232 class ustring16
 233 {
 234 public:
 235
 236         ///------------------///
 237         /// iterator classes ///
 238         ///------------------///
 239
 240         //! Access an element in a unicode string, allowing one to change it.
 241         class _ustring16_iterator_access
 242         {
 243                 public:
 244                         _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
 245
 246                         //! Allow the class to be interpreted as a single UTF-32 character.
 247                         operator uchar32_t() const
 248                         {
 249                                 return _get();
 250                         }
 251
 252                         //! Allow one to change the character in the unicode string.
 253                         //! \param c The new character to use.
 254                         //! \return Myself.
 255                         _ustring16_iterator_access& operator=(const uchar32_t c)
 256                         {
 257                                 _set(c);
 258                                 return *this;
 259                         }
 260
 261                         //! Increments the value by 1.
 262                         //! \return Myself.
 263                         _ustring16_iterator_access& operator++()
 264                         {
 265                                 _set(_get() + 1);
 266                                 return *this;
 267                         }
 268
 269                         //! Increments the value by 1, returning the old value.
 270                         //! \return A unicode character.
 271                         uchar32_t operator++(int)
 272                         {
 273                                 uchar32_t old = _get();
 274                                 _set(old + 1);
 275                                 return old;
 276                         }
 277
 278                         //! Decrements the value by 1.
 279                         //! \return Myself.
 280                         _ustring16_iterator_access& operator--()
 281                         {
 282                                 _set(_get() - 1);
 283                                 return *this;
 284                         }
 285
 286                         //! Decrements the value by 1, returning the old value.
 287                         //! \return A unicode character.
 288                         uchar32_t operator--(int)
 289                         {
 290                                 uchar32_t old = _get();
 291                                 _set(old - 1);
 292                                 return old;
 293                         }
 294
 295                         //! Adds to the value by a specified amount.
 296                         //! \param val The amount to add to this character.
 297                         //! \return Myself.
 298                         _ustring16_iterator_access& operator+=(int val)
 299                         {
 300                                 _set(_get() + val);
 301                                 return *this;
 302                         }
 303
 304                         //! Subtracts from the value by a specified amount.
 305                         //! \param val The amount to subtract from this character.
 306                         //! \return Myself.
 307                         _ustring16_iterator_access& operator-=(int val)
 308                         {
 309                                 _set(_get() - val);
 310                                 return *this;
 311                         }
 312
 313                         //! Multiples the value by a specified amount.
 314                         //! \param val The amount to multiply this character by.
 315                         //! \return Myself.
 316                         _ustring16_iterator_access& operator*=(int val)
 317                         {
 318                                 _set(_get() * val);
 319                                 return *this;
 320                         }
 321
 322                         //! Divides the value by a specified amount.
 323                         //! \param val The amount to divide this character by.
 324                         //! \return Myself.
 325                         _ustring16_iterator_access& operator/=(int val)
 326                         {
 327                                 _set(_get() / val);
 328                                 return *this;
 329                         }
 330
 331                         //! Modulos the value by a specified amount.
 332                         //! \param val The amount to modulo this character by.
 333                         //! \return Myself.
 334                         _ustring16_iterator_access& operator%=(int val)
 335                         {
 336                                 _set(_get() % val);
 337                                 return *this;
 338                         }
 339
 340                         //! Adds to the value by a specified amount.
 341                         //! \param val The amount to add to this character.
 342                         //! \return A unicode character.
 343                         uchar32_t operator+(int val) const
 344                         {
 345                                 return _get() + val;
 346                         }
 347
 348                         //! Subtracts from the value by a specified amount.
 349                         //! \param val The amount to subtract from this character.
 350                         //! \return A unicode character.
 351                         uchar32_t operator-(int val) const
 352                         {
 353                                 return _get() - val;
 354                         }
 355
 356                         //! Multiplies the value by a specified amount.
 357                         //! \param val The amount to multiply this character by.
 358                         //! \return A unicode character.
 359                         uchar32_t operator*(int val) const
 360                         {
 361                                 return _get() * val;
 362                         }
 363
 364                         //! Divides the value by a specified amount.
 365                         //! \param val The amount to divide this character by.
 366                         //! \return A unicode character.
 367                         uchar32_t operator/(int val) const
 368                         {
 369                                 return _get() / val;
 370                         }
 371
 372                         //! Modulos the value by a specified amount.
 373                         //! \param val The amount to modulo this character by.
 374                         //! \return A unicode character.
 375                         uchar32_t operator%(int val) const
 376                         {
 377                                 return _get() % val;
 378                         }
 379
 380                 private:
 381                         //! Gets a uchar32_t from our current position.
 382                         uchar32_t _get() const
 383                         {
 384                                 const uchar16_t* a = ref->c_str();
 385                                 if (!UTF16_IS_SURROGATE(a[pos]))
 386                                         return static_cast<uchar32_t>(a[pos]);
 387                                 else
 388                                 {
 389                                         if (pos + 1 >= ref->size_raw())
 390                                                 return 0;
 391
 392                                         return unicode::toUTF32(a[pos], a[pos + 1]);
 393                                 }
 394                         }
 395
 396                         //! Sets a uchar32_t at our current position.
 397                         void _set(uchar32_t c)
 398                         {
 399                                 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
 400                                 const uchar16_t* a = ref2->c_str();
 401                                 if (c > 0xFFFF)
 402                                 {
 403                                         // c will be multibyte, so split it up into the high and low surrogate pairs.
 404                                         uchar16_t x = static_cast<uchar16_t>(c);
 405                                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
 406                                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
 407
 408                                         // If the previous position was a surrogate pair, just replace them.  Else, insert the low pair.
 409                                         if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
 410                                                 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
 411                                         else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
 412
 413                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 414                                 }
 415                                 else
 416                                 {
 417                                         // c will be a single byte.
 418                                         uchar16_t vh = static_cast<uchar16_t>(c);
 419
 420                                         // If the previous position was a surrogate pair, remove the extra byte.
 421                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 422                                                 ref2->erase_raw(static_cast<u32>(pos) + 1);
 423
 424                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 425                                 }
 426                         }
 427
 428                         const ustring16<TAlloc>* ref;
 429                         u32 pos;
 430         };
 431         typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
 432
 433
 434         //! Iterator to iterate through a UTF-16 string.
 435 #ifndef USTRING_NO_STL
 436         class _ustring16_const_iterator : public std::iterator<
 437                 std::bidirectional_iterator_tag,        // iterator_category
 438                 access,                                                         // value_type
 439                 ptrdiff_t,                                                      // difference_type
 440                 const access,                                           // pointer
 441                 const access                                            // reference
 442         >
 443 #else
 444         class _ustring16_const_iterator
 445 #endif
 446         {
 447                 public:
 448                         typedef _ustring16_const_iterator _Iter;
 449                         typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
 450                         typedef const access const_pointer;
 451                         typedef const access const_reference;
 452
 453 #ifndef USTRING_NO_STL
 454                         typedef typename _Base::value_type value_type;
 455                         typedef typename _Base::difference_type difference_type;
 456                         typedef typename _Base::difference_type distance_type;
 457                         typedef typename _Base::pointer pointer;
 458                         typedef const_reference reference;
 459 #else
 460                         typedef access value_type;
 461                         typedef u32 difference_type;
 462                         typedef u32 distance_type;
 463                         typedef const_pointer pointer;
 464                         typedef const_reference reference;
 465 #endif
 466
 467                         //! Constructors.
 468                         _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
 469                         _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
 470                         _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
 471                         {
 472                                 if (ref->size_raw() == 0 || p == 0)
 473                                         return;
 474
 475                                 // Go to the appropriate position.
 476                                 u32 i = p;
 477                                 u32 sr = ref->size_raw();
 478                                 const uchar16_t* a = ref->c_str();
 479                                 while (i != 0 && pos < sr)
 480                                 {
 481                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 482                                                 pos += 2;
 483                                         else ++pos;
 484                                         --i;
 485                                 }
 486                         }
 487
 488                         //! Test for equalness.
 489                         bool operator==(const _Iter& iter) const
 490                         {
 491                                 if (ref == iter.ref && pos == iter.pos)
 492                                         return true;
 493                                 return false;
 494                         }
 495
 496                         //! Test for unequalness.
 497                         bool operator!=(const _Iter& iter) const
 498                         {
 499                                 if (ref != iter.ref || pos != iter.pos)
 500                                         return true;
 501                                 return false;
 502                         }
 503
 504                         //! Switch to the next full character in the string.
 505                         _Iter& operator++()
 506                         {       // ++iterator
 507                                 if (pos == ref->size_raw()) return *this;
 508                                 const uchar16_t* a = ref->c_str();
 509                                 if (UTF16_IS_SURROGATE_HI(a[pos]))
 510                                         pos += 2;                       // TODO: check for valid low surrogate?
 511                                 else ++pos;
 512                                 if (pos > ref->size_raw()) pos = ref->size_raw();
 513                                 return *this;
 514                         }
 515
 516                         //! Switch to the next full character in the string, returning the previous position.
 517                         _Iter operator++(int)
 518                         {       // iterator++
 519                                 _Iter _tmp(*this);
 520                                 ++*this;
 521                                 return _tmp;
 522                         }
 523
 524                         //! Switch to the previous full character in the string.
 525                         _Iter& operator--()
 526                         {       // --iterator
 527                                 if (pos == 0) return *this;
 528                                 const uchar16_t* a = ref->c_str();
 529                                 --pos;
 530                                 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0)  // low surrogate, go back one more.
 531                                         --pos;
 532                                 return *this;
 533                         }
 534
 535                         //! Switch to the previous full character in the string, returning the previous position.
 536                         _Iter operator--(int)
 537                         {       // iterator--
 538                                 _Iter _tmp(*this);
 539                                 --*this;
 540                                 return _tmp;
 541                         }
 542
 543                         //! Advance a specified number of full characters in the string.
 544                         //! \return Myself.
 545                         _Iter& operator+=(const difference_type v)
 546                         {
 547                                 if (v == 0) return *this;
 548                                 if (v < 0) return operator-=(v * -1);
 549
 550                                 if (pos >= ref->size_raw())
 551                                         return *this;
 552
 553                                 // Go to the appropriate position.
 554                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 555                                 u32 i = (u32)v;
 556                                 u32 sr = ref->size_raw();
 557                                 const uchar16_t* a = ref->c_str();
 558                                 while (i != 0 && pos < sr)
 559                                 {
 560                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 561                                                 pos += 2;
 562                                         else ++pos;
 563                                         --i;
 564                                 }
 565                                 if (pos > sr)
 566                                         pos = sr;
 567
 568                                 return *this;
 569                         }
 570
 571                         //! Go back a specified number of full characters in the string.
 572                         //! \return Myself.
 573                         _Iter& operator-=(const difference_type v)
 574                         {
 575                                 if (v == 0) return *this;
 576                                 if (v > 0) return operator+=(v * -1);
 577
 578                                 if (pos == 0)
 579                                         return *this;
 580
 581                                 // Go to the appropriate position.
 582                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 583                                 u32 i = (u32)v;
 584                                 const uchar16_t* a = ref->c_str();
 585                                 while (i != 0 && pos != 0)
 586                                 {
 587                                         --pos;
 588                                         if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
 589                                                 --pos;
 590                                         --i;
 591                                 }
 592
 593                                 return *this;
 594                         }
 595
 596                         //! Return a new iterator that is a variable number of full characters forward from the current position.
 597                         _Iter operator+(const difference_type v) const
 598                         {
 599                                 _Iter ret(*this);
 600                                 ret += v;
 601                                 return ret;
 602                         }
 603
 604                         //! Return a new iterator that is a variable number of full characters backward from the current position.
 605                         _Iter operator-(const difference_type v) const
 606                         {
 607                                 _Iter ret(*this);
 608                                 ret -= v;
 609                                 return ret;
 610                         }
 611
 612                         //! Returns the distance between two iterators.
 613                         difference_type operator-(const _Iter& iter) const
 614                         {
 615                                 // Make sure we reference the same object!
 616                                 if (ref != iter.ref)
 617                                         return difference_type();
 618
 619                                 _Iter i = iter;
 620                                 difference_type ret;
 621
 622                                 // Walk up.
 623                                 if (pos > i.pos)
 624                                 {
 625                                         while (pos > i.pos)
 626                                         {
 627                                                 ++i;
 628                                                 ++ret;
 629                                         }
 630                                         return ret;
 631                                 }
 632
 633                                 // Walk down.
 634                                 while (pos < i.pos)
 635                                 {
 636                                         --i;
 637                                         --ret;
 638                                 }
 639                                 return ret;
 640                         }
 641
 642                         //! Accesses the full character at the iterator's position.
 643                         const_reference operator*() const
 644                         {
 645                                 if (pos >= ref->size_raw())
 646                                 {
 647                                         const uchar16_t* a = ref->c_str();
 648                                         u32 p = ref->size_raw();
 649                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 650                                                 --p;
 651                                         reference ret(ref, p);
 652                                         return ret;
 653                                 }
 654                                 const_reference ret(ref, pos);
 655                                 return ret;
 656                         }
 657
 658                         //! Accesses the full character at the iterator's position.
 659                         reference operator*()
 660                         {
 661                                 if (pos >= ref->size_raw())
 662                                 {
 663                                         const uchar16_t* a = ref->c_str();
 664                                         u32 p = ref->size_raw();
 665                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 666                                                 --p;
 667                                         reference ret(ref, p);
 668                                         return ret;
 669                                 }
 670                                 reference ret(ref, pos);
 671                                 return ret;
 672                         }
 673
 674                         //! Accesses the full character at the iterator's position.
 675                         const_pointer operator->() const
 676                         {
 677                                 return operator*();
 678                         }
 679
 680                         //! Accesses the full character at the iterator's position.
 681                         pointer operator->()
 682                         {
 683                                 return operator*();
 684                         }
 685
 686                         //! Is the iterator at the start of the string?
 687                         bool atStart() const
 688                         {
 689                                 return pos == 0;
 690                         }
 691
 692                         //! Is the iterator at the end of the string?
 693                         bool atEnd() const
 694                         {
 695                                 const uchar16_t* a = ref->c_str();
 696                                 if (UTF16_IS_SURROGATE(a[pos]))
 697                                         return (pos + 1) >= ref->size_raw();
 698                                 else return pos >= ref->size_raw();
 699                         }
 700
 701                         //! Moves the iterator to the start of the string.
 702                         void toStart()
 703                         {
 704                                 pos = 0;
 705                         }
 706
 707                         //! Moves the iterator to the end of the string.
 708                         void toEnd()
 709                         {
 710                                 const uchar16_t* a = ref->c_str();
 711                                 pos = ref->size_raw();
 712                         }
 713
 714                         //! Returns the iterator's position.
 715                         //! \return The iterator's position.
 716                         u32 getPos() const
 717                         {
 718                                 return pos;
 719                         }
 720
 721                 protected:
 722                         const ustring16<TAlloc>* ref;
 723                         u32 pos;
 724         };
 725
 726         //! Iterator to iterate through a UTF-16 string.
 727         class _ustring16_iterator : public _ustring16_const_iterator
 728         {
 729                 public:
 730                         typedef _ustring16_iterator _Iter;
 731                         typedef _ustring16_const_iterator _Base;
 732                         typedef typename _Base::const_pointer const_pointer;
 733                         typedef typename _Base::const_reference const_reference;
 734
 735                         typedef typename _Base::value_type value_type;
 736                         typedef typename _Base::difference_type difference_type;
 737                         typedef typename _Base::distance_type distance_type;
 738                         typedef access pointer;
 739                         typedef access reference;
 740
 741                         using _Base::pos;
 742                         using _Base::ref;
 743
 744                         //! Constructors.
 745                         _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
 746                         _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
 747                         _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
 748
 749                         //! Accesses the full character at the iterator's position.
 750                         reference operator*() const
 751                         {
 752                                 if (pos >= ref->size_raw())
 753                                 {
 754                                         const uchar16_t* a = ref->c_str();
 755                                         u32 p = ref->size_raw();
 756                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 757                                                 --p;
 758                                         reference ret(ref, p);
 759                                         return ret;
 760                                 }
 761                                 reference ret(ref, pos);
 762                                 return ret;
 763                         }
 764
 765                         //! Accesses the full character at the iterator's position.
 766                         reference operator*()
 767                         {
 768                                 if (pos >= ref->size_raw())
 769                                 {
 770                                         const uchar16_t* a = ref->c_str();
 771                                         u32 p = ref->size_raw();
 772                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 773                                                 --p;
 774                                         reference ret(ref, p);
 775                                         return ret;
 776                                 }
 777                                 reference ret(ref, pos);
 778                                 return ret;
 779                         }
 780
 781                         //! Accesses the full character at the iterator's position.
 782                         pointer operator->() const
 783                         {
 784                                 return operator*();
 785                         }
 786
 787                         //! Accesses the full character at the iterator's position.
 788                         pointer operator->()
 789                         {
 790                                 return operator*();
 791                         }
 792         };
 793
        //! Public iterator aliases: `iterator` names the mutable iterator class and
        //! `const_iterator` names the const iterator class it derives from.
        typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
        typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
 796
 797         ///----------------------///
 798         /// end iterator classes ///
 799         ///----------------------///
 800
 801         //! Default constructor
 802         ustring16()
 803         : array(0), allocated(1), used(0)
 804         {
 805 #if __BIG_ENDIAN__
 806                 encoding = unicode::EUTFE_UTF16_BE;
 807 #else
 808                 encoding = unicode::EUTFE_UTF16_LE;
 809 #endif
 810                 array = allocator.allocate(1); // new u16[1];
 811                 array[0] = 0x0;
 812         }
 813
 814
 815         //! Constructor
 816         ustring16(const ustring16<TAlloc>& other)
 817         : array(0), allocated(0), used(0)
 818         {
 819 #if __BIG_ENDIAN__
 820                 encoding = unicode::EUTFE_UTF16_BE;
 821 #else
 822                 encoding = unicode::EUTFE_UTF16_LE;
 823 #endif
 824                 *this = other;
 825         }
 826
 827
 828         //! Constructor from other string types
 829         template <class B, class A>
 830         ustring16(const string<B, A>& other)
 831         : array(0), allocated(0), used(0)
 832         {
 833 #if __BIG_ENDIAN__
 834                 encoding = unicode::EUTFE_UTF16_BE;
 835 #else
 836                 encoding = unicode::EUTFE_UTF16_LE;
 837 #endif
 838                 *this = other;
 839         }
 840
 841
 842 #ifndef USTRING_NO_STL
 843         //! Constructor from std::string
 844         template <class B, class A, typename Alloc>
 845         ustring16(const std::basic_string<B, A, Alloc>& other)
 846         : array(0), allocated(0), used(0)
 847         {
 848 #if __BIG_ENDIAN__
 849                 encoding = unicode::EUTFE_UTF16_BE;
 850 #else
 851                 encoding = unicode::EUTFE_UTF16_LE;
 852 #endif
 853                 *this = other.c_str();
 854         }
 855
 856
 857         //! Constructor from iterator.
 858         template <typename Itr>
 859         ustring16(Itr first, Itr last)
 860         : array(0), allocated(0), used(0)
 861         {
 862 #if __BIG_ENDIAN__
 863                 encoding = unicode::EUTFE_UTF16_BE;
 864 #else
 865                 encoding = unicode::EUTFE_UTF16_LE;
 866 #endif
 867                 reserve(std::distance(first, last));
 868                 array[used] = 0;
 869
 870                 for (; first != last; ++first)
 871                         append((uchar32_t)*first);
 872         }
 873 #endif
 874
 875
 876 #ifndef USTRING_CPP0X_NEWLITERALS
 877         //! Constructor for copying a character string from a pointer.
 878         ustring16(const char* const c)
 879         : array(0), allocated(0), used(0)
 880         {
 881 #if __BIG_ENDIAN__
 882                 encoding = unicode::EUTFE_UTF16_BE;
 883 #else
 884                 encoding = unicode::EUTFE_UTF16_LE;
 885 #endif
 886
 887                 loadDataStream(c, strlen(c));
 888                 //append((uchar8_t*)c);
 889         }
 890
 891
 892         //! Constructor for copying a character string from a pointer with a given length.
 893         ustring16(const char* const c, u32 length)
 894         : array(0), allocated(0), used(0)
 895         {
 896 #if __BIG_ENDIAN__
 897                 encoding = unicode::EUTFE_UTF16_BE;
 898 #else
 899                 encoding = unicode::EUTFE_UTF16_LE;
 900 #endif
 901
 902                 loadDataStream(c, length);
 903         }
 904 #endif
 905
 906
 907         //! Constructor for copying a UTF-8 string from a pointer.
 908         ustring16(const uchar8_t* const c)
 909         : array(0), allocated(0), used(0)
 910         {
 911 #if __BIG_ENDIAN__
 912                 encoding = unicode::EUTFE_UTF16_BE;
 913 #else
 914                 encoding = unicode::EUTFE_UTF16_LE;
 915 #endif
 916
 917                 append(c);
 918         }
 919
 920
 921         //! Constructor for copying a UTF-8 string from a single char.
 922         ustring16(const char c)
 923         : array(0), allocated(0), used(0)
 924         {
 925 #if __BIG_ENDIAN__
 926                 encoding = unicode::EUTFE_UTF16_BE;
 927 #else
 928                 encoding = unicode::EUTFE_UTF16_LE;
 929 #endif
 930
 931                 append((uchar32_t)c);
 932         }
 933
 934
 935         //! Constructor for copying a UTF-8 string from a pointer with a given length.
 936         ustring16(const uchar8_t* const c, u32 length)
 937         : array(0), allocated(0), used(0)
 938         {
 939 #if __BIG_ENDIAN__
 940                 encoding = unicode::EUTFE_UTF16_BE;
 941 #else
 942                 encoding = unicode::EUTFE_UTF16_LE;
 943 #endif
 944
 945                 append(c, length);
 946         }
 947
 948
 949         //! Constructor for copying a UTF-16 string from a pointer.
 950         ustring16(const uchar16_t* const c)
 951         : array(0), allocated(0), used(0)
 952         {
 953 #if __BIG_ENDIAN__
 954                 encoding = unicode::EUTFE_UTF16_BE;
 955 #else
 956                 encoding = unicode::EUTFE_UTF16_LE;
 957 #endif
 958
 959                 append(c);
 960         }
 961
 962
 963         //! Constructor for copying a UTF-16 string from a pointer with a given length
 964         ustring16(const uchar16_t* const c, u32 length)
 965         : array(0), allocated(0), used(0)
 966         {
 967 #if __BIG_ENDIAN__
 968                 encoding = unicode::EUTFE_UTF16_BE;
 969 #else
 970                 encoding = unicode::EUTFE_UTF16_LE;
 971 #endif
 972
 973                 append(c, length);
 974         }
 975
 976
 977         //! Constructor for copying a UTF-32 string from a pointer.
 978         ustring16(const uchar32_t* const c)
 979         : array(0), allocated(0), used(0)
 980         {
 981 #if __BIG_ENDIAN__
 982                 encoding = unicode::EUTFE_UTF16_BE;
 983 #else
 984                 encoding = unicode::EUTFE_UTF16_LE;
 985 #endif
 986
 987                 append(c);
 988         }
 989
 990
 991         //! Constructor for copying a UTF-32 from a pointer with a given length.
 992         ustring16(const uchar32_t* const c, u32 length)
 993         : array(0), allocated(0), used(0)
 994         {
 995 #if __BIG_ENDIAN__
 996                 encoding = unicode::EUTFE_UTF16_BE;
 997 #else
 998                 encoding = unicode::EUTFE_UTF16_LE;
 999 #endif
1000
1001                 append(c, length);
1002         }
1003
1004
1005         //! Constructor for copying a wchar_t string from a pointer.
1006         ustring16(const wchar_t* const c)
1007         : array(0), allocated(0), used(0)
1008         {
1009 #if __BIG_ENDIAN__
1010                 encoding = unicode::EUTFE_UTF16_BE;
1011 #else
1012                 encoding = unicode::EUTFE_UTF16_LE;
1013 #endif
1014
1015                 if (sizeof(wchar_t) == 4)
1016                         append(reinterpret_cast<const uchar32_t* const>(c));
1017                 else if (sizeof(wchar_t) == 2)
1018                         append(reinterpret_cast<const uchar16_t* const>(c));
1019                 else if (sizeof(wchar_t) == 1)
1020                         append(reinterpret_cast<const uchar8_t* const>(c));
1021         }
1022
1023
1024         //! Constructor for copying a wchar_t string from a pointer with a given length.
1025         ustring16(const wchar_t* const c, u32 length)
1026         : array(0), allocated(0), used(0)
1027         {
1028 #if __BIG_ENDIAN__
1029                 encoding = unicode::EUTFE_UTF16_BE;
1030 #else
1031                 encoding = unicode::EUTFE_UTF16_LE;
1032 #endif
1033
1034                 if (sizeof(wchar_t) == 4)
1035                         append(reinterpret_cast<const uchar32_t* const>(c), length);
1036                 else if (sizeof(wchar_t) == 2)
1037                         append(reinterpret_cast<const uchar16_t* const>(c), length);
1038                 else if (sizeof(wchar_t) == 1)
1039                         append(reinterpret_cast<const uchar8_t* const>(c), length);
1040         }
1041
1042
1043 #ifdef USTRING_CPP0X
1044         //! Constructor for moving a ustring16
1045         ustring16(ustring16<TAlloc>&& other)
1046         : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1047         {
1048                 //std::cout << "MOVE constructor" << std::endl;
1049                 other.array = 0;
1050                 other.allocated = 0;
1051                 other.used = 0;
1052         }
1053 #endif
1054
1055
        //! Destructor
        /** Hands the string's buffer back to the allocator. **/
        ~ustring16()
        {
                allocator.deallocate(array); // delete [] array;
        }
1061
1062
1063         //! Assignment operator
1064         ustring16& operator=(const ustring16<TAlloc>& other)
1065         {
1066                 if (this == &other)
1067                         return *this;
1068
1069                 used = other.size_raw();
1070                 if (used >= allocated)
1071                 {
1072                         allocator.deallocate(array); // delete [] array;
1073                         allocated = used + 1;
1074                         array = allocator.allocate(used + 1); //new u16[used];
1075                 }
1076
1077                 const uchar16_t* p = other.c_str();
1078                 for (u32 i=0; i<=used; ++i, ++p)
1079                         array[i] = *p;
1080
1081                 array[used] = 0;
1082
1083                 // Validate our new UTF-16 string.
1084                 validate();
1085
1086                 return *this;
1087         }
1088
1089
1090 #ifdef USTRING_CPP0X
1091         //! Move assignment operator
1092         ustring16& operator=(ustring16<TAlloc>&& other)
1093         {
1094                 if (this != &other)
1095                 {
1096                         //std::cout << "MOVE operator=" << std::endl;
1097                         allocator.deallocate(array);
1098
1099                         array = other.array;
1100                         allocated = other.allocated;
1101                         encoding = other.encoding;
1102                         used = other.used;
1103                         other.array = 0;
1104                         other.used = 0;
1105                 }
1106                 return *this;
1107         }
1108 #endif
1109
1110
1111         //! Assignment operator for other string types
1112         template <class B, class A>
1113         ustring16<TAlloc>& operator=(const string<B, A>& other)
1114         {
1115                 *this = other.c_str();
1116                 return *this;
1117         }
1118
1119
1120         //! Assignment operator for UTF-8 strings
1121         ustring16<TAlloc>& operator=(const uchar8_t* const c)
1122         {
1123                 if (!array)
1124                 {
1125                         array = allocator.allocate(1); //new u16[1];
1126                         allocated = 1;
1127                 }
1128                 used = 0;
1129                 array[used] = 0x0;
1130                 if (!c) return *this;
1131
1132                 //! Append our string now.
1133                 append(c);
1134                 return *this;
1135         }
1136
1137
1138         //! Assignment operator for UTF-16 strings
1139         ustring16<TAlloc>& operator=(const uchar16_t* const c)
1140         {
1141                 if (!array)
1142                 {
1143                         array = allocator.allocate(1); //new u16[1];
1144                         allocated = 1;
1145                 }
1146                 used = 0;
1147                 array[used] = 0x0;
1148                 if (!c) return *this;
1149
1150                 //! Append our string now.
1151                 append(c);
1152                 return *this;
1153         }
1154
1155
1156         //! Assignment operator for UTF-32 strings
1157         ustring16<TAlloc>& operator=(const uchar32_t* const c)
1158         {
1159                 if (!array)
1160                 {
1161                         array = allocator.allocate(1); //new u16[1];
1162                         allocated = 1;
1163                 }
1164                 used = 0;
1165                 array[used] = 0x0;
1166                 if (!c) return *this;
1167
1168                 //! Append our string now.
1169                 append(c);
1170                 return *this;
1171         }
1172
1173
1174         //! Assignment operator for wchar_t strings.
1175         /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1176                 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1177                 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1178         ustring16<TAlloc>& operator=(const wchar_t* const c)
1179         {
1180                 if (sizeof(wchar_t) == 4)
1181                         *this = reinterpret_cast<const uchar32_t* const>(c);
1182                 else if (sizeof(wchar_t) == 2)
1183                         *this = reinterpret_cast<const uchar16_t* const>(c);
1184                 else if (sizeof(wchar_t) == 1)
1185                         *this = reinterpret_cast<const uchar8_t* const>(c);
1186
1187                 return *this;
1188         }
1189
1190
1191         //! Assignment operator for other strings.
1192         /** Note that this assumes that a correct unicode string is stored in the string. **/
1193         template <class B>
1194         ustring16<TAlloc>& operator=(const B* const c)
1195         {
1196                 if (sizeof(B) == 4)
1197                         *this = reinterpret_cast<const uchar32_t* const>(c);
1198                 else if (sizeof(B) == 2)
1199                         *this = reinterpret_cast<const uchar16_t* const>(c);
1200                 else if (sizeof(B) == 1)
1201                         *this = reinterpret_cast<const uchar8_t* const>(c);
1202
1203                 return *this;
1204         }
1205
1206
1207         //! Direct access operator
1208         access operator [](const u32 index)
1209         {
1210                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1211                 iterator iter(*this, index);
1212                 return iter.operator*();
1213         }
1214
1215
1216         //! Direct access operator
1217         const access operator [](const u32 index) const
1218         {
1219                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1220                 const_iterator iter(*this, index);
1221                 return iter.operator*();
1222         }
1223
1224
1225         //! Equality operator
1226         bool operator ==(const uchar16_t* const str) const
1227         {
1228                 if (!str)
1229                         return false;
1230
1231                 u32 i;
1232                 for(i=0; array[i] && str[i]; ++i)
1233                         if (array[i] != str[i])
1234                                 return false;
1235
1236                 return !array[i] && !str[i];
1237         }
1238
1239
1240         //! Equality operator
1241         bool operator ==(const ustring16<TAlloc>& other) const
1242         {
1243                 for(u32 i=0; array[i] && other.array[i]; ++i)
1244                         if (array[i] != other.array[i])
1245                                 return false;
1246
1247                 return used == other.used;
1248         }
1249
1250
1251         //! Is smaller comparator
1252         bool operator <(const ustring16<TAlloc>& other) const
1253         {
1254                 for(u32 i=0; array[i] && other.array[i]; ++i)
1255                 {
1256                         s32 diff = array[i] - other.array[i];
1257                         if ( diff )
1258                                 return diff < 0;
1259                 }
1260
1261                 return used < other.used;
1262         }
1263
1264
1265         //! Inequality operator
1266         bool operator !=(const uchar16_t* const str) const
1267         {
1268                 return !(*this == str);
1269         }
1270
1271
1272         //! Inequality operator
1273         bool operator !=(const ustring16<TAlloc>& other) const
1274         {
1275                 return !(*this == other);
1276         }
1277
1278
1279         //! Returns the length of a ustring16 in full characters.
1280         //! \return Length of a ustring16 in full characters.
1281         u32 size() const
1282         {
1283                 const_iterator i(*this, 0);
1284                 u32 pos = 0;
1285                 while (!i.atEnd())
1286                 {
1287                         ++i;
1288                         ++pos;
1289                 }
1290                 return pos;
1291         }
1292
1293
1294         //! Informs if the ustring is empty or not.
1295         //! \return True if the ustring is empty, false if not.
1296         bool empty() const
1297         {
1298                 return (size_raw() == 0);
1299         }
1300
1301
1302         //! Returns a pointer to the raw UTF-16 string data.
1303         //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1304         const uchar16_t* c_str() const
1305         {
1306                 return array;
1307         }
1308
1309
1310         //! Compares the first n characters of this string with another.
1311         //! \param other Other string to compare to.
1312         //! \param n Number of characters to compare.
1313         //! \return True if the n first characters of both strings are equal.
1314         bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1315         {
1316                 u32 i;
1317                 const uchar16_t* oa = other.c_str();
1318                 for(i=0; array[i] && oa[i] && i < n; ++i)
1319                         if (array[i] != oa[i])
1320                                 return false;
1321
1322                 // if one (or both) of the strings was smaller then they
1323                 // are only equal if they have the same length
1324                 return (i == n) || (used == other.used);
1325         }
1326
1327
1328         //! Compares the first n characters of this string with another.
1329         //! \param str Other string to compare to.
1330         //! \param n Number of characters to compare.
1331         //! \return True if the n first characters of both strings are equal.
1332         bool equalsn(const uchar16_t* const str, u32 n) const
1333         {
1334                 if (!str)
1335                         return false;
1336                 u32 i;
1337                 for(i=0; array[i] && str[i] && i < n; ++i)
1338                         if (array[i] != str[i])
1339                                 return false;
1340
1341                 // if one (or both) of the strings was smaller then they
1342                 // are only equal if they have the same length
1343                 return (i == n) || (array[i] == 0 && str[i] == 0);
1344         }
1345
1346
1347         //! Appends a character to this ustring16
1348         //! \param character The character to append.
1349         //! \return A reference to our current string.
1350         ustring16<TAlloc>& append(uchar32_t character)
1351         {
1352                 if (used + 2 >= allocated)
1353                         reallocate(used + 2);
1354
1355                 if (character > 0xFFFF)
1356                 {
1357                         used += 2;
1358
1359                         // character will be multibyte, so split it up into a surrogate pair.
1360                         uchar16_t x = static_cast<uchar16_t>(character);
1361                         uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1362                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1363                         array[used-2] = vh;
1364                         array[used-1] = vl;
1365                 }
1366                 else
1367                 {
1368                         ++used;
1369                         array[used-1] = character;
1370                 }
1371                 array[used] = 0;
1372
1373                 return *this;
1374         }
1375
1376
1377         //! Appends a UTF-8 string to this ustring16
1378         //! \param other The UTF-8 string to append.
1379         //! \param length The length of the string to append.
1380         //! \return A reference to our current string.
1381         ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1382         {
1383                 if (!other)
1384                         return *this;
1385
1386                 // Determine if the string is long enough for a BOM.
1387                 u32 len = 0;
1388                 const uchar8_t* p = other;
1389                 do
1390                 {
1391                         ++len;
1392                 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1393
1394                 // Check for BOM.
1395                 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1396                 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1397                 {
1398                         if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1399                                 c_bom = unicode::EUTFE_UTF8;
1400                 }
1401
1402                 // If a BOM was found, don't include it in the string.
1403                 const uchar8_t* c2 = other;
1404                 if (c_bom != unicode::EUTFE_NONE)
1405                 {
1406                         c2 = other + unicode::BOM_UTF8_LEN;
1407                         length -= unicode::BOM_UTF8_LEN;
1408                 }
1409
1410                 // Calculate the size of the string to read in.
1411                 len = 0;
1412                 p = c2;
1413                 do
1414                 {
1415                         ++len;
1416                 } while(*p++ && len < length);
1417                 if (len > length)
1418                         len = length;
1419
1420                 // If we need to grow the array, do it now.
1421                 if (used + len >= allocated)
1422                         reallocate(used + (len * 2));
1423                 u32 start = used;
1424
1425                 // Convert UTF-8 to UTF-16.
1426                 u32 pos = start;
1427                 for (u32 l = 0; l<len;)
1428                 {
1429                         ++used;
1430                         if (((c2[l] >> 6) & 0x03) == 0x02)
1431                         {       // Invalid continuation byte.
1432                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1433                                 ++l;
1434                         }
1435                         else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1436                         {       // Invalid byte - overlong encoding.
1437                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1438                                 ++l;
1439                         }
1440                         else if ((c2[l] & 0xF8) == 0xF0)
1441                         {       // 4 bytes UTF-8, 2 bytes UTF-16.
1442                                 // Check for a full string.
1443                                 if ((l + 3) >= len)
1444                                 {
1445                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1446                                         l += 3;
1447                                         break;
1448                                 }
1449
1450                                 // Validate.
1451                                 bool valid = true;
1452                                 u8 l2 = 0;
1453                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1454                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1455                                 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1456                                 if (!valid)
1457                                 {
1458                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1459                                         l += l2;
1460                                         continue;
1461                                 }
1462
1463                                 // Decode.
1464                                 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1465                                 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1466                                 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1467                                 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1468
1469                                 // Split v up into a surrogate pair.
1470                                 uchar16_t x = static_cast<uchar16_t>(v);
1471                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1472                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1473
1474                                 array[pos++] = vh;
1475                                 array[pos++] = vl;
1476                                 l += 4;
1477                                 ++used;         // Using two shorts this time, so increase used by 1.
1478                         }
1479                         else if ((c2[l] & 0xF0) == 0xE0)
1480                         {       // 3 bytes UTF-8, 1 byte UTF-16.
1481                                 // Check for a full string.
1482                                 if ((l + 2) >= len)
1483                                 {
1484                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1485                                         l += 2;
1486                                         break;
1487                                 }
1488
1489                                 // Validate.
1490                                 bool valid = true;
1491                                 u8 l2 = 0;
1492                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1493                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1494                                 if (!valid)
1495                                 {
1496                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1497                                         l += l2;
1498                                         continue;
1499                                 }
1500
1501                                 // Decode.
1502                                 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1503                                 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1504                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1505                                 array[pos++] = ch;
1506                                 l += 3;
1507                         }
1508                         else if ((c2[l] & 0xE0) == 0xC0)
1509                         {       // 2 bytes UTF-8, 1 byte UTF-16.
1510                                 // Check for a full string.
1511                                 if ((l + 1) >= len)
1512                                 {
1513                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1514                                         l += 1;
1515                                         break;
1516                                 }
1517
1518                                 // Validate.
1519                                 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1520                                 {
1521                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1522                                         ++l;
1523                                         continue;
1524                                 }
1525
1526                                 // Decode.
1527                                 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1528                                 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1529                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1530                                 array[pos++] = ch;
1531                                 l += 2;
1532                         }
1533                         else
1534                         {       // 1 byte UTF-8, 1 byte UTF-16.
1535                                 // Validate.
1536                                 if (c2[l] > 0x7F)
1537                                 {       // Values above 0xF4 are restricted and aren't used.  By now, anything above 0x7F is invalid.
1538                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1539                                 }
1540                                 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1541                                 ++l;
1542                         }
1543                 }
1544                 array[used] = 0;
1545
1546                 // Validate our new UTF-16 string.
1547                 validate();
1548
1549                 return *this;
1550         }
1551
1552
1553         //! Appends a UTF-16 string to this ustring16
1554         //! \param other The UTF-16 string to append.
1555         //! \param length The length of the string to append.
1556         //! \return A reference to our current string.
1557         ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1558         {
1559                 if (!other)
1560                         return *this;
1561
1562                 // Determine if the string is long enough for a BOM.
1563                 u32 len = 0;
1564                 const uchar16_t* p = other;
1565                 do
1566                 {
1567                         ++len;
1568                 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1569
1570                 // Check for the BOM to determine the string's endianness.
1571                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1572                 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1573                         c_end = unicode::EUTFEE_LITTLE;
1574                 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1575                         c_end = unicode::EUTFEE_BIG;
1576
1577                 // If a BOM was found, don't include it in the string.
1578                 const uchar16_t* c2 = other;
1579                 if (c_end != unicode::EUTFEE_NATIVE)
1580                 {
1581                         c2 = other + unicode::BOM_UTF16_LEN;
1582                         length -= unicode::BOM_UTF16_LEN;
1583                 }
1584
1585                 // Calculate the size of the string to read in.
1586                 len = 0;
1587                 p = c2;
1588                 do
1589                 {
1590                         ++len;
1591                 } while(*p++ && len < length);
1592                 if (len > length)
1593                         len = length;
1594
1595                 // If we need to grow the size of the array, do it now.
1596                 if (used + len >= allocated)
1597                         reallocate(used + (len * 2));
1598                 u32 start = used;
1599                 used += len;
1600
1601                 // Copy the string now.
1602                 unicode::EUTF_ENDIAN m_end = getEndianness();
1603                 for (u32 l = start; l < start + len; ++l)
1604                 {
1605                         array[l] = (uchar16_t)c2[l];
1606                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1607                                 array[l] = unicode::swapEndian16(array[l]);
1608                 }
1609
1610                 array[used] = 0;
1611
1612                 // Validate our new UTF-16 string.
1613                 validate();
1614                 return *this;
1615         }
1616
1617
1618         //! Appends a UTF-32 string to this ustring16
1619         //! \param other The UTF-32 string to append.
1620         //! \param length The length of the string to append.
1621         //! \return A reference to our current string.
1622         ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1623         {
1624                 if (!other)
1625                         return *this;
1626
1627                 // Check for the BOM to determine the string's endianness.
1628                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1629                 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1630                         c_end = unicode::EUTFEE_LITTLE;
1631                 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1632                         c_end = unicode::EUTFEE_BIG;
1633
1634                 // If a BOM was found, don't include it in the string.
1635                 const uchar32_t* c2 = other;
1636                 if (c_end != unicode::EUTFEE_NATIVE)
1637                 {
1638                         c2 = other + unicode::BOM_UTF32_LEN;
1639                         length -= unicode::BOM_UTF32_LEN;
1640                 }
1641
1642                 // Calculate the size of the string to read in.
1643                 u32 len = 0;
1644                 const uchar32_t* p = c2;
1645                 do
1646                 {
1647                         ++len;
1648                 } while(*p++ && len < length);
1649                 if (len > length)
1650                         len = length;
1651
1652                 // If we need to grow the size of the array, do it now.
1653                 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1654                 if (used + (len * 2) >= allocated)
1655                         reallocate(used + ((len * 2) * 2));
1656                 u32 start = used;
1657
1658                 // Convert UTF-32 to UTF-16.
1659                 unicode::EUTF_ENDIAN m_end = getEndianness();
1660                 u32 pos = start;
1661                 for (u32 l = 0; l<len; ++l)
1662                 {
1663                         ++used;
1664
1665                         uchar32_t ch = c2[l];
1666                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1667                                 ch = unicode::swapEndian32(ch);
1668
1669                         if (ch > 0xFFFF)
1670                         {
1671                                 // Split ch up into a surrogate pair as it is over 16 bits long.
1672                                 uchar16_t x = static_cast<uchar16_t>(ch);
1673                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1674                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1675                                 array[pos++] = vh;
1676                                 array[pos++] = vl;
1677                                 ++used;         // Using two shorts, so increased used again.
1678                         }
1679                         else if (ch >= 0xD800 && ch <= 0xDFFF)
1680                         {
1681                                 // Between possible UTF-16 surrogates (invalid!)
1682                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1683                         }
1684                         else array[pos++] = static_cast<uchar16_t>(ch);
1685                 }
1686                 array[used] = 0;
1687
1688                 // Validate our new UTF-16 string.
1689                 validate();
1690
1691                 return *this;
1692         }
1693
1694
1695         //! Appends a ustring16 to this ustring16
1696         //! \param other The string to append to this one.
1697         //! \return A reference to our current string.
1698         ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1699         {
1700                 const uchar16_t* oa = other.c_str();
1701
1702                 u32 len = other.size_raw();
1703
1704                 if (used + len >= allocated)
1705                         reallocate(used + len);
1706
1707                 for (u32 l=0; l<len; ++l)
1708                         array[used+l] = oa[l];
1709
1710                 used += len;
1711                 array[used] = 0;
1712
1713                 return *this;
1714         }
1715
1716
1717         //! Appends a certain amount of characters of a ustring16 to this ustring16.
1718         //! \param other The string to append to this one.
1719         //! \param length How many characters of the other string to add to this one.
1720         //! \return A reference to our current string.
1721         ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1722         {
1723                 if (other.size() == 0)
1724                         return *this;
1725
1726                 if (other.size() < length)
1727                 {
1728                         append(other);
1729                         return *this;
1730                 }
1731
1732                 if (used + length * 2 >= allocated)
1733                         reallocate(used + length * 2);
1734
1735                 const_iterator iter(other, 0);
1736                 u32 l = length;
1737                 while (!iter.atEnd() && l)
1738                 {
1739                         uchar32_t c = *iter;
1740                         append(c);
1741                         ++iter;
1742                         --l;
1743                 }
1744
1745                 return *this;
1746         }
1747
1748
1749         //! Reserves some memory.
1750         //! \param count The amount of characters to reserve.
1751         void reserve(u32 count)
1752         {
1753                 if (count < allocated)
1754                         return;
1755
1756                 reallocate(count);
1757         }
1758
1759
1760         //! Finds first occurrence of character.
1761         //! \param c The character to search for.
1762         //! \return Position where the character has been found, or -1 if not found.
1763         s32 findFirst(uchar32_t c) const
1764         {
1765                 const_iterator i(*this, 0);
1766
1767                 s32 pos = 0;
1768                 while (!i.atEnd())
1769                 {
1770                         uchar32_t t = *i;
1771                         if (c == t)
1772                                 return pos;
1773                         ++pos;
1774                         ++i;
1775                 }
1776
1777                 return -1;
1778         }
1779
1780         //! Finds first occurrence of a character of a list.
1781         //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1782         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1783         //! \return Position where one of the characters has been found, or -1 if not found.
1784         s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1785         {
1786                 if (!c || !count)
1787                         return -1;
1788
1789                 const_iterator i(*this, 0);
1790
1791                 s32 pos = 0;
1792                 while (!i.atEnd())
1793                 {
1794                         uchar32_t t = *i;
1795                         for (u32 j=0; j<count; ++j)
1796                                 if (t == c[j])
1797                                         return pos;
1798                         ++pos;
1799                         ++i;
1800                 }
1801
1802                 return -1;
1803         }
1804
1805
1806         //! Finds first position of a character not in a given list.
1807         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1808         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1809         //! \return Position where the character has been found, or -1 if not found.
1810         s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1811         {
1812                 if (!c || !count)
1813                         return -1;
1814
1815                 const_iterator i(*this, 0);
1816
1817                 s32 pos = 0;
1818                 while (!i.atEnd())
1819                 {
1820                         uchar32_t t = *i;
1821                         u32 j;
1822                         for (j=0; j<count; ++j)
1823                                 if (t == c[j])
1824                                         break;
1825
1826                         if (j==count)
1827                                 return pos;
1828                         ++pos;
1829                         ++i;
1830                 }
1831
1832                 return -1;
1833         }
1834
1835         //! Finds last position of a character not in a given list.
1836         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1837         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1838         //! \return Position where the character has been found, or -1 if not found.
1839         s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1840         {
1841                 if (!c || !count)
1842                         return -1;
1843
1844                 const_iterator i(end());
1845                 --i;
1846
1847                 s32 pos = size() - 1;
1848                 while (!i.atStart())
1849                 {
1850                         uchar32_t t = *i;
1851                         u32 j;
1852                         for (j=0; j<count; ++j)
1853                                 if (t == c[j])
1854                                         break;
1855
1856                         if (j==count)
1857                                 return pos;
1858                         --pos;
1859                         --i;
1860                 }
1861
1862                 return -1;
1863         }
1864
1865         //! Finds next occurrence of character.
1866         //! \param c The character to search for.
1867         //! \param startPos The position in the string to start searching.
1868         //! \return Position where the character has been found, or -1 if not found.
1869         s32 findNext(uchar32_t c, u32 startPos) const
1870         {
1871                 const_iterator i(*this, startPos);
1872
1873                 s32 pos = startPos;
1874                 while (!i.atEnd())
1875                 {
1876                         uchar32_t t = *i;
1877                         if (t == c)
1878                                 return pos;
1879                         ++pos;
1880                         ++i;
1881                 }
1882
1883                 return -1;
1884         }
1885
1886
1887         //! Finds last occurrence of character.
1888         //! \param c The character to search for.
1889         //! \param start The start position of the reverse search ( default = -1, on end ).
1890         //! \return Position where the character has been found, or -1 if not found.
1891         s32 findLast(uchar32_t c, s32 start = -1) const
1892         {
1893                 u32 s = size();
1894                 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1895
1896                 const_iterator i(*this, start);
1897                 u32 pos = start;
1898                 while (!i.atStart())
1899                 {
1900                         uchar32_t t = *i;
1901                         if (t == c)
1902                                 return pos;
1903                         --pos;
1904                         --i;
1905                 }
1906
1907                 return -1;
1908         }
1909
1910         //! Finds last occurrence of a character in a list.
1911         //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1912         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1913         //! \return Position where one of the characters has been found, or -1 if not found.
1914         s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1915         {
1916                 if (!c || !count)
1917                         return -1;
1918
1919                 const_iterator i(end());
1920                 --i;
1921
1922                 s32 pos = size();
1923                 while (!i.atStart())
1924                 {
1925                         uchar32_t t = *i;
1926                         for (u32 j=0; j<count; ++j)
1927                                 if (t == c[j])
1928                                         return pos;
1929                         --pos;
1930                         --i;
1931                 }
1932
1933                 return -1;
1934         }
1935
1936
1937         //! Finds another ustring16 in this ustring16.
1938         //! \param str The string to find.
1939         //! \param start The start position of the search.
1940         //! \return Positions where the ustring16 has been found, or -1 if not found.
1941         s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1942         {
1943                 u32 my_size = size();
1944                 u32 their_size = str.size();
1945
1946                 if (their_size == 0 || my_size - start < their_size)
1947                         return -1;
1948
1949                 const_iterator i(*this, start);
1950
1951                 s32 pos = start;
1952                 while (!i.atEnd())
1953                 {
1954                         const_iterator i2(i);
1955                         const_iterator j(str, 0);
1956                         uchar32_t t1 = (uchar32_t)*i2;
1957                         uchar32_t t2 = (uchar32_t)*j;
1958                         while (t1 == t2)
1959                         {
1960                                 ++i2;
1961                                 ++j;
1962                                 if (j.atEnd())
1963                                         return pos;
1964                                 t1 = (uchar32_t)*i2;
1965                                 t2 = (uchar32_t)*j;
1966                         }
1967                         ++i;
1968                         ++pos;
1969                 }
1970
1971                 return -1;
1972         }
1973
1974
1975         //! Finds another ustring16 in this ustring16.
1976         //! \param str The string to find.
1977         //! \param start The start position of the search.
1978         //! \return Positions where the string has been found, or -1 if not found.
1979         s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1980         {
1981                 const uchar16_t* data = str.c_str();
1982                 if (data && *data)
1983                 {
1984                         u32 len = 0;
1985
1986                         while (data[len])
1987                                 ++len;
1988
1989                         if (len > used)
1990                                 return -1;
1991
1992                         for (u32 i=start; i<=used-len; ++i)
1993                         {
1994                                 u32 j=0;
1995
1996                                 while(data[j] && array[i+j] == data[j])
1997                                         ++j;
1998
1999                                 if (!data[j])
2000                                         return i;
2001                         }
2002                 }
2003
2004                 return -1;
2005         }
2006
2007
2008         //! Returns a substring.
2009         //! \param begin: Start of substring.
2010         //! \param length: Length of substring.
        //! \return A new ustring16 holding the requested substring (empty if begin is past the end or length is not positive).
2012         ustring16<TAlloc> subString(u32 begin, s32 length) const
2013         {
2014                 u32 len = size();
2015                 // if start after ustring16
2016                 // or no proper substring length
2017                 if ((length <= 0) || (begin>=len))
2018                         return ustring16<TAlloc>("");
2019                 // clamp length to maximal value
2020                 if ((length+begin) > len)
2021                         length = len-begin;
2022
2023                 ustring16<TAlloc> o;
2024                 o.reserve((length+1) * 2);
2025
2026                 const_iterator i(*this, begin);
2027                 while (!i.atEnd() && length)
2028                 {
2029                         o.append(*i);
2030                         ++i;
2031                         --length;
2032                 }
2033
2034                 return o;
2035         }
2036
2037
2038         //! Appends a character to this ustring16.
2039         //! \param c Character to append.
2040         //! \return A reference to our current string.
2041         ustring16<TAlloc>& operator += (char c)
2042         {
2043                 append((uchar32_t)c);
2044                 return *this;
2045         }
2046
2047
2048         //! Appends a character to this ustring16.
2049         //! \param c Character to append.
2050         //! \return A reference to our current string.
2051         ustring16<TAlloc>& operator += (uchar32_t c)
2052         {
2053                 append(c);
2054                 return *this;
2055         }
2056
2057
2058         //! Appends a number to this ustring16.
2059         //! \param c Number to append.
2060         //! \return A reference to our current string.
2061         ustring16<TAlloc>& operator += (short c)
2062         {
2063                 append(core::stringc(c));
2064                 return *this;
2065         }
2066
2067
2068         //! Appends a number to this ustring16.
2069         //! \param c Number to append.
2070         //! \return A reference to our current string.
2071         ustring16<TAlloc>& operator += (unsigned short c)
2072         {
2073                 append(core::stringc(c));
2074                 return *this;
2075         }
2076
2077
2078 #ifdef USTRING_CPP0X_NEWLITERALS
2079         //! Appends a number to this ustring16.
2080         //! \param c Number to append.
2081         //! \return A reference to our current string.
2082         ustring16<TAlloc>& operator += (int c)
2083         {
2084                 append(core::stringc(c));
2085                 return *this;
2086         }
2087
2088
2089         //! Appends a number to this ustring16.
2090         //! \param c Number to append.
2091         //! \return A reference to our current string.
2092         ustring16<TAlloc>& operator += (unsigned int c)
2093         {
2094                 append(core::stringc(c));
2095                 return *this;
2096         }
2097 #endif
2098
2099
2100         //! Appends a number to this ustring16.
2101         //! \param c Number to append.
2102         //! \return A reference to our current string.
2103         ustring16<TAlloc>& operator += (long c)
2104         {
2105                 append(core::stringc(c));
2106                 return *this;
2107         }
2108
2109
2110         //! Appends a number to this ustring16.
2111         //! \param c Number to append.
2112         //! \return A reference to our current string.
2113         ustring16<TAlloc>& operator += (unsigned long c)
2114         {
2115                 append(core::stringc(c));
2116                 return *this;
2117         }
2118
2119
2120         //! Appends a number to this ustring16.
2121         //! \param c Number to append.
2122         //! \return A reference to our current string.
2123         ustring16<TAlloc>& operator += (double c)
2124         {
2125                 append(core::stringc(c));
2126                 return *this;
2127         }
2128
2129
2130         //! Appends a char ustring16 to this ustring16.
2131         //! \param c Char ustring16 to append.
2132         //! \return A reference to our current string.
2133         ustring16<TAlloc>& operator += (const uchar16_t* const c)
2134         {
2135                 append(c);
2136                 return *this;
2137         }
2138
2139
2140         //! Appends a ustring16 to this ustring16.
2141         //! \param other ustring16 to append.
2142         //! \return A reference to our current string.
2143         ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2144         {
2145                 append(other);
2146                 return *this;
2147         }
2148
2149
2150         //! Replaces all characters of a given type with another one.
2151         //! \param toReplace Character to replace.
2152         //! \param replaceWith Character replacing the old one.
2153         //! \return A reference to our current string.
2154         ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2155         {
2156                 iterator i(*this, 0);
2157                 while (!i.atEnd())
2158                 {
2159                         typename ustring16<TAlloc>::access a = *i;
2160                         if ((uchar32_t)a == toReplace)
2161                                 a = replaceWith;
2162                         ++i;
2163                 }
2164                 return *this;
2165         }
2166
2167
2168         //! Replaces all instances of a string with another one.
2169         //! \param toReplace The string to replace.
2170         //! \param replaceWith The string replacing the old one.
2171         //! \return A reference to our current string.
2172         ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2173         {
2174                 if (toReplace.size() == 0)
2175                         return *this;
2176
2177                 const uchar16_t* other = toReplace.c_str();
2178                 const uchar16_t* replace = replaceWith.c_str();
2179                 const u32 other_size = toReplace.size_raw();
2180                 const u32 replace_size = replaceWith.size_raw();
2181
2182                 // Determine the delta.  The algorithm will change depending on the delta.
2183                 s32 delta = replace_size - other_size;
2184
2185                 // A character for character replace.  The string will not shrink or grow.
2186                 if (delta == 0)
2187                 {
2188                         s32 pos = 0;
2189                         while ((pos = find_raw(other, pos)) != -1)
2190                         {
2191                                 for (u32 i = 0; i < replace_size; ++i)
2192                                         array[pos + i] = replace[i];
2193                                 ++pos;
2194                         }
2195                         return *this;
2196                 }
2197
2198                 // We are going to be removing some characters.  The string will shrink.
2199                 if (delta < 0)
2200                 {
2201                         u32 i = 0;
2202                         for (u32 pos = 0; pos <= used; ++i, ++pos)
2203                         {
2204                                 // Is this potentially a match?
2205                                 if (array[pos] == *other)
2206                                 {
2207                                         // Check to see if we have a match.
2208                                         u32 j;
2209                                         for (j = 0; j < other_size; ++j)
2210                                         {
2211                                                 if (array[pos + j] != other[j])
2212                                                         break;
2213                                         }
2214
2215                                         // If we have a match, replace characters.
2216                                         if (j == other_size)
2217                                         {
2218                                                 for (j = 0; j < replace_size; ++j)
2219                                                         array[i + j] = replace[j];
2220                                                 i += replace_size - 1;
2221                                                 pos += other_size - 1;
2222                                                 continue;
2223                                         }
2224                                 }
2225
2226                                 // No match found, just copy characters.
2227                                 array[i - 1] = array[pos];
2228                         }
2229                         array[i] = 0;
2230                         used = i;
2231
2232                         return *this;
2233                 }
2234
2235                 // We are going to be adding characters, so the string size will increase.
2236                 // Count the number of times toReplace exists in the string so we can allocate the new size.
2237                 u32 find_count = 0;
2238                 s32 pos = 0;
2239                 while ((pos = find_raw(other, pos)) != -1)
2240                 {
2241                         ++find_count;
2242                         ++pos;
2243                 }
2244
2245                 // Re-allocate the string now, if needed.
2246                 u32 len = delta * find_count;
2247                 if (used + len >= allocated)
2248                         reallocate(used + len);
2249
2250                 // Start replacing.
2251                 pos = 0;
2252                 while ((pos = find_raw(other, pos)) != -1)
2253                 {
2254                         uchar16_t* start = array + pos + other_size - 1;
2255                         uchar16_t* ptr   = array + used;
2256                         uchar16_t* end   = array + used + delta;
2257
2258                         // Shift characters to make room for the string.
2259                         while (ptr != start)
2260                         {
2261                                 *end = *ptr;
2262                                 --ptr;
2263                                 --end;
2264                         }
2265
2266                         // Add the new string now.
2267                         for (u32 i = 0; i < replace_size; ++i)
2268                                 array[pos + i] = replace[i];
2269
2270                         pos += replace_size;
2271                         used += delta;
2272                 }
2273
2274                 // Terminate the string and return ourself.
2275                 array[used] = 0;
2276                 return *this;
2277         }
2278
2279
2280         //! Removes characters from a ustring16..
2281         //! \param c The character to remove.
2282         //! \return A reference to our current string.
2283         ustring16<TAlloc>& remove(uchar32_t c)
2284         {
2285                 u32 pos = 0;
2286                 u32 found = 0;
2287                 u32 len = (c > 0xFFFF ? 2 : 1);         // Remove characters equal to the size of c as a UTF-16 character.
2288                 for (u32 i=0; i<=used; ++i)
2289                 {
2290                         uchar32_t uc32 = 0;
2291                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2292                                 uc32 |= array[i];
2293                         else if (i + 1 <= used)
2294                         {
2295                                 // Convert the surrogate pair into a single UTF-32 character.
2296                                 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2297                         }
2298                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2299
2300                         if (uc32 == c)
2301                         {
2302                                 found += len;
2303                                 continue;
2304                         }
2305
2306                         array[pos++] = array[i];
2307                         if (len2 == 2)
2308                                 array[pos++] = array[++i];
2309                 }
2310                 used -= found;
2311                 array[used] = 0;
2312                 return *this;
2313         }
2314
2315
2316         //! Removes a ustring16 from the ustring16.
2317         //! \param toRemove The string to remove.
2318         //! \return A reference to our current string.
2319         ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2320         {
2321                 u32 size = toRemove.size_raw();
2322                 if (size == 0) return *this;
2323
2324                 const uchar16_t* tra = toRemove.c_str();
2325                 u32 pos = 0;
2326                 u32 found = 0;
2327                 for (u32 i=0; i<=used; ++i)
2328                 {
2329                         u32 j = 0;
2330                         while (j < size)
2331                         {
2332                                 if (array[i + j] != tra[j])
2333                                         break;
2334                                 ++j;
2335                         }
2336                         if (j == size)
2337                         {
2338                                 found += size;
2339                                 i += size - 1;
2340                                 continue;
2341                         }
2342
2343                         array[pos++] = array[i];
2344                 }
2345                 used -= found;
2346                 array[used] = 0;
2347                 return *this;
2348         }
2349
2350
2351         //! Removes characters from the ustring16.
2352         //! \param characters The characters to remove.
2353         //! \return A reference to our current string.
2354         ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2355         {
2356                 if (characters.size_raw() == 0)
2357                         return *this;
2358
2359                 u32 pos = 0;
2360                 u32 found = 0;
2361                 const_iterator iter(characters);
2362                 for (u32 i=0; i<=used; ++i)
2363                 {
2364                         uchar32_t uc32 = 0;
2365                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2366                                 uc32 |= array[i];
2367                         else if (i + 1 <= used)
2368                         {
2369                                 // Convert the surrogate pair into a single UTF-32 character.
2370                                 uc32 = unicode::toUTF32(array[i], array[i+1]);
2371                         }
2372                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2373
2374                         bool cont = false;
2375                         iter.toStart();
2376                         while (!iter.atEnd())
2377                         {
2378                                 uchar32_t c = *iter;
2379                                 if (uc32 == c)
2380                                 {
2381                                         found += (c > 0xFFFF ? 2 : 1);          // Remove characters equal to the size of c as a UTF-16 character.
2382                                         ++i;
2383                                         cont = true;
2384                                         break;
2385                                 }
2386                                 ++iter;
2387                         }
2388                         if (cont) continue;
2389
2390                         array[pos++] = array[i];
2391                         if (len2 == 2)
2392                                 array[pos++] = array[++i];
2393                 }
2394                 used -= found;
2395                 array[used] = 0;
2396                 return *this;
2397         }
2398
2399
2400         //! Trims the ustring16.
2401         //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2402         //! \param whitespace The characters that are to be considered as whitespace.
2403         //! \return A reference to our current string.
2404         ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2405         {
2406                 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2407
2408                 // find start and end of the substring without the specified characters
2409                 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2410                 if (begin == -1)
2411                         return (*this="");
2412
2413                 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2414
2415                 return (*this = subString(begin, (end +1) - begin));
2416         }
2417
2418
2419         //! Erases a character from the ustring16.
2420         //! May be slow, because all elements following after the erased element have to be copied.
2421         //! \param index Index of element to be erased.
2422         //! \return A reference to our current string.
2423         ustring16<TAlloc>& erase(u32 index)
2424         {
2425                 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2426
2427                 iterator i(*this, index);
2428
2429                 uchar32_t t = *i;
2430                 u32 len = (t > 0xFFFF ? 2 : 1);
2431
2432                 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2433                         array[j - len] = array[j];
2434
2435                 used -= len;
2436                 array[used] = 0;
2437
2438                 return *this;
2439         }
2440
2441
2442         //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2443         //! \return A reference to our current string.
2444         ustring16<TAlloc>& validate()
2445         {
2446                 // Validate all unicode characters.
2447                 for (u32 i=0; i<allocated; ++i)
2448                 {
2449                         // Terminate on existing null.
2450                         if (array[i] == 0)
2451                         {
2452                                 used = i;
2453                                 return *this;
2454                         }
2455                         if (UTF16_IS_SURROGATE(array[i]))
2456                         {
2457                                 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2458                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2459                                 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2460                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2461                                 ++i;
2462                         }
2463                         if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2464                                 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2465                 }
2466
2467                 // terminate
2468                 used = 0;
2469                 if (allocated > 0)
2470                 {
2471                         used = allocated - 1;
2472                         array[used] = 0;
2473                 }
2474                 return *this;
2475         }
2476
2477
2478         //! Gets the last char of the ustring16, or 0.
2479         //! \return The last char of the ustring16, or 0.
2480         uchar32_t lastChar() const
2481         {
2482                 if (used < 1)
2483                         return 0;
2484
2485                 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2486                 {
2487                         // Make sure we have a paired surrogate.
2488                         if (used < 2)
2489                                 return 0;
2490
2491                         // Check for an invalid surrogate.
2492                         if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2493                                 return 0;
2494
2495                         // Convert the surrogate pair into a single UTF-32 character.
2496                         return unicode::toUTF32(array[used-2], array[used-1]);
2497                 }
2498                 else
2499                 {
2500                         return array[used-1];
2501                 }
2502         }
2503
2504
2505         //! Split the ustring16 into parts.
2506         /** This method will split a ustring16 at certain delimiter characters
2507         into the container passed in as reference. The type of the container
2508         has to be given as template parameter. It must provide a push_back and
2509         a size method.
2510         \param ret The result container
2511         \param c C-style ustring16 of delimiter characters
2512         \param count Number of delimiter characters
2513         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2514         container. If two delimiters occur without a character in between, an
2515         empty substring would be placed in the result. If this flag is set,
2516         only non-empty strings are stored.
2517         \param keepSeparators Flag which allows to add the separator to the
2518         result ustring16. If this flag is true, the concatenation of the
2519         substrings results in the original ustring16. Otherwise, only the
2520         characters between the delimiters are returned.
2521         \return The number of resulting substrings
2522         */
2523         template<class container>
2524         u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2525         {
2526                 if (!c)
2527                         return 0;
2528
2529                 const_iterator i(*this);
2530                 const u32 oldSize=ret.size();
2531                 u32 pos = 0;
2532                 u32 lastpos = 0;
2533                 u32 lastpospos = 0;
2534                 bool lastWasSeparator = false;
2535                 while (!i.atEnd())
2536                 {
2537                         uchar32_t ch = *i;
2538                         bool foundSeparator = false;
2539                         for (u32 j=0; j<count; ++j)
2540                         {
2541                                 if (ch == c[j])
2542                                 {
2543                                         if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2544                                                         !lastWasSeparator)
2545                                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2546                                         foundSeparator = true;
2547                                         lastpos = (keepSeparators ? pos : pos + 1);
2548                                         lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2549                                         break;
2550                                 }
2551                         }
2552                         lastWasSeparator = foundSeparator;
2553                         ++pos;
2554                         ++i;
2555                 }
2556                 u32 s = size() + 1;
2557                 if (s > lastpos)
2558                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2559                 return ret.size()-oldSize;
2560         }
2561
2562
2563         //! Split the ustring16 into parts.
2564         /** This method will split a ustring16 at certain delimiter characters
2565         into the container passed in as reference. The type of the container
2566         has to be given as template parameter. It must provide a push_back and
2567         a size method.
2568         \param ret The result container
2569         \param c A unicode string of delimiter characters
2570         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2571         container. If two delimiters occur without a character in between, an
2572         empty substring would be placed in the result. If this flag is set,
2573         only non-empty strings are stored.
2574         \param keepSeparators Flag which allows to add the separator to the
2575         result ustring16. If this flag is true, the concatenation of the
2576         substrings results in the original ustring16. Otherwise, only the
2577         characters between the delimiters are returned.
2578         \return The number of resulting substrings
2579         */
2580         template<class container>
2581         u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2582         {
2583                 core::array<uchar32_t> v = c.toUTF32();
2584                 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2585         }
2586
2587
2588         //! Gets the size of the allocated memory buffer for the string.
2589         //! \return The size of the allocated memory buffer.
2590         u32 capacity() const
2591         {
2592                 return allocated;
2593         }
2594
2595
2596         //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2597         //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2598         u32 size_raw() const
2599         {
2600                 return used;
2601         }
2602
2603
2604         //! Inserts a character into the string.
2605         //! \param c The character to insert.
2606         //! \param pos The position to insert the character.
2607         //! \return A reference to our current string.
2608         ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2609         {
2610                 u8 len = (c > 0xFFFF ? 2 : 1);
2611
2612                 if (used + len >= allocated)
2613                         reallocate(used + len);
2614
2615                 used += len;
2616
2617                 iterator iter(*this, pos);
2618                 for (u32 i = used - 2; i > iter.getPos(); --i)
2619                         array[i] = array[i - len];
2620
2621                 if (c > 0xFFFF)
2622                 {
2623                         // c will be multibyte, so split it up into a surrogate pair.
2624                         uchar16_t x = static_cast<uchar16_t>(c);
2625                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2626                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2627                         array[iter.getPos()] = vh;
2628                         array[iter.getPos()+1] = vl;
2629                 }
2630                 else
2631                 {
2632                         array[iter.getPos()] = static_cast<uchar16_t>(c);
2633                 }
2634                 array[used] = 0;
2635                 return *this;
2636         }
2637
2638
2639         //! Inserts a string into the string.
2640         //! \param c The string to insert.
2641         //! \param pos The position to insert the string.
2642         //! \return A reference to our current string.
2643         ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2644         {
2645                 u32 len = c.size_raw();
2646                 if (len == 0) return *this;
2647
2648                 if (used + len >= allocated)
2649                         reallocate(used + len);
2650
2651                 used += len;
2652
2653                 iterator iter(*this, pos);
2654                 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2655                         array[i] = array[i - len];
2656
2657                 const uchar16_t* s = c.c_str();
2658                 for (u32 i = 0; i < len; ++i)
2659                 {
2660                         array[pos++] = *s;
2661                         ++s;
2662                 }
2663
2664                 array[used] = 0;
2665                 return *this;
2666         }
2667
2668
2669         //! Inserts a character into the string.
2670         //! \param c The character to insert.
2671         //! \param pos The position to insert the character.
2672         //! \return A reference to our current string.
2673         ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2674         {
2675                 if (used + 1 >= allocated)
2676                         reallocate(used + 1);
2677
2678                 ++used;
2679
2680                 for (u32 i = used - 1; i > pos; --i)
2681                         array[i] = array[i - 1];
2682
2683                 array[pos] = c;
2684                 array[used] = 0;
2685                 return *this;
2686         }
2687
2688
2689         //! Removes a character from string.
2690         //! \param pos Position of the character to remove.
2691         //! \return A reference to our current string.
2692         ustring16<TAlloc>& erase_raw(u32 pos)
2693         {
2694                 for (u32 i=pos; i<=used; ++i)
2695                 {
2696                         array[i] = array[i + 1];
2697                 }
2698                 --used;
2699                 array[used] = 0;
2700                 return *this;
2701         }
2702
2703
2704         //! Replaces a character in the string.
2705         //! \param c The new character.
2706         //! \param pos The position of the character to replace.
2707         //! \return A reference to our current string.
2708         ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2709         {
2710                 array[pos] = c;
2711                 return *this;
2712         }
2713
2714
2715         //! Returns an iterator to the beginning of the string.
2716         //! \return An iterator to the beginning of the string.
2717         iterator begin()
2718         {
2719                 iterator i(*this, 0);
2720                 return i;
2721         }
2722
2723
2724         //! Returns an iterator to the beginning of the string.
2725         //! \return An iterator to the beginning of the string.
2726         const_iterator begin() const
2727         {
2728                 const_iterator i(*this, 0);
2729                 return i;
2730         }
2731
2732
2733         //! Returns an iterator to the beginning of the string.
2734         //! \return An iterator to the beginning of the string.
2735         const_iterator cbegin() const
2736         {
2737                 const_iterator i(*this, 0);
2738                 return i;
2739         }
2740
2741
2742         //! Returns an iterator to the end of the string.
2743         //! \return An iterator to the end of the string.
2744         iterator end()
2745         {
2746                 iterator i(*this, 0);
2747                 i.toEnd();
2748                 return i;
2749         }
2750
2751
2752         //! Returns an iterator to the end of the string.
2753         //! \return An iterator to the end of the string.
2754         const_iterator end() const
2755         {
2756                 const_iterator i(*this, 0);
2757                 i.toEnd();
2758                 return i;
2759         }
2760
2761
2762         //! Returns an iterator to the end of the string.
2763         //! \return An iterator to the end of the string.
2764         const_iterator cend() const
2765         {
2766                 const_iterator i(*this, 0);
2767                 i.toEnd();
2768                 return i;
2769         }
2770
2771
2772         //! Converts the string to a UTF-8 encoded string.
2773         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2774         //! \return A string containing the UTF-8 encoded string.
2775         core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2776         {
2777                 core::string<uchar8_t> ret;
2778                 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2779                 const_iterator iter(*this, 0);
2780
2781                 // Add the byte order mark if the user wants it.
2782                 if (addBOM)
2783                 {
2784                         ret.append(unicode::BOM_ENCODE_UTF8[0]);
2785                         ret.append(unicode::BOM_ENCODE_UTF8[1]);
2786                         ret.append(unicode::BOM_ENCODE_UTF8[2]);
2787                 }
2788
2789                 while (!iter.atEnd())
2790                 {
2791                         uchar32_t c = *iter;
2792                         if (c > 0xFFFF)
2793                         {       // 4 bytes
2794                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2795                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2796                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2797                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2798                                 ret.append(b1);
2799                                 ret.append(b2);
2800                                 ret.append(b3);
2801                                 ret.append(b4);
2802                         }
2803                         else if (c > 0x7FF)
2804                         {       // 3 bytes
2805                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2806                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2807                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2808                                 ret.append(b1);
2809                                 ret.append(b2);
2810                                 ret.append(b3);
2811                         }
2812                         else if (c > 0x7F)
2813                         {       // 2 bytes
2814                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2815                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2816                                 ret.append(b1);
2817                                 ret.append(b2);
2818                         }
2819                         else
2820                         {       // 1 byte
2821                                 ret.append(static_cast<uchar8_t>(c));
2822                         }
2823                         ++iter;
2824                 }
2825                 return ret;
2826         }
2827
2828
2829         //! Converts the string to a UTF-8 encoded string array.
2830         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2831         //! \return An array containing the UTF-8 encoded string.
2832         core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2833         {
2834                 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2835                 const_iterator iter(*this, 0);
2836
2837                 // Add the byte order mark if the user wants it.
2838                 if (addBOM)
2839                 {
2840                         ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2841                         ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2842                         ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2843                 }
2844
2845                 while (!iter.atEnd())
2846                 {
2847                         uchar32_t c = *iter;
2848                         if (c > 0xFFFF)
2849                         {       // 4 bytes
2850                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2851                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2852                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2853                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2854                                 ret.push_back(b1);
2855                                 ret.push_back(b2);
2856                                 ret.push_back(b3);
2857                                 ret.push_back(b4);
2858                         }
2859                         else if (c > 0x7FF)
2860                         {       // 3 bytes
2861                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2862                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2863                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2864                                 ret.push_back(b1);
2865                                 ret.push_back(b2);
2866                                 ret.push_back(b3);
2867                         }
2868                         else if (c > 0x7F)
2869                         {       // 2 bytes
2870                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2871                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2872                                 ret.push_back(b1);
2873                                 ret.push_back(b2);
2874                         }
2875                         else
2876                         {       // 1 byte
2877                                 ret.push_back(static_cast<uchar8_t>(c));
2878                         }
2879                         ++iter;
2880                 }
2881                 ret.push_back(0);
2882                 return ret;
2883         }
2884
2885
#ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
        //! Converts the string to a UTF-16 encoded string.
        //! \param endian The desired endianness of the string.
        //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
        //! \return A string containing the UTF-16 encoded string.
        core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
        {
                core::string<char16_t> ret;
                ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);

                // Add the BOM if specified.  It is appended as a real character so that it
                // becomes part of the string; writing into the still-empty buffer would be
                // overwritten by the append of the text below.
                if (addBOM)
                {
                        char16_t bom = static_cast<char16_t>(unicode::BOM);
                        if (endian != unicode::EUTFEE_NATIVE)
                        {
                                // Lay the mark out byte by byte in the requested order.
                                uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(&bom);
                                if (endian == unicode::EUTFEE_LITTLE)
                                {
                                        ptr8[0] = unicode::BOM_ENCODE_UTF16_LE[0];
                                        ptr8[1] = unicode::BOM_ENCODE_UTF16_LE[1];
                                }
                                else
                                {
                                        ptr8[0] = unicode::BOM_ENCODE_UTF16_BE[0];
                                        ptr8[1] = unicode::BOM_ENCODE_UTF16_BE[1];
                                }
                        }
                        ret.append(bom);
                }

                // Code units before this index belong to the BOM, which is already in
                // its final byte order and must not be swapped again.
                const u32 textStart = ret.size();

                ret.append(array);
                if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
                {
                        // Swap in place through operator[]; the read and the write of each
                        // unit are separate, well-ordered operations.
                        for (u32 i = textStart; i < ret.size(); ++i)
                                ret[i] = unicode::swapEndian16(ret[i]);
                }
                return ret;
        }
#endif
2925
2926
2927         //! Converts the string to a UTF-16 encoded string array.
2928         //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2929         //! \param endian The desired endianness of the string.
2930         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2931         //! \return An array containing the UTF-16 encoded string.
2932         core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2933         {
2934                 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2935                 uchar16_t* ptr = ret.pointer();
2936
2937                 // Add the BOM if specified.
2938                 if (addBOM)
2939                 {
2940                         if (endian == unicode::EUTFEE_NATIVE)
2941                                 *ptr = unicode::BOM;
2942                         else if (endian == unicode::EUTFEE_LITTLE)
2943                         {
2944                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2945                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2946                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2947                         }
2948                         else
2949                         {
2950                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2951                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2952                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2953                         }
2954                         ++ptr;
2955                 }
2956
2957                 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2958                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2959                 {
2960                         for (u32 i = 0; i <= used; ++i)
2961                                 *ptr++ = unicode::swapEndian16(*ptr);
2962                 }
2963                 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2964                 ret.push_back(0);
2965                 return ret;
2966         }
2967
2968
2969 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
2970         //! Converts the string to a UTF-32 encoded string.
2971         //! \param endian The desired endianness of the string.
2972         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2973         //! \return A string containing the UTF-32 encoded string.
2974         core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2975         {
2976                 core::string<char32_t> ret;
2977                 ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
2978                 const_iterator iter(*this, 0);
2979
2980                 // Add the BOM if specified.
2981                 if (addBOM)
2982                 {
2983                         if (endian == unicode::EUTFEE_NATIVE)
2984                                 ret.append(unicode::BOM);
2985                         else
2986                         {
2987                                 union
2988                                 {
2989                                         uchar32_t full;
2990                                         u8 chunk[4];
2991                                 } t;
2992
2993                                 if (endian == unicode::EUTFEE_LITTLE)
2994                                 {
2995                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
2996                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
2997                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
2998                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
2999                                 }
3000                                 else
3001                                 {
3002                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3003                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3004                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3005                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3006                                 }
3007                                 ret.append(t.full);
3008                         }
3009                 }
3010
3011                 while (!iter.atEnd())
3012                 {
3013                         uchar32_t c = *iter;
3014                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3015                                 c = unicode::swapEndian32(c);
3016                         ret.append(c);
3017                         ++iter;
3018                 }
3019                 return ret;
3020         }
3021 #endif
3022
3023
3024         //! Converts the string to a UTF-32 encoded string array.
3025         //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
3026         //! \param endian The desired endianness of the string.
3027         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3028         //! \return An array containing the UTF-32 encoded string.
3029         core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3030         {
3031                 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
3032                 const_iterator iter(*this, 0);
3033
3034                 // Add the BOM if specified.
3035                 if (addBOM)
3036                 {
3037                         if (endian == unicode::EUTFEE_NATIVE)
3038                                 ret.push_back(unicode::BOM);
3039                         else
3040                         {
3041                                 union
3042                                 {
3043                                         uchar32_t full;
3044                                         u8 chunk[4];
3045                                 } t;
3046
3047                                 if (endian == unicode::EUTFEE_LITTLE)
3048                                 {
3049                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3050                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3051                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3052                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3053                                 }
3054                                 else
3055                                 {
3056                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3057                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3058                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3059                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3060                                 }
3061                                 ret.push_back(t.full);
3062                         }
3063                 }
3064                 ret.push_back(0);
3065
3066                 while (!iter.atEnd())
3067                 {
3068                         uchar32_t c = *iter;
3069                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3070                                 c = unicode::swapEndian32(c);
3071                         ret.push_back(c);
3072                         ++iter;
3073                 }
3074                 return ret;
3075         }
3076
3077
3078         //! Converts the string to a wchar_t encoded string.
3079         /** The size of a wchar_t changes depending on the platform.  This function will store a
3080         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3081         //! \param endian The desired endianness of the string.
3082         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3083         //! \return A string containing the wchar_t encoded string.
3084         core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3085         {
3086                 if (sizeof(wchar_t) == 4)
3087                 {
3088                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3089                         core::stringw ret(a.pointer());
3090                         return ret;
3091                 }
3092                 else if (sizeof(wchar_t) == 2)
3093                 {
3094                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3095                         {
3096                                 core::stringw ret(array);
3097                                 return ret;
3098                         }
3099                         else
3100                         {
3101                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3102                                 core::stringw ret(a.pointer());
3103                                 return ret;
3104                         }
3105                 }
3106                 else if (sizeof(wchar_t) == 1)
3107                 {
3108                         core::array<uchar8_t> a(toUTF8(addBOM));
3109                         core::stringw ret(a.pointer());
3110                         return ret;
3111                 }
3112
3113                 // Shouldn't happen.
3114                 return core::stringw();
3115         }
3116
3117
3118         //! Converts the string to a wchar_t encoded string array.
3119         /** The size of a wchar_t changes depending on the platform.  This function will store a
3120         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3121         //! \param endian The desired endianness of the string.
3122         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3123         //! \return An array containing the wchar_t encoded string.
3124         core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3125         {
3126                 if (sizeof(wchar_t) == 4)
3127                 {
3128                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3129                         core::array<wchar_t> ret(a.size());
3130                         ret.set_used(a.size());
3131                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
3132                         return ret;
3133                 }
3134                 if (sizeof(wchar_t) == 2)
3135                 {
3136                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3137                         {
3138                                 core::array<wchar_t> ret(used);
3139                                 ret.set_used(used);
3140                                 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
3141                                 return ret;
3142                         }
3143                         else
3144                         {
3145                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3146                                 core::array<wchar_t> ret(a.size());
3147                                 ret.set_used(a.size());
3148                                 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
3149                                 return ret;
3150                         }
3151                 }
3152                 if (sizeof(wchar_t) == 1)
3153                 {
3154                         core::array<uchar8_t> a(toUTF8(addBOM));
3155                         core::array<wchar_t> ret(a.size());
3156                         ret.set_used(a.size());
3157                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3158                         return ret;
3159                 }
3160
3161                 // Shouldn't happen.
3162                 return core::array<wchar_t>();
3163         }
3164
3165         //! Converts the string to a properly encoded io::path string.
3166         //! \param endian The desired endianness of the string.
3167         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3168         //! \return An io::path string containing the properly encoded string.
3169         io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3170         {
3171 #if defined(_IRR_WCHAR_FILESYSTEM)
3172                 return toWCHAR_s(endian, addBOM);
3173 #else
3174                 return toUTF8_s(addBOM);
3175 #endif
3176         }
3177
3178         //! Loads an unknown stream of data.
3179         //! Will attempt to determine if the stream is unicode data.  Useful for loading from files.
3180         //! \param data The data stream to load from.
3181         //! \param data_size The length of the data string.
3182         //! \return A reference to our current string.
3183         ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3184         {
3185                 // Clear our string.
3186                 *this = "";
3187                 if (!data)
3188                         return *this;
3189
3190                 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3191                 switch (e)
3192                 {
3193                         default:
3194                         case unicode::EUTFE_UTF8:
3195                                 append((uchar8_t*)data, data_size);
3196                                 break;
3197
3198                         case unicode::EUTFE_UTF16:
3199                         case unicode::EUTFE_UTF16_BE:
3200                         case unicode::EUTFE_UTF16_LE:
3201                                 append((uchar16_t*)data, data_size / 2);
3202                                 break;
3203
3204                         case unicode::EUTFE_UTF32:
3205                         case unicode::EUTFE_UTF32_BE:
3206                         case unicode::EUTFE_UTF32_LE:
3207                                 append((uchar32_t*)data, data_size / 4);
3208                                 break;
3209                 }
3210
3211                 return *this;
3212         }
3213
3214         //! Gets the encoding of the Unicode string this class contains.
3215         //! \return An enum describing the current encoding of this string.
3216         const unicode::EUTF_ENCODE getEncoding() const
3217         {
3218                 return encoding;
3219         }
3220
3221         //! Gets the endianness of the Unicode string this class contains.
3222         //! \return An enum describing the endianness of this string.
3223         const unicode::EUTF_ENDIAN getEndianness() const
3224         {
3225                 if (encoding == unicode::EUTFE_UTF16_LE ||
3226                         encoding == unicode::EUTFE_UTF32_LE)
3227                         return unicode::EUTFEE_LITTLE;
3228                 else return unicode::EUTFEE_BIG;
3229         }
3230
3231 private:
3232
3233         //! Reallocate the string, making it bigger or smaller.
3234         //! \param new_size The new size of the string.
3235         void reallocate(u32 new_size)
3236         {
3237                 uchar16_t* old_array = array;
3238
3239                 array = allocator.allocate(new_size + 1); //new u16[new_size];
3240                 allocated = new_size + 1;
3241                 if (old_array == 0) return;
3242
3243                 u32 amount = used < new_size ? used : new_size;
3244                 for (u32 i=0; i<=amount; ++i)
3245                         array[i] = old_array[i];
3246
3247                 if (allocated <= used)
3248                         used = allocated - 1;
3249
3250                 array[used] = 0;
3251
3252                 allocator.deallocate(old_array); // delete [] old_array;
3253         }
3254
        //--- member variables

        //! Buffer of UTF-16 code units.  reallocate() obtains it from `allocator` and writes a 0 at array[used].
        uchar16_t* array;
        //! Encoding tag returned by getEncoding() and inspected by getEndianness().
        unicode::EUTF_ENCODE encoding;
        //! Number of elements in `array`; reallocate() sets it to new_size + 1.
        u32 allocated;
        //! Number of code units in use; the conversion routines copy this many elements from `array`.
        u32 used;
        //! Allocator that reallocate() uses to allocate and release `array`.
        TAlloc allocator;
        //irrAllocator<uchar16_t> allocator;
3263 };
3264
//! Convenience name for ustring16 instantiated with irrAllocator<uchar16_t> as its allocator.
typedef ustring16<irrAllocator<uchar16_t> > ustring;
3266
3267
3268 //! Appends two ustring16s.
3269 template <typename TAlloc>
3270 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3271 {
3272         ustring16<TAlloc> ret(left);
3273         ret += right;
3274         return ret;
3275 }
3276
3277
3278 //! Appends a ustring16 and a null-terminated unicode string.
3279 template <typename TAlloc, class B>
3280 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3281 {
3282         ustring16<TAlloc> ret(left);
3283         ret += right;
3284         return ret;
3285 }
3286
3287
3288 //! Appends a ustring16 and a null-terminated unicode string.
3289 template <class B, typename TAlloc>
3290 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3291 {
3292         ustring16<TAlloc> ret(left);
3293         ret += right;
3294         return ret;
3295 }
3296
3297
3298 //! Appends a ustring16 and an Irrlicht string.
3299 template <typename TAlloc, typename B, typename BAlloc>
3300 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3301 {
3302         ustring16<TAlloc> ret(left);
3303         ret += right;
3304         return ret;
3305 }
3306
3307
3308 //! Appends a ustring16 and an Irrlicht string.
3309 template <typename TAlloc, typename B, typename BAlloc>
3310 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
3311 {
3312         ustring16<TAlloc> ret(left);
3313         ret += right;
3314         return ret;
3315 }
3316
3317
3318 //! Appends a ustring16 and a std::basic_string.
3319 template <typename TAlloc, typename B, typename A, typename BAlloc>
3320 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3321 {
3322         ustring16<TAlloc> ret(left);
3323         ret += right;
3324         return ret;
3325 }
3326
3327
3328 //! Appends a ustring16 and a std::basic_string.
3329 template <typename TAlloc, typename B, typename A, typename BAlloc>
3330 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3331 {
3332         ustring16<TAlloc> ret(left);
3333         ret += right;
3334         return ret;
3335 }
3336
3337
3338 //! Appends a ustring16 and a char.
3339 template <typename TAlloc>
3340 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3341 {
3342         ustring16<TAlloc> ret(left);
3343         ret += right;
3344         return ret;
3345 }
3346
3347
3348 //! Appends a ustring16 and a char.
3349 template <typename TAlloc>
3350 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3351 {
3352         ustring16<TAlloc> ret(left);
3353         ret += right;
3354         return ret;
3355 }
3356
3357
3358 #ifdef USTRING_CPP0X_NEWLITERALS
3359 //! Appends a ustring16 and a uchar32_t.
3360 template <typename TAlloc>
3361 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
3362 {
3363         ustring16<TAlloc> ret(left);
3364         ret += right;
3365         return ret;
3366 }
3367
3368
3369 //! Appends a ustring16 and a uchar32_t.
3370 template <typename TAlloc>
3371 inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
3372 {
3373         ustring16<TAlloc> ret(left);
3374         ret += right;
3375         return ret;
3376 }
3377 #endif
3378
3379
3380 //! Appends a ustring16 and a short.
3381 template <typename TAlloc>
3382 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3383 {
3384         ustring16<TAlloc> ret(left);
3385         ret += core::stringc(right);
3386         return ret;
3387 }
3388
3389
3390 //! Appends a ustring16 and a short.
3391 template <typename TAlloc>
3392 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3393 {
3394         ustring16<TAlloc> ret(core::stringc(left));
3395         ret += right;
3396         return ret;
3397 }
3398
3399
3400 //! Appends a ustring16 and an unsigned short.
3401 template <typename TAlloc>
3402 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3403 {
3404         ustring16<TAlloc> ret(left);
3405         ret += core::stringc(right);
3406         return ret;
3407 }
3408
3409
3410 //! Appends a ustring16 and an unsigned short.
3411 template <typename TAlloc>
3412 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3413 {
3414         ustring16<TAlloc> ret(core::stringc(left));
3415         ret += right;
3416         return ret;
3417 }
3418
3419
3420 //! Appends a ustring16 and an int.
3421 template <typename TAlloc>
3422 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3423 {
3424         ustring16<TAlloc> ret(left);
3425         ret += core::stringc(right);
3426         return ret;
3427 }
3428
3429
3430 //! Appends a ustring16 and an int.
3431 template <typename TAlloc>
3432 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3433 {
3434         ustring16<TAlloc> ret(core::stringc(left));
3435         ret += right;
3436         return ret;
3437 }
3438
3439
3440 //! Appends a ustring16 and an unsigned int.
3441 template <typename TAlloc>
3442 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3443 {
3444         ustring16<TAlloc> ret(left);
3445         ret += core::stringc(right);
3446         return ret;
3447 }
3448
3449
3450 //! Appends a ustring16 and an unsigned int.
3451 template <typename TAlloc>
3452 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3453 {
3454         ustring16<TAlloc> ret(core::stringc(left));
3455         ret += right;
3456         return ret;
3457 }
3458
3459
3460 //! Appends a ustring16 and a long.
3461 template <typename TAlloc>
3462 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3463 {
3464         ustring16<TAlloc> ret(left);
3465         ret += core::stringc(right);
3466         return ret;
3467 }
3468
3469
3470 //! Appends a ustring16 and a long.
3471 template <typename TAlloc>
3472 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3473 {
3474         ustring16<TAlloc> ret(core::stringc(left));
3475         ret += right;
3476         return ret;
3477 }
3478
3479
3480 //! Appends a ustring16 and an unsigned long.
3481 template <typename TAlloc>
3482 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3483 {
3484         ustring16<TAlloc> ret(left);
3485         ret += core::stringc(right);
3486         return ret;
3487 }
3488
3489
3490 //! Appends a ustring16 and an unsigned long.
3491 template <typename TAlloc>
3492 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3493 {
3494         ustring16<TAlloc> ret(core::stringc(left));
3495         ret += right;
3496         return ret;
3497 }
3498
3499
3500 //! Appends a ustring16 and a float.
3501 template <typename TAlloc>
3502 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3503 {
3504         ustring16<TAlloc> ret(left);
3505         ret += core::stringc(right);
3506         return ret;
3507 }
3508
3509
3510 //! Appends a ustring16 and a float.
3511 template <typename TAlloc>
3512 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3513 {
3514         ustring16<TAlloc> ret(core::stringc(left));
3515         ret += right;
3516         return ret;
3517 }
3518
3519
3520 //! Appends a ustring16 and a double.
3521 template <typename TAlloc>
3522 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3523 {
3524         ustring16<TAlloc> ret(left);
3525         ret += core::stringc(right);
3526         return ret;
3527 }
3528
3529
3530 //! Appends a ustring16 and a double.
3531 template <typename TAlloc>
3532 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3533 {
3534         ustring16<TAlloc> ret(core::stringc(left));
3535         ret += right;
3536         return ret;
3537 }
3538
3539
3540 #ifdef USTRING_CPP0X
3541 //! Appends two ustring16s.
3542 template <typename TAlloc>
3543 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3544 {
3545         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3546         right.insert(left, 0);
3547         return std::move(right);
3548 }
3549
3550
3551 //! Appends two ustring16s.
3552 template <typename TAlloc>
3553 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3554 {
3555         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3556         left.append(right);
3557         return std::move(left);
3558 }
3559
3560
3561 //! Appends two ustring16s.
3562 template <typename TAlloc>
3563 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3564 {
3565         //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3566         if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3567                 (right.capacity() - right.size_raw() < left.size_raw()))
3568         {
3569                 left.append(right);
3570                 return std::move(left);
3571         }
3572         else
3573         {
3574                 right.insert(left, 0);
3575                 return std::move(right);
3576         }
3577 }
3578
3579
3580 //! Appends a ustring16 and a null-terminated unicode string.
3581 template <typename TAlloc, class B>
3582 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3583 {
3584         //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3585         left.append(right);
3586         return std::move(left);
3587 }
3588
3589
3590 //! Appends a ustring16 and a null-terminated unicode string.
3591 template <class B, typename TAlloc>
3592 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3593 {
3594         //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3595         right.insert(left, 0);
3596         return std::move(right);
3597 }
3598
3599
3600 //! Appends a ustring16 and an Irrlicht string.
3601 template <typename TAlloc, typename B, typename BAlloc>
3602 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3603 {
3604         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3605         right.insert(left, 0);
3606         return std::move(right);
3607 }
3608
3609
3610 //! Appends a ustring16 and an Irrlicht string.
3611 template <typename TAlloc, typename B, typename BAlloc>
3612 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3613 {
3614         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3615         left.append(right);
3616         return std::move(left);
3617 }
3618
3619
3620 //! Appends a ustring16 and a std::basic_string.
3621 template <typename TAlloc, typename B, typename A, typename BAlloc>
3622 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3623 {
3624         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3625         right.insert(core::ustring16<TAlloc>(left), 0);
3626         return std::move(right);
3627 }
3628
3629
3630 //! Appends a ustring16 and a std::basic_string.
3631 template <typename TAlloc, typename B, typename A, typename BAlloc>
3632 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3633 {
3634         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3635         left.append(right);
3636         return std::move(left);
3637 }
3638
3639
3640 //! Appends a ustring16 and a char.
3641 template <typename TAlloc>
3642 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3643 {
3644         left.append((uchar32_t)right);
3645         return std::move(left);
3646 }
3647
3648
3649 //! Appends a ustring16 and a char.
3650 template <typename TAlloc>
3651 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3652 {
3653         right.insert((uchar32_t)left, 0);
3654         return std::move(right);
3655 }
3656
3657
3658 #ifdef USTRING_CPP0X_NEWLITERALS
3659 //! Appends a ustring16 and a uchar32_t.
3660 template <typename TAlloc>
3661 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
3662 {
3663         left.append(right);
3664         return std::move(left);
3665 }
3666
3667
3668 //! Appends a ustring16 and a uchar32_t.
3669 template <typename TAlloc>
3670 inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
3671 {
3672         right.insert(left, 0);
3673         return std::move(right);
3674 }
3675 #endif
3676
3677
3678 //! Appends a ustring16 and a short.
3679 template <typename TAlloc>
3680 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3681 {
3682         left.append(core::stringc(right));
3683         return std::move(left);
3684 }
3685
3686
3687 //! Appends a ustring16 and a short.
3688 template <typename TAlloc>
3689 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3690 {
3691         right.insert(core::stringc(left), 0);
3692         return std::move(right);
3693 }
3694
3695
3696 //! Appends a ustring16 and an unsigned short.
3697 template <typename TAlloc>
3698 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3699 {
3700         left.append(core::stringc(right));
3701         return std::move(left);
3702 }
3703
3704
3705 //! Appends a ustring16 and an unsigned short.
3706 template <typename TAlloc>
3707 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3708 {
3709         right.insert(core::stringc(left), 0);
3710         return std::move(right);
3711 }
3712
3713
3714 //! Appends a ustring16 and an int.
3715 template <typename TAlloc>
3716 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3717 {
3718         left.append(core::stringc(right));
3719         return std::move(left);
3720 }
3721
3722
3723 //! Appends a ustring16 and an int.
3724 template <typename TAlloc>
3725 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3726 {
3727         right.insert(core::stringc(left), 0);
3728         return std::move(right);
3729 }
3730
3731
3732 //! Appends a ustring16 and an unsigned int.
3733 template <typename TAlloc>
3734 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3735 {
3736         left.append(core::stringc(right));
3737         return std::move(left);
3738 }
3739
3740
3741 //! Appends a ustring16 and an unsigned int.
3742 template <typename TAlloc>
3743 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3744 {
3745         right.insert(core::stringc(left), 0);
3746         return std::move(right);
3747 }
3748
3749
3750 //! Appends a ustring16 and a long.
3751 template <typename TAlloc>
3752 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3753 {
3754         left.append(core::stringc(right));
3755         return std::move(left);
3756 }
3757
3758
3759 //! Appends a ustring16 and a long.
3760 template <typename TAlloc>
3761 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3762 {
3763         right.insert(core::stringc(left), 0);
3764         return std::move(right);
3765 }
3766
3767
3768 //! Appends a ustring16 and an unsigned long.
3769 template <typename TAlloc>
3770 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3771 {
3772         left.append(core::stringc(right));
3773         return std::move(left);
3774 }
3775
3776
3777 //! Appends a ustring16 and an unsigned long.
3778 template <typename TAlloc>
3779 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3780 {
3781         right.insert(core::stringc(left), 0);
3782         return std::move(right);
3783 }
3784
3785
3786 //! Appends a ustring16 and a float.
3787 template <typename TAlloc>
3788 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3789 {
3790         left.append(core::stringc(right));
3791         return std::move(left);
3792 }
3793
3794
3795 //! Appends a ustring16 and a float.
3796 template <typename TAlloc>
3797 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3798 {
3799         right.insert(core::stringc(left), 0);
3800         return std::move(right);
3801 }
3802
3803
3804 //! Appends a ustring16 and a double.
3805 template <typename TAlloc>
3806 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3807 {
3808         left.append(core::stringc(right));
3809         return std::move(left);
3810 }
3811
3812
3813 //! Appends a ustring16 and a double.
3814 template <typename TAlloc>
3815 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3816 {
3817         right.insert(core::stringc(left), 0);
3818         return std::move(right);
3819 }
3820 #endif
3821
3822
3823 #ifndef USTRING_NO_STL
3824 //! Writes a ustring16 to an ostream.
3825 template <typename TAlloc>
3826 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3827 {
3828         out << in.toUTF8_s().c_str();
3829         return out;
3830 }
3831
3832 //! Writes a ustring16 to a wostream.
3833 template <typename TAlloc>
3834 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3835 {
3836         out << in.toWCHAR_s().c_str();
3837         return out;
3838 }
3839 #endif
3840
3841
3842 #ifndef USTRING_NO_STL
3843
3844 namespace unicode
3845 {
3846
3847 //! Hashing algorithm for hashing a ustring.  Used for things like unordered_maps.
3848 //! Algorithm taken from std::hash<std::string>.
3849 class hash : public std::unary_function<core::ustring, size_t>
3850 {
3851         public:
3852                 size_t operator()(const core::ustring& s) const
3853                 {
3854                         size_t ret = 2166136261U;
3855                         size_t index = 0;
3856                         size_t stride = 1 + s.size_raw() / 10;
3857
3858                         core::ustring::const_iterator i = s.begin();
3859                         while (i != s.end())
3860                         {
3861                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
3862                                 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3863                                 index += stride;
3864                                 i += stride;
3865                         }
3866                         return (ret);
3867                 }
3868 };
3869
3870 } // end namespace unicode
3871
3872 #endif
3873
3874 } // end namespace core
3875 } // end namespace irr
3876
3877 #endif