2 Basic Unicode string class for Irrlicht.
3 Copyright (c) 2009-2011 John Norman
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you
14 must not claim that you wrote the original software. If you use
15 this software in a product, an acknowledgment in the product
16 documentation would be appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
21 3. This notice may not be removed or altered from any source distribution.
24 The original version of this class can be located at:
25 http://irrlicht.suckerfreegames.com/
28 john@suckerfreegames.com
31 #ifndef __IRR_USTRING_H_INCLUDED__
32 #define __IRR_USTRING_H_INCLUDED__
34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
35 # define USTRING_CPP0X
36 # if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
37 # define USTRING_CPP0X_NEWLITERALS
49 #ifndef USTRING_NO_STL
56 #include "irrAllocator.h"
59 #include "irrString.h"
62 //! UTF-16 surrogate start values.
// 0xD800 is the first high-surrogate code unit, 0xDC00 the first low-surrogate code unit.
63 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
64 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
66 //! Is a UTF-16 code unit a surrogate?
// Mask 0xF800 matches the whole surrogate range 0xD800-0xDFFF; mask 0xFC00 narrows the
// test to the high half (0xD800-0xDBFF) or the low half (0xDC00-0xDFFF).
// The argument is parenthesised but evaluated once per macro, so it may be any expression.
67 #define UTF16_IS_SURROGATE(c) (((c) & 0xF800) == 0xD800)
68 #define UTF16_IS_SURROGATE_HI(c) (((c) & 0xFC00) == 0xD800)
69 #define UTF16_IS_SURROGATE_LO(c) (((c) & 0xFC00) == 0xDC00)
75 // Define our character types.
// When USTRING_CPP0X_NEWLITERALS is defined the built-in char32_t / char16_t types are used.
// NOTE(review): the u32 / u16 typedefs further down presumably sit in an #else branch whose
// #else / #endif lines (and a matching uchar8_t typedef) are not visible in this listing.
76 #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
77 typedef char32_t uchar32_t;
78 typedef char16_t uchar16_t;
79 typedef char uchar8_t;
81 typedef u32 uchar32_t;
82 typedef u16 uchar16_t;
92 //! The unicode replacement character. Used to replace invalid characters.
// Value is U+FFFD; it fits in a single UTF-16 code unit.
93 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
95 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
96 //! \param high The high value of the pair.
97 //! \param low The low value of the pair.
98 //! \return The UTF-32 character expressed by the surrogate pair.
99 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
101 // Convert the surrogate pair into a single UTF-32 character.
102 uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
103 uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
104 return (wu << 16) | x;
107 //! Swaps the endianness of a 16-bit value.
108 //! \return The new value.
109 inline uchar16_t swapEndian16(const uchar16_t& c)
111 return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
114 //! Swaps the endianness of a 32-bit value.
115 //! \return The new value.
116 inline uchar32_t swapEndian32(const uchar32_t& c)
118 return ((c >> 24) & 0x000000FF) |
119 ((c >> 8) & 0x0000FF00) |
120 ((c << 8) & 0x00FF0000) |
121 ((c << 24) & 0xFF000000);
124 //! The Unicode byte order mark.
125 const u16 BOM = 0xFEFF;
127 //! The size of the Unicode byte order mark in terms of the Unicode character size.
// i.e. counted in code units of the respective encoding: 3 bytes for UTF-8,
// one 16-bit unit for UTF-16, one 32-bit unit for UTF-32.
128 const u8 BOM_UTF8_LEN = 3;
129 const u8 BOM_UTF16_LEN = 1;
130 const u8 BOM_UTF32_LEN = 1;
132 //! Unicode byte order marks for file operations.
// Byte-wise encodings of the mark. Note that the UTF-16 LE bytes (FF FE) are a prefix of
// the UTF-32 LE bytes (FF FE 00 00), so code that sniffs a stream has to test the longer
// sequence first to tell the two apart.
133 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
134 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
135 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
136 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
137 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
139 //! The size in bytes of the Unicode byte marks for file operations.
// These match the array extents of the BOM_ENCODE_* arrays declared in this group.
140 const u8 BOM_ENCODE_UTF8_LEN = 3;
141 const u8 BOM_ENCODE_UTF16_LEN = 2;
142 const u8 BOM_ENCODE_UTF32_LEN = 4;
144 //! Unicode encoding type.
157 //! Unicode endianness.
165 //! Returns the specified unicode byte order mark in a byte array.
166 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
167 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
168 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
169 //! \return An array that contains a byte order mark.
// NOTE(review): the switch labels, the end of the COPY_ARRAY macro and the return statement
// are not visible in this listing; the notes here cover only the visible statements.
170 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
172 #define COPY_ARRAY(source, size) \
173 memcpy(ret.pointer(), source, size); \
176 core::array<u8> ret(4);
// The result array is constructed with 4, the length of the longest mark (UTF-32).
// Each COPY_ARRAY call copies one BOM_ENCODE_* byte sequence into it.
180 COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
// The two copies under each __BIG_ENDIAN__ test are the big-endian and little-endian
// alternatives for a native-endianness request.
183 #ifdef __BIG_ENDIAN__
184 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
186 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
190 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
193 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
196 #ifdef __BIG_ENDIAN__
197 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
199 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
203 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
206 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
209 // TODO sapier: fixed warning only,
210 // don't know if something needs to be done here
218 //! Detects if the given data stream starts with a unicode BOM.
219 //! \param data The data stream to check.
220 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
221 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
223 if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
224 if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
225 if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
226 if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
227 if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
231 } // end namespace unicode
234 //! UTF-16 string class.
235 template <typename TAlloc = irrAllocator<uchar16_t> >
240 ///------------------///
241 /// iterator classes ///
242 ///------------------///
244 //! Access an element in a unicode string, allowing one to change it.
// Proxy type: the constructor stores a pointer to the owning string (ref) and a raw
// UTF-16 index (pos). _get() decodes the character at pos, _set() overwrites it.
// NOTE(review): several original lines (braces, short statements, most operator bodies)
// are absent from this listing; added comments describe only the visible statements.
245 class _ustring16_iterator_access
248 _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
250 //! Allow the class to be interpreted as a single UTF-32 character.
251 operator uchar32_t() const
256 //! Allow one to change the character in the unicode string.
257 //! \param c The new character to use.
259 _ustring16_iterator_access& operator=(const uchar32_t c)
265 //! Increments the value by 1.
267 _ustring16_iterator_access& operator++()
273 //! Increments the value by 1, returning the old value.
274 //! \return A unicode character.
275 uchar32_t operator++(int)
277 uchar32_t old = _get();
282 //! Decrements the value by 1.
284 _ustring16_iterator_access& operator--()
290 //! Decrements the value by 1, returning the old value.
291 //! \return A unicode character.
292 uchar32_t operator--(int)
294 uchar32_t old = _get();
299 //! Adds to the value by a specified amount.
300 //! \param val The amount to add to this character.
302 _ustring16_iterator_access& operator+=(int val)
308 //! Subtracts from the value by a specified amount.
309 //! \param val The amount to subtract from this character.
311 _ustring16_iterator_access& operator-=(int val)
317 //! Multiplies the value by a specified amount.
318 //! \param val The amount to multiply this character by.
320 _ustring16_iterator_access& operator*=(int val)
326 //! Divides the value by a specified amount.
327 //! \param val The amount to divide this character by.
329 _ustring16_iterator_access& operator/=(int val)
335 //! Modulos the value by a specified amount.
336 //! \param val The amount to modulo this character by.
338 _ustring16_iterator_access& operator%=(int val)
344 //! Adds to the value by a specified amount.
345 //! \param val The amount to add to this character.
346 //! \return A unicode character.
347 uchar32_t operator+(int val) const
352 //! Subtracts from the value by a specified amount.
353 //! \param val The amount to subtract from this character.
354 //! \return A unicode character.
355 uchar32_t operator-(int val) const
360 //! Multiplies the value by a specified amount.
361 //! \param val The amount to multiply this character by.
362 //! \return A unicode character.
363 uchar32_t operator*(int val) const
368 //! Divides the value by a specified amount.
369 //! \param val The amount to divide this character by.
370 //! \return A unicode character.
371 uchar32_t operator/(int val) const
376 //! Modulos the value by a specified amount.
377 //! \param val The amount to modulo this character by.
378 //! \return A unicode character.
379 uchar32_t operator%(int val) const
385 //! Gets a uchar32_t from our current position.
// A code unit that is not a surrogate is returned as-is. Otherwise a[pos] and a[pos + 1]
// are combined with unicode::toUTF32; the pos + 1 bounds test guards that second read
// (the value produced when the test fails is not visible in this listing).
386 uchar32_t _get() const
388 const uchar16_t* a = ref->c_str();
389 if (!UTF16_IS_SURROGATE(a[pos]))
390 return static_cast<uchar32_t>(a[pos]);
393 if (pos + 1 >= ref->size_raw())
396 return unicode::toUTF32(a[pos], a[pos + 1]);
400 //! Sets a uchar32_t at our current position.
// Casts away the constness of ref so the string can be modified through the proxy.
// The condition that selects between the two-unit and the one-unit path below is not
// visible in this listing.
401 void _set(uchar32_t c)
403 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
404 const uchar16_t* a = ref2->c_str();
407 // c will be multibyte, so split it up into the high and low surrogate pairs.
// vh = high-surrogate base | (plane - 1) in bits 6 and up | top 6 bits of the low 16 bits;
// vl = low-surrogate base | low 10 bits.
408 uchar16_t x = static_cast<uchar16_t>(c);
409 uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
410 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
412 // If the previous position was a surrogate pair, just replace them. Else, insert the low pair.
413 if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
414 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
415 else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
417 ref2->replace_raw(vh, static_cast<u32>(pos));
421 // c will be a single byte.
422 uchar16_t vh = static_cast<uchar16_t>(c);
424 // If the previous position was a surrogate pair, remove the extra byte.
425 if (UTF16_IS_SURROGATE_HI(a[pos]))
426 ref2->erase_raw(static_cast<u32>(pos) + 1);
428 ref2->replace_raw(vh, static_cast<u32>(pos));
432 const ustring16<TAlloc>* ref;
// Short alias used by the iterator classes for the proxy type.
435 typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
438 //! Iterator to iterate through a UTF-16 string.
// Steps over full characters: a high surrogate and the unit after it count as one step.
// The iterator holds a pointer to the string (ref) and a raw UTF-16 index (pos).
// NOTE(review): std::iterator is deprecated since C++17; the USTRING_NO_STL variant of this
// class declares the member typedefs directly instead.
// NOTE(review): several original lines (braces, short statements) are absent from this
// listing; added comments describe only the visible statements.
439 #ifndef USTRING_NO_STL
440 class _ustring16_const_iterator : public std::iterator<
441 std::bidirectional_iterator_tag, // iterator_category
442 access, // value_type
443 ptrdiff_t, // difference_type
444 const access, // pointer
445 const access // reference
448 class _ustring16_const_iterator
452 typedef _ustring16_const_iterator _Iter;
453 typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
454 typedef const access const_pointer;
455 typedef const access const_reference;
457 #ifndef USTRING_NO_STL
458 typedef typename _Base::value_type value_type;
459 typedef typename _Base::difference_type difference_type;
460 typedef typename _Base::difference_type distance_type;
461 typedef typename _Base::pointer pointer;
462 typedef const_reference reference;
// Non-STL variant: difference_type is the unsigned u32 here, ptrdiff_t in the STL variant.
464 typedef access value_type;
465 typedef u32 difference_type;
466 typedef u32 distance_type;
467 typedef const_pointer pointer;
468 typedef const_reference reference;
472 _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
473 _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
// Positions the iterator on the p-th full character: pos starts at 0 and the loop walks
// forward from there, so construction cost grows with p.
474 _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
476 if (ref->size_raw() == 0 || p == 0)
479 // Go to the appropriate position.
481 u32 sr = ref->size_raw();
482 const uchar16_t* a = ref->c_str();
483 while (i != 0 && pos < sr)
485 if (UTF16_IS_SURROGATE_HI(a[pos]))
492 //! Test for equality.
493 bool operator==(const _Iter& iter) const
495 if (ref == iter.ref && pos == iter.pos)
500 //! Test for inequality.
501 bool operator!=(const _Iter& iter) const
503 if (ref != iter.ref || pos != iter.pos)
508 //! Switch to the next full character in the string.
// Already at the end: no movement. A high surrogate advances by two units, and the
// result is clamped so pos never exceeds size_raw().
511 if (pos == ref->size_raw()) return *this;
512 const uchar16_t* a = ref->c_str();
513 if (UTF16_IS_SURROGATE_HI(a[pos]))
514 pos += 2; // TODO: check for valid low surrogate?
516 if (pos > ref->size_raw()) pos = ref->size_raw();
520 //! Switch to the next full character in the string, returning the previous position.
521 _Iter operator++(int)
528 //! Switch to the previous full character in the string.
531 if (pos == 0) return *this;
532 const uchar16_t* a = ref->c_str();
534 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0) // low surrogate, go back one more.
539 //! Switch to the previous full character in the string, returning the previous position.
540 _Iter operator--(int)
547 //! Advance a specified number of full characters in the string.
// NOTE(review): a negative v is forwarded to operator-= as a positive value, and operator-=
// forwards every positive value back to operator+= negated. With a signed difference_type
// the two calls appear to bounce between each other without terminating - confirm whether
// the guard in operator-= was meant to be (v < 0).
549 _Iter& operator+=(const difference_type v)
551 if (v == 0) return *this;
552 if (v < 0) return operator-=(v * -1);
554 if (pos >= ref->size_raw())
557 // Go to the appropriate position.
558 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
560 u32 sr = ref->size_raw();
561 const uchar16_t* a = ref->c_str();
562 while (i != 0 && pos < sr)
564 if (UTF16_IS_SURROGATE_HI(a[pos]))
575 //! Go back a specified number of full characters in the string.
// NOTE(review): as written, any v > 0 (the usual "go back v characters" request) is
// redirected to operator+= with -v; see the note on operator+= about the resulting cycle.
577 _Iter& operator-=(const difference_type v)
579 if (v == 0) return *this;
580 if (v > 0) return operator+=(v * -1);
585 // Go to the appropriate position.
586 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
588 const uchar16_t* a = ref->c_str();
589 while (i != 0 && pos != 0)
592 if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
600 //! Return a new iterator that is a variable number of full characters forward from the current position.
601 _Iter operator+(const difference_type v) const
608 //! Return a new iterator that is a variable number of full characters backward from the current position.
609 _Iter operator-(const difference_type v) const
616 //! Returns the distance between two iterators.
617 difference_type operator-(const _Iter& iter) const
619 // Make sure we reference the same object!
621 return difference_type();
646 //! Accesses the full character at the iterator's position.
// When pos is at or past the end, a proxy for a position derived from size_raw() is
// returned instead of one for pos.
// NOTE(review): a[p] with p == size_raw() inspects the unit at index size_raw(); c_str() is
// documented in this file as NUL terminated, so this looks like a test of the terminator -
// confirm against the lines missing from this listing.
647 const_reference operator*() const
649 if (pos >= ref->size_raw())
651 const uchar16_t* a = ref->c_str();
652 u32 p = ref->size_raw();
653 if (UTF16_IS_SURROGATE_LO(a[p]))
655 reference ret(ref, p);
658 const_reference ret(ref, pos);
662 //! Accesses the full character at the iterator's position.
663 reference operator*()
665 if (pos >= ref->size_raw())
667 const uchar16_t* a = ref->c_str();
668 u32 p = ref->size_raw();
669 if (UTF16_IS_SURROGATE_LO(a[p]))
671 reference ret(ref, p);
674 reference ret(ref, pos);
678 //! Accesses the full character at the iterator's position.
679 const_pointer operator->() const
684 //! Accesses the full character at the iterator's position.
690 //! Is the iterator at the start of the string?
696 //! Is the iterator at the end of the string?
// If the unit at pos is a surrogate, the iterator also counts as "at end" when no unit
// follows it inside the raw size.
699 const uchar16_t* a = ref->c_str();
700 if (UTF16_IS_SURROGATE(a[pos]))
701 return (pos + 1) >= ref->size_raw();
702 else return pos >= ref->size_raw();
705 //! Moves the iterator to the start of the string.
711 //! Moves the iterator to the end of the string.
714 pos = ref->size_raw();
717 //! Returns the iterator's position.
718 //! \return The iterator's position.
725 const ustring16<TAlloc>* ref;
729 //! Iterator to iterate through a UTF-16 string.
// Derives from _ustring16_const_iterator and redeclares pointer / reference as the
// non-const access proxy, so dereferencing yields a proxy that can be assigned to.
// NOTE(review): several original lines are absent from this listing; added comments
// describe only the visible statements.
730 class _ustring16_iterator : public _ustring16_const_iterator
733 typedef _ustring16_iterator _Iter;
734 typedef _ustring16_const_iterator _Base;
735 typedef typename _Base::const_pointer const_pointer;
736 typedef typename _Base::const_reference const_reference;
739 typedef typename _Base::value_type value_type;
740 typedef typename _Base::difference_type difference_type;
741 typedef typename _Base::distance_type distance_type;
742 typedef access pointer;
743 typedef access reference;
// All three constructors forward to the matching base-class constructor.
749 _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
750 _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
751 _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
753 //! Accesses the full character at the iterator's position.
// Same shape as the base-class operator*: at or past the end, a proxy for a position
// derived from size_raw() is built instead of one for pos.
754 reference operator*() const
756 if (pos >= ref->size_raw())
758 const uchar16_t* a = ref->c_str();
759 u32 p = ref->size_raw();
760 if (UTF16_IS_SURROGATE_LO(a[p]))
762 reference ret(ref, p);
765 reference ret(ref, pos);
769 //! Accesses the full character at the iterator's position.
770 reference operator*()
772 if (pos >= ref->size_raw())
774 const uchar16_t* a = ref->c_str();
775 u32 p = ref->size_raw();
776 if (UTF16_IS_SURROGATE_LO(a[p]))
778 reference ret(ref, p);
781 reference ret(ref, pos);
785 //! Accesses the full character at the iterator's position.
786 pointer operator->() const
791 //! Accesses the full character at the iterator's position.
// Public names for the two iterator classes.
798 typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
799 typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
801 ///----------------------///
802 /// end iterator classes ///
803 ///----------------------///
805 //! Default constructor
// Starts with used(0) and allocated(1) and allocates a single element.
// In every constructor of this group the encoding is set to UTF-16 BE or UTF-16 LE; the
// condition selecting between the two assignments is not visible in this listing.
807 : array(0), allocated(1), used(0)
810 encoding = unicode::EUTFE_UTF16_BE;
812 encoding = unicode::EUTFE_UTF16_LE;
814 array = allocator.allocate(1); // new u16[1];
// Copy constructor. Members start zeroed; the statement that copies from other is not
// visible in this listing.
820 ustring16(const ustring16<TAlloc>& other)
821 : array(0), allocated(0), used(0)
824 encoding = unicode::EUTFE_UTF16_BE;
826 encoding = unicode::EUTFE_UTF16_LE;
832 //! Constructor from other string types
833 template <class B, class A>
834 ustring16(const string<B, A>& other)
835 : array(0), allocated(0), used(0)
838 encoding = unicode::EUTFE_UTF16_BE;
840 encoding = unicode::EUTFE_UTF16_LE;
846 #ifndef USTRING_NO_STL
847 //! Constructor from std::string
// Delegates to the pointer assignment operator via other.c_str(), so the content is
// interpreted according to the size of the character type B.
848 template <class B, class A, typename Alloc>
849 ustring16(const std::basic_string<B, A, Alloc>& other)
850 : array(0), allocated(0), used(0)
853 encoding = unicode::EUTFE_UTF16_BE;
855 encoding = unicode::EUTFE_UTF16_LE;
857 *this = other.c_str();
861 //! Constructor from iterator.
// Reserves std::distance(first, last) elements up front, then appends each element of
// the range converted to uchar32_t.
862 template <typename Itr>
863 ustring16(Itr first, Itr last)
864 : array(0), allocated(0), used(0)
867 encoding = unicode::EUTFE_UTF16_BE;
869 encoding = unicode::EUTFE_UTF16_LE;
871 reserve(std::distance(first, last));
874 for (; first != last; ++first)
875 append((uchar32_t)*first);
// Pointer-based constructors. Each starts from zeroed members and sets the encoding to
// UTF-16 BE or LE (selecting condition not visible in this listing). For most of them the
// statement that loads the data is also not visible.
880 #ifndef USTRING_CPP0X_NEWLITERALS
881 //! Constructor for copying a character string from a pointer.
// Loads strlen(c) bytes through loadDataStream; c must therefore be a non-null,
// NUL-terminated string.
882 ustring16(const char* const c)
883 : array(0), allocated(0), used(0)
886 encoding = unicode::EUTFE_UTF16_BE;
888 encoding = unicode::EUTFE_UTF16_LE;
891 loadDataStream(c, strlen(c));
892 //append((uchar8_t*)c);
896 //! Constructor for copying a character string from a pointer with a given length.
897 ustring16(const char* const c, u32 length)
898 : array(0), allocated(0), used(0)
901 encoding = unicode::EUTFE_UTF16_BE;
903 encoding = unicode::EUTFE_UTF16_LE;
906 loadDataStream(c, length);
911 //! Constructor for copying a UTF-8 string from a pointer.
912 ustring16(const uchar8_t* const c)
913 : array(0), allocated(0), used(0)
916 encoding = unicode::EUTFE_UTF16_BE;
918 encoding = unicode::EUTFE_UTF16_LE;
925 //! Constructor for copying a UTF-8 string from a single char.
// The char is converted to uchar32_t and appended as one character.
926 ustring16(const char c)
927 : array(0), allocated(0), used(0)
930 encoding = unicode::EUTFE_UTF16_BE;
932 encoding = unicode::EUTFE_UTF16_LE;
935 append((uchar32_t)c);
939 //! Constructor for copying a UTF-8 string from a pointer with a given length.
940 ustring16(const uchar8_t* const c, u32 length)
941 : array(0), allocated(0), used(0)
944 encoding = unicode::EUTFE_UTF16_BE;
946 encoding = unicode::EUTFE_UTF16_LE;
953 //! Constructor for copying a UTF-16 string from a pointer.
954 ustring16(const uchar16_t* const c)
955 : array(0), allocated(0), used(0)
958 encoding = unicode::EUTFE_UTF16_BE;
960 encoding = unicode::EUTFE_UTF16_LE;
967 //! Constructor for copying a UTF-16 string from a pointer with a given length
968 ustring16(const uchar16_t* const c, u32 length)
969 : array(0), allocated(0), used(0)
972 encoding = unicode::EUTFE_UTF16_BE;
974 encoding = unicode::EUTFE_UTF16_LE;
981 //! Constructor for copying a UTF-32 string from a pointer.
982 ustring16(const uchar32_t* const c)
983 : array(0), allocated(0), used(0)
986 encoding = unicode::EUTFE_UTF16_BE;
988 encoding = unicode::EUTFE_UTF16_LE;
995 //! Constructor for copying a UTF-32 from a pointer with a given length.
996 ustring16(const uchar32_t* const c, u32 length)
997 : array(0), allocated(0), used(0)
1000 encoding = unicode::EUTFE_UTF16_BE;
1002 encoding = unicode::EUTFE_UTF16_LE;
1009 //! Constructor for copying a wchar_t string from a pointer.
// wchar_t has a platform-dependent width, so the pointer is reinterpreted as UTF-32,
// UTF-16 or UTF-8 data according to sizeof(wchar_t) and passed to the matching append
// overload. The sizeof tests are compile-time constants; only one branch is ever taken.
1010 ustring16(const wchar_t* const c)
1011 : array(0), allocated(0), used(0)
1014 encoding = unicode::EUTFE_UTF16_BE;
1016 encoding = unicode::EUTFE_UTF16_LE;
1019 if (sizeof(wchar_t) == 4)
1020 append(reinterpret_cast<const uchar32_t* const>(c));
1021 else if (sizeof(wchar_t) == 2)
1022 append(reinterpret_cast<const uchar16_t* const>(c));
1023 else if (sizeof(wchar_t) == 1)
1024 append(reinterpret_cast<const uchar8_t* const>(c));
1028 //! Constructor for copying a wchar_t string from a pointer with a given length.
// Same dispatch as the constructor without a length; length is forwarded unchanged to
// the selected append overload.
1029 ustring16(const wchar_t* const c, u32 length)
1030 : array(0), allocated(0), used(0)
1033 encoding = unicode::EUTFE_UTF16_BE;
1035 encoding = unicode::EUTFE_UTF16_LE;
1038 if (sizeof(wchar_t) == 4)
1039 append(reinterpret_cast<const uchar32_t* const>(c), length);
1040 else if (sizeof(wchar_t) == 2)
1041 append(reinterpret_cast<const uchar16_t* const>(c), length);
1042 else if (sizeof(wchar_t) == 1)
1043 append(reinterpret_cast<const uchar8_t* const>(c), length);
1047 #ifdef USTRING_CPP0X
1048 //! Constructor for moving a ustring16
// Takes over other's buffer pointer, encoding, allocated and used values in the
// initializer list, then resets other's allocated to 0. The resets of other's remaining
// members are presumably on lines not visible in this listing.
1049 ustring16(ustring16<TAlloc>&& other)
1050 : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1052 //std::cout << "MOVE constructor" << std::endl;
1054 other.allocated = 0;
// Releases the buffer through the allocator. NOTE(review): the enclosing signature is not
// visible; from its position this is presumably the destructor body - confirm.
1063 allocator.deallocate(array); // delete [] array;
1067 //! Assignment operator
// Adopts other's raw size. The buffer is replaced only when it cannot hold used + 1
// units. The copy loop runs for i <= used, i.e. it copies used + 1 units - presumably the
// content plus the terminating NUL.
// NOTE(review): any self-assignment guard and the element copy statement are on lines not
// visible in this listing.
1068 ustring16& operator=(const ustring16<TAlloc>& other)
1073 used = other.size_raw();
1074 if (used >= allocated)
1076 allocator.deallocate(array); // delete [] array;
1077 allocated = used + 1;
1078 array = allocator.allocate(used + 1); //new u16[used];
1081 const uchar16_t* p = other.c_str();
1082 for (u32 i=0; i<=used; ++i, ++p)
1087 // Validate our new UTF-16 string.
1094 #ifdef USTRING_CPP0X
1095 //! Move assignment operator
// Releases the current buffer, then takes over other's array, allocated and encoding.
// The transfer of used and the reset of other are presumably on lines not visible here.
1096 ustring16& operator=(ustring16<TAlloc>&& other)
1100 //std::cout << "MOVE operator=" << std::endl;
1101 allocator.deallocate(array);
1103 array = other.array;
1104 allocated = other.allocated;
1105 encoding = other.encoding;
1115 //! Assignment operator for other string types
// Forwards other.c_str() to the pointer assignment overloads.
1116 template <class B, class A>
1117 ustring16<TAlloc>& operator=(const string<B, A>& other)
1119 *this = other.c_str();
1124 //! Assignment operator for UTF-8 strings
// The three encoding-specific overloads share one visible shape: a one-element buffer is
// allocated, a null c returns the string as it stands at that point, and otherwise the
// data is appended. NOTE(review): the guard around the allocation and the append call
// itself are on lines not visible in this listing.
1125 ustring16<TAlloc>& operator=(const uchar8_t* const c)
1129 array = allocator.allocate(1); //new u16[1];
1134 if (!c) return *this;
1136 //! Append our string now.
1142 //! Assignment operator for UTF-16 strings
1143 ustring16<TAlloc>& operator=(const uchar16_t* const c)
1147 array = allocator.allocate(1); //new u16[1];
1152 if (!c) return *this;
1154 //! Append our string now.
1160 //! Assignment operator for UTF-32 strings
1161 ustring16<TAlloc>& operator=(const uchar32_t* const c)
1165 array = allocator.allocate(1); //new u16[1];
1170 if (!c) return *this;
1172 //! Append our string now.
1178 //! Assignment operator for wchar_t strings.
1179 /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1180 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1181 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
// Dispatches on sizeof(wchar_t) to the UTF-32, UTF-16 or UTF-8 overload.
1182 ustring16<TAlloc>& operator=(const wchar_t* const c)
1184 if (sizeof(wchar_t) == 4)
1185 *this = reinterpret_cast<const uchar32_t* const>(c);
1186 else if (sizeof(wchar_t) == 2)
1187 *this = reinterpret_cast<const uchar16_t* const>(c);
1188 else if (sizeof(wchar_t) == 1)
1189 *this = reinterpret_cast<const uchar8_t* const>(c);
1195 //! Assignment operator for other strings.
1196 /** Note that this assumes that a correct unicode string is stored in the string. **/
// Same dispatch keyed on sizeof(B). The template header and the first branch condition
// (presumably sizeof(B) == 4) are on lines not visible in this listing.
1198 ustring16<TAlloc>& operator=(const B* const c)
1201 *this = reinterpret_cast<const uchar32_t* const>(c);
1202 else if (sizeof(B) == 2)
1203 *this = reinterpret_cast<const uchar16_t* const>(c);
1204 else if (sizeof(B) == 1)
1205 *this = reinterpret_cast<const uchar8_t* const>(c);
1211 //! Direct access operator
1212 access operator [](const u32 index)
1214 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1215 iterator iter(*this, index);
1216 return iter.operator*();
1220 //! Direct access operator
1221 const access operator [](const u32 index) const
1223 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1224 const_iterator iter(*this, index);
1225 return iter.operator*();
1229 //! Equality operator
// Compares raw UTF-16 code units until either string reaches a NUL; the strings are
// equal when both reach their NUL at the same index.
// NOTE(review): any null check on str and the mismatch return are on lines not visible here.
1230 bool operator ==(const uchar16_t* const str) const
1236 for(i=0; array[i] && str[i]; ++i)
1237 if (array[i] != str[i])
1240 return !array[i] && !str[i];
1244 //! Equality operator
// Unit-wise comparison over the common prefix, then equal only if the raw lengths match.
1245 bool operator ==(const ustring16<TAlloc>& other) const
1247 for(u32 i=0; array[i] && other.array[i]; ++i)
1248 if (array[i] != other.array[i])
1251 return used == other.used;
1255 //! Is smaller comparator
// Orders by raw UTF-16 code unit values, not by code point; when one string is a prefix
// of the other the shorter one is smaller.
1256 bool operator <(const ustring16<TAlloc>& other) const
1258 for(u32 i=0; array[i] && other.array[i]; ++i)
1260 s32 diff = array[i] - other.array[i];
1265 return used < other.used;
1269 //! Inequality operator
1270 bool operator !=(const uchar16_t* const str) const
1272 return !(*this == str);
1276 //! Inequality operator
1277 bool operator !=(const ustring16<TAlloc>& other) const
1279 return !(*this == other);
1283 //! Returns the length of a ustring16 in full characters.
1284 //! \return Length of a ustring16 in full characters.
// Counting starts from an iterator placed at the first character; the counting loop
// itself is on lines not visible in this listing.
1287 const_iterator i(*this, 0);
1298 //! Informs if the ustring is empty or not.
1299 //! \return True if the ustring is empty, false if not.
// Empty means a raw size of zero code units.
1302 return (size_raw() == 0);
1306 //! Returns a pointer to the raw UTF-16 string data.
1307 //! \return pointer to C-style NUL terminated array of UTF-16 code units.
1308 const uchar16_t* c_str() const
1314 //! Compares the first n characters of this string with another.
1315 //! \param other Other string to compare to.
1316 //! \param n Number of characters to compare.
1317 //! \return True if the n first characters of both strings are equal.
// NOTE(review): the loop indexes raw UTF-16 code units, so n is effectively a count of
// code units here, not of full characters.
1318 bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1321 const uchar16_t* oa = other.c_str();
1322 for(i=0; array[i] && oa[i] && i < n; ++i)
1323 if (array[i] != oa[i])
1326 // if one (or both) of the strings was shorter, then they
1327 // are only equal if they have the same length
1328 return (i == n) || (used == other.used);
1332 //! Compares the first n characters of this string with another.
1333 //! \param str Other string to compare to.
1334 //! \param n Number of characters to compare.
1335 //! \return True if the n first characters of both strings are equal.
// Same comparison against a raw pointer; equal length is detected by both strings
// reaching their NUL at the same index.
1336 bool equalsn(const uchar16_t* const str, u32 n) const
1341 for(i=0; array[i] && str[i] && i < n; ++i)
1342 if (array[i] != str[i])
1345 // if one (or both) of the strings was shorter, then they
1346 // are only equal if they have the same length
1347 return (i == n) || (array[i] == 0 && str[i] == 0);
1351 //! Appends a character to this ustring16
1352 //! \param character The character to append.
1353 //! \return A reference to our current string.
1354 ustring16<TAlloc>& append(uchar32_t character)
// Make room for up to two more code units, the most one character can need.
1356 if (used + 2 >= allocated)
1357 reallocate(used + 2);
// Characters above 0xFFFF do not fit in one UTF-16 unit and are split into a pair.
1359 if (character > 0xFFFF)
1363 // character will be multibyte, so split it up into a surrogate pair.
1364 uchar16_t x = static_cast<uchar16_t>(character);
1365 uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1366 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
// Single-unit case: the character is stored in the last used slot (used is presumably
// incremented on a line not visible in this listing).
1373 array[used-1] = character;
1381 //! Appends a UTF-8 string to this ustring16
1382 //! \param other The UTF-8 string to append.
1383 //! \param length The length of the string to append.
1384 //! \return A reference to our current string.
// Visible steps: probe for a UTF-8 BOM and skip it, measure the input, grow the buffer,
// then decode byte by byte. Malformed input is not rejected; the visible error paths
// each write UTF_REPLACEMENT_CHARACTER.
// NOTE(review): several original lines (braces, counters, the single-byte range test) are
// absent from this listing; added comments describe only the visible statements.
1385 ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1390 // Determine if the string is long enough for a BOM.
1392 const uchar8_t* p = other;
1396 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1399 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1400 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1402 if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1403 c_bom = unicode::EUTFE_UTF8;
1406 // If a BOM was found, don't include it in the string.
1407 const uchar8_t* c2 = other;
1408 if (c_bom != unicode::EUTFE_NONE)
1410 c2 = other + unicode::BOM_UTF8_LEN;
1411 length -= unicode::BOM_UTF8_LEN;
1414 // Calculate the size of the string to read in.
1420 } while(*p++ && len < length);
1424 // If we need to grow the array, do it now.
// Growth request is used + 2 * len units, twice the number of input bytes counted above.
1425 if (used + len >= allocated)
1426 reallocate(used + (len * 2));
1429 // Convert UTF-8 to UTF-16.
// Lead bytes are classified by their high bits: 10xxxxxx is a stray continuation byte,
// 0xC0 / 0xC1 are overlong encodings, 11110xxx starts a four-byte sequence, 1110xxxx a
// three-byte sequence and 110xxxxx a two-byte sequence.
1431 for (u32 l = 0; l<len;)
1434 if (((c2[l] >> 6) & 0x03) == 0x02)
1435 { // Invalid continuation byte.
1436 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1439 else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1440 { // Invalid byte - overlong encoding.
1441 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1444 else if ((c2[l] & 0xF8) == 0xF0)
1445 { // 4 bytes UTF-8, 2 bytes UTF-16.
1446 // Check for a full string.
1449 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
// Each following byte must be a continuation byte (10xxxxxx); the first failure clears
// valid, which also fails every later test.
1457 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1458 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1459 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1462 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
// Reassemble the 21 payload bits into three bytes (b1 = bits 16-20, b2 = bits 8-15,
// b3 = bits 0-7) and combine them into the code point v.
1468 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1469 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1470 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1471 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1473 // Split v up into a surrogate pair.
1474 uchar16_t x = static_cast<uchar16_t>(v);
1475 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1476 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1481 ++used; // Using two shorts this time, so increase used by 1.
1483 else if ((c2[l] & 0xF0) == 0xE0)
1484 { // 3 bytes UTF-8, 1 byte UTF-16.
1485 // Check for a full string.
1488 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1496 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1497 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1500 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
// 16 payload bits: b1 is the high byte, b2 the low byte of the resulting code unit.
1506 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1507 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1508 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1512 else if ((c2[l] & 0xE0) == 0xC0)
1513 { // 2 bytes UTF-8, 1 byte UTF-16.
1514 // Check for a full string.
1517 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1523 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1525 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
// 11 payload bits: b1 holds the top 3 bits, b2 the low 8 bits.
1531 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1532 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1533 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1538 { // 1 byte UTF-8, 1 byte UTF-16.
1541 { // Values above 0xF4 are restricted and aren't used. By now, anything above 0x7F is invalid.
1542 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1544 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1550 // Validate our new UTF-16 string.
1557 //! Appends a UTF-16 string to this ustring16
1558 //! \param other The UTF-16 string to append.
1559 //! \param length The length of the string to append.
1560 //! \return A reference to our current string.
// Visible steps: probe for a UTF-16 BOM to learn the source byte order, skip it, measure
// the input, grow the buffer, then copy the code units, swapping bytes when the source
// byte order differs from this string's.
// NOTE(review): several original lines (braces, counters, the definition of start) are
// absent from this listing; added comments describe only the visible statements.
1561 ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1566 // Determine if the string is long enough for a BOM.
1568 const uchar16_t* p = other;
1572 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1574 // Check for the BOM to determine the string's endianness.
1575 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1576 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1577 c_end = unicode::EUTFEE_LITTLE;
1578 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1579 c_end = unicode::EUTFEE_BIG;
1581 // If a BOM was found, don't include it in the string.
1582 const uchar16_t* c2 = other;
1583 if (c_end != unicode::EUTFEE_NATIVE)
1585 c2 = other + unicode::BOM_UTF16_LEN;
1586 length -= unicode::BOM_UTF16_LEN;
1589 // Calculate the size of the string to read in.
1595 } while(*p++ && len < length);
1599 // If we need to grow the size of the array, do it now.
1600 if (used + len >= allocated)
1601 reallocate(used + (len * 2));
1605 // Copy the string now.
// NOTE(review): the source c2 is indexed with the same l as the destination array. If
// start is the previous value of used (its definition is not visible here), appending to
// a non-empty string would read c2 at an offset of start units instead of from its
// beginning - confirm whether c2[l - start] was intended.
1606 unicode::EUTF_ENDIAN m_end = getEndianness();
1607 for (u32 l = start; l < start + len; ++l)
1609 array[l] = (uchar16_t)c2[l];
1610 if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1611 array[l] = unicode::swapEndian16(array[l]);
1616 // Validate our new UTF-16 string.
1622 //! Appends a UTF-32 string to this ustring16
1623 //! \param other The UTF-32 string to append.
1624 //! \param length The length of the string to append.
1625 //! \return A reference to our current string.
1626 ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1631 // Check for the BOM to determine the string's endianness.
1632 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1633 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1634 c_end = unicode::EUTFEE_LITTLE;
1635 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1636 c_end = unicode::EUTFEE_BIG;
1638 // If a BOM was found, don't include it in the string.
1639 const uchar32_t* c2 = other;
1640 if (c_end != unicode::EUTFEE_NATIVE)
1642 c2 = other + unicode::BOM_UTF32_LEN;
1643 length -= unicode::BOM_UTF32_LEN;
1646 // Calculate the size of the string to read in.
1648 const uchar32_t* p = c2;
1652 } while(*p++ && len < length);
1656 // If we need to grow the size of the array, do it now.
1657 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1658 if (used + (len * 2) >= allocated)
1659 reallocate(used + ((len * 2) * 2));
1662 // Convert UTF-32 to UTF-16.
1663 unicode::EUTF_ENDIAN m_end = getEndianness();
1665 for (u32 l = 0; l<len; ++l)
1669 uchar32_t ch = c2[l];
1670 if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1671 ch = unicode::swapEndian32(ch);
1675 // Split ch up into a surrogate pair as it is over 16 bits long.
1676 uchar16_t x = static_cast<uchar16_t>(ch);
1677 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1678 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1681 ++used; // Using two shorts, so increased used again.
1683 else if (ch >= 0xD800 && ch <= 0xDFFF)
1685 // Between possible UTF-16 surrogates (invalid!)
1686 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1688 else array[pos++] = static_cast<uchar16_t>(ch);
1692 // Validate our new UTF-16 string.
1699 //! Appends a ustring16 to this ustring16
1700 //! \param other The string to append to this one.
1701 //! \return A reference to our current string.
1702 ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1704 const uchar16_t* oa = other.c_str();
1706 u32 len = other.size_raw();
1708 if (used + len >= allocated)
1709 reallocate(used + len);
1711 for (u32 l=0; l<len; ++l)
1712 array[used+l] = oa[l];
1721 //! Appends a certain amount of characters of a ustring16 to this ustring16.
1722 //! \param other The string to append to this one.
1723 //! \param length How many characters of the other string to add to this one.
1724 //! \return A reference to our current string.
1725 ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1727 if (other.size() == 0)
1730 if (other.size() < length)
1736 if (used + length * 2 >= allocated)
1737 reallocate(used + length * 2);
1739 const_iterator iter(other, 0);
1741 while (!iter.atEnd() && l)
1743 uchar32_t c = *iter;
1753 //! Reserves some memory.
1754 //! \param count The amount of characters to reserve.
1755 void reserve(u32 count)
1757 if (count < allocated)
1764 //! Finds first occurrence of character.
1765 //! \param c The character to search for.
1766 //! \return Position where the character has been found, or -1 if not found.
1767 s32 findFirst(uchar32_t c) const
1769 const_iterator i(*this, 0);
1784 //! Finds first occurrence of a character of a list.
1785 //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1786 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1787 //! \return Position where one of the characters has been found, or -1 if not found.
1788 s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1793 const_iterator i(*this, 0);
1799 for (u32 j=0; j<count; ++j)
1810 //! Finds first position of a character not in a given list.
1811 //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1812 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1813 //! \return Position where the character has been found, or -1 if not found.
1814 s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1819 const_iterator i(*this, 0);
1826 for (j=0; j<count; ++j)
1839 //! Finds last position of a character not in a given list.
1840 //! \param c A list of characters to NOT find. For example if the method should find the last occurrence of a character not 'a' or 'b', this parameter should be "ab".
1841 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1842 //! \return Position where the character has been found, or -1 if not found.
1843 s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1848 const_iterator i(end());
1851 s32 pos = size() - 1;
1852 while (!i.atStart())
1856 for (j=0; j<count; ++j)
1869 //! Finds next occurrence of character.
1870 //! \param c The character to search for.
1871 //! \param startPos The position in the string to start searching.
1872 //! \return Position where the character has been found, or -1 if not found.
1873 s32 findNext(uchar32_t c, u32 startPos) const
1875 const_iterator i(*this, startPos);
1891 //! Finds last occurrence of character.
1892 //! \param c The character to search for.
1893 //! \param start The start position of the reverse search (default = -1, which starts from the end).
1894 //! \return Position where the character has been found, or -1 if not found.
1895 s32 findLast(uchar32_t c, s32 start = -1) const
1898 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1900 const_iterator i(*this, start);
1902 while (!i.atStart())
1914 //! Finds last occurrence of a character in a list.
1915 //! \param c A list of characters to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1916 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1917 //! \return Position where one of the characters has been found, or -1 if not found.
1918 s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1923 const_iterator i(end());
1927 while (!i.atStart())
1930 for (u32 j=0; j<count; ++j)
1941 //! Finds another ustring16 in this ustring16.
1942 //! \param str The string to find.
1943 //! \param start The start position of the search.
1944 //! \return Position where the ustring16 has been found, or -1 if not found.
1945 s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1947 u32 my_size = size();
1948 u32 their_size = str.size();
1950 if (their_size == 0 || my_size - start < their_size)
1953 const_iterator i(*this, start);
1958 const_iterator i2(i);
1959 const_iterator j(str, 0);
1960 uchar32_t t1 = (uchar32_t)*i2;
1961 uchar32_t t2 = (uchar32_t)*j;
1968 t1 = (uchar32_t)*i2;
1979 //! Finds another ustring16 in this ustring16, comparing raw UTF-16 code units.
1980 //! \param str The string to find.
1981 //! \param start The start position of the search, as a raw UTF-16 index.
1982 //! \return Position where the string has been found, or -1 if not found.
1983 s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1985 const uchar16_t* data = str.c_str();
1996 for (u32 i=start; i<=used-len; ++i)
2000 while(data[j] && array[i+j] == data[j])
2012 //! Returns a substring.
2013 //! \param begin Start of substring.
2014 //! \param length Length of substring.
2015 //! \return The requested substring as a new ustring16.
2016 ustring16<TAlloc> subString(u32 begin, s32 length) const
2019 // if start after ustring16
2020 // or no proper substring length
2021 if ((length <= 0) || (begin>=len))
2022 return ustring16<TAlloc>("");
2023 // clamp length to maximal value
2024 if ((length+begin) > len)
2027 ustring16<TAlloc> o;
2028 o.reserve((length+1) * 2);
2030 const_iterator i(*this, begin);
2031 while (!i.atEnd() && length)
2042 //! Appends a character to this ustring16.
2043 //! \param c Character to append.
2044 //! \return A reference to our current string.
2045 ustring16<TAlloc>& operator += (char c)
2047 append((uchar32_t)c);
2052 //! Appends a character to this ustring16.
2053 //! \param c Character to append.
2054 //! \return A reference to our current string.
2055 ustring16<TAlloc>& operator += (uchar32_t c)
2062 //! Appends a number to this ustring16.
2063 //! \param c Number to append.
2064 //! \return A reference to our current string.
2065 ustring16<TAlloc>& operator += (short c)
2067 append(core::stringc(c));
2072 //! Appends a number to this ustring16.
2073 //! \param c Number to append.
2074 //! \return A reference to our current string.
2075 ustring16<TAlloc>& operator += (unsigned short c)
2077 append(core::stringc(c));
2082 #ifdef USTRING_CPP0X_NEWLITERALS
2083 //! Appends a number to this ustring16.
2084 //! \param c Number to append.
2085 //! \return A reference to our current string.
2086 ustring16<TAlloc>& operator += (int c)
2088 append(core::stringc(c));
2093 //! Appends a number to this ustring16.
2094 //! \param c Number to append.
2095 //! \return A reference to our current string.
2096 ustring16<TAlloc>& operator += (unsigned int c)
2098 append(core::stringc(c));
2104 //! Appends a number to this ustring16.
2105 //! \param c Number to append.
2106 //! \return A reference to our current string.
2107 ustring16<TAlloc>& operator += (long c)
2109 append(core::stringc(c));
2114 //! Appends a number to this ustring16.
2115 //! \param c Number to append.
2116 //! \return A reference to our current string.
2117 ustring16<TAlloc>& operator += (unsigned long c)
2119 append(core::stringc(c));
2124 //! Appends a number to this ustring16.
2125 //! \param c Number to append.
2126 //! \return A reference to our current string.
2127 ustring16<TAlloc>& operator += (double c)
2129 append(core::stringc(c));
2134 //! Appends a UTF-16 string to this ustring16.
2135 //! \param c UTF-16 string to append.
2136 //! \return A reference to our current string.
2137 ustring16<TAlloc>& operator += (const uchar16_t* const c)
2144 //! Appends a ustring16 to this ustring16.
2145 //! \param other ustring16 to append.
2146 //! \return A reference to our current string.
2147 ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2154 //! Replaces all characters of a given type with another one.
2155 //! \param toReplace Character to replace.
2156 //! \param replaceWith Character replacing the old one.
2157 //! \return A reference to our current string.
2158 ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2160 iterator i(*this, 0);
2163 typename ustring16<TAlloc>::access a = *i;
2164 if ((uchar32_t)a == toReplace)
2172 //! Replaces all instances of a string with another one.
2173 //! \param toReplace The string to replace.
2174 //! \param replaceWith The string replacing the old one.
2175 //! \return A reference to our current string.
2176 ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2178 if (toReplace.size() == 0)
2181 const uchar16_t* other = toReplace.c_str();
2182 const uchar16_t* replace = replaceWith.c_str();
2183 const u32 other_size = toReplace.size_raw();
2184 const u32 replace_size = replaceWith.size_raw();
2186 // Determine the delta. The algorithm will change depending on the delta.
2187 s32 delta = replace_size - other_size;
2189 // A character for character replace. The string will not shrink or grow.
2193 while ((pos = find_raw(other, pos)) != -1)
2195 for (u32 i = 0; i < replace_size; ++i)
2196 array[pos + i] = replace[i];
2202 // We are going to be removing some characters. The string will shrink.
2206 for (u32 pos = 0; pos <= used; ++i, ++pos)
2208 // Is this potentially a match?
2209 if (array[pos] == *other)
2211 // Check to see if we have a match.
2213 for (j = 0; j < other_size; ++j)
2215 if (array[pos + j] != other[j])
2219 // If we have a match, replace characters.
2220 if (j == other_size)
2222 for (j = 0; j < replace_size; ++j)
2223 array[i + j] = replace[j];
2224 i += replace_size - 1;
2225 pos += other_size - 1;
2230 // No match found, just copy characters.
2231 array[i - 1] = array[pos];
2239 // We are going to be adding characters, so the string size will increase.
2240 // Count the number of times toReplace exists in the string so we can allocate the new size.
2243 while ((pos = find_raw(other, pos)) != -1)
2249 // Re-allocate the string now, if needed.
2250 u32 len = delta * find_count;
2251 if (used + len >= allocated)
2252 reallocate(used + len);
2256 while ((pos = find_raw(other, pos)) != -1)
2258 uchar16_t* start = array + pos + other_size - 1;
2259 uchar16_t* ptr = array + used;
2260 uchar16_t* end = array + used + delta;
2262 // Shift characters to make room for the string.
2263 while (ptr != start)
2270 // Add the new string now.
2271 for (u32 i = 0; i < replace_size; ++i)
2272 array[pos + i] = replace[i];
2274 pos += replace_size;
2278 // Terminate the string and return ourself.
2284 //! Removes characters from a ustring16.
2285 //! \param c The character to remove.
2286 //! \return A reference to our current string.
2287 ustring16<TAlloc>& remove(uchar32_t c)
2291 u32 len = (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
2292 for (u32 i=0; i<=used; ++i)
2295 if (!UTF16_IS_SURROGATE_HI(array[i]))
2297 else if (i + 1 <= used)
2299 // Convert the surrogate pair into a single UTF-32 character.
2300 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2302 u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2310 array[pos++] = array[i];
2312 array[pos++] = array[++i];
2320 //! Removes a ustring16 from the ustring16.
2321 //! \param toRemove The string to remove.
2322 //! \return A reference to our current string.
2323 ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2325 u32 size = toRemove.size_raw();
2326 if (size == 0) return *this;
2328 const uchar16_t* tra = toRemove.c_str();
2331 for (u32 i=0; i<=used; ++i)
2336 if (array[i + j] != tra[j])
2347 array[pos++] = array[i];
2355 //! Removes characters from the ustring16.
2356 //! \param characters The characters to remove.
2357 //! \return A reference to our current string.
2358 ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2360 if (characters.size_raw() == 0)
2365 const_iterator iter(characters);
2366 for (u32 i=0; i<=used; ++i)
2369 if (!UTF16_IS_SURROGATE_HI(array[i]))
2371 else if (i + 1 <= used)
2373 // Convert the surrogate pair into a single UTF-32 character.
2374 uc32 = unicode::toUTF32(array[i], array[i+1]);
2376 u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2380 while (!iter.atEnd())
2382 uchar32_t c = *iter;
2385 found += (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
2394 array[pos++] = array[i];
2396 array[pos++] = array[++i];
2404 //! Trims the ustring16.
2405 //! Removes the specified characters (by default, Latin-1 whitespace) from the beginning and the end of the ustring16.
2406 //! \param whitespace The characters that are to be considered as whitespace.
2407 //! \return A reference to our current string.
2408 ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2410 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2412 // find start and end of the substring without the specified characters
2413 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2417 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2419 return (*this = subString(begin, (end +1) - begin));
2423 //! Erases a character from the ustring16.
2424 //! May be slow, because all elements following after the erased element have to be copied.
2425 //! \param index Index of element to be erased.
2426 //! \return A reference to our current string.
2427 ustring16<TAlloc>& erase(u32 index)
2429 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2431 iterator i(*this, index);
2434 u32 len = (t > 0xFFFF ? 2 : 1);
2436 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2437 array[j - len] = array[j];
2446 //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2447 //! \return A reference to our current string.
2448 ustring16<TAlloc>& validate()
2450 // Validate all unicode characters.
2451 for (u32 i=0; i<allocated; ++i)
2453 // Terminate on existing null.
2459 if (UTF16_IS_SURROGATE(array[i]))
2461 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2462 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2463 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2464 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2467 if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2468 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2475 used = allocated - 1;
2482 //! Gets the last char of the ustring16, or 0.
2483 //! \return The last char of the ustring16, or 0.
2484 uchar32_t lastChar() const
2489 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2491 // Make sure we have a paired surrogate.
2495 // Check for an invalid surrogate.
2496 if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2499 // Convert the surrogate pair into a single UTF-32 character.
2500 return unicode::toUTF32(array[used-2], array[used-1]);
2504 return array[used-1];
2509 //! Split the ustring16 into parts.
2510 /** This method will split a ustring16 at certain delimiter characters
2511 into the container passed in as reference. The type of the container
2512 has to be given as template parameter. It must provide a push_back and
2514 \param ret The result container
2515 \param c Array of UTF-32 delimiter characters
2516 \param count Number of delimiter characters
2517 \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2518 container. If two delimiters occur without a character in between, an
2519 empty substring would be placed in the result. If this flag is set,
2520 only non-empty strings are stored.
2521 \param keepSeparators Flag which allows the separator to be added to the
2522 result ustring16. If this flag is true, the concatenation of the
2523 substrings results in the original ustring16. Otherwise, only the
2524 characters between the delimiters are returned.
2525 \return The number of resulting substrings
2527 template<class container>
2528 u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2533 const_iterator i(*this);
2534 const u32 oldSize=ret.size();
2538 bool lastWasSeparator = false;
2542 bool foundSeparator = false;
2543 for (u32 j=0; j<count; ++j)
2547 if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2549 ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2550 foundSeparator = true;
2551 lastpos = (keepSeparators ? pos : pos + 1);
2552 lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2556 lastWasSeparator = foundSeparator;
2562 ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2563 return ret.size()-oldSize;
2567 //! Split the ustring16 into parts.
2568 /** This method will split a ustring16 at certain delimiter characters
2569 into the container passed in as reference. The type of the container
2570 has to be given as template parameter. It must provide a push_back and
2572 \param ret The result container
2573 \param c A unicode string of delimiter characters
2574 \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2575 container. If two delimiters occur without a character in between, an
2576 empty substring would be placed in the result. If this flag is set,
2577 only non-empty strings are stored.
2578 \param keepSeparators Flag which allows the separator to be added to the
2579 result ustring16. If this flag is true, the concatenation of the
2580 substrings results in the original ustring16. Otherwise, only the
2581 characters between the delimiters are returned.
2582 \return The number of resulting substrings
2584 template<class container>
2585 u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2587 core::array<uchar32_t> v = c.toUTF32();
2588 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2592 //! Gets the size of the allocated memory buffer for the string.
2593 //! \return The size of the allocated memory buffer.
2594 u32 capacity() const
2600 //! Returns the raw number of UTF-16 code units in the string, counting each surrogate individually.
2601 //! \return The raw number of UTF-16 code units, excluding the trailing NUL.
2602 u32 size_raw() const
2608 //! Inserts a character into the string.
2609 //! \param c The character to insert.
2610 //! \param pos The position to insert the character.
2611 //! \return A reference to our current string.
2612 ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2614 u8 len = (c > 0xFFFF ? 2 : 1);
2616 if (used + len >= allocated)
2617 reallocate(used + len);
2621 iterator iter(*this, pos);
2622 for (u32 i = used - 2; i > iter.getPos(); --i)
2623 array[i] = array[i - len];
2627 // c will be multibyte, so split it up into a surrogate pair.
2628 uchar16_t x = static_cast<uchar16_t>(c);
2629 uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2630 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2631 array[iter.getPos()] = vh;
2632 array[iter.getPos()+1] = vl;
2636 array[iter.getPos()] = static_cast<uchar16_t>(c);
2643 //! Inserts a string into the string.
2644 //! \param c The string to insert.
2645 //! \param pos The position to insert the string.
2646 //! \return A reference to our current string.
2647 ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2649 u32 len = c.size_raw();
2650 if (len == 0) return *this;
2652 if (used + len >= allocated)
2653 reallocate(used + len);
2657 iterator iter(*this, pos);
2658 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2659 array[i] = array[i - len];
2661 const uchar16_t* s = c.c_str();
2662 for (u32 i = 0; i < len; ++i)
2673 //! Inserts a raw UTF-16 code unit into the string.
2674 //! \param c The code unit to insert.
2675 //! \param pos The raw position at which to insert the code unit.
2676 //! \return A reference to our current string.
2677 ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2679 if (used + 1 >= allocated)
2680 reallocate(used + 1);
2684 for (u32 i = used - 1; i > pos; --i)
2685 array[i] = array[i - 1];
2693 //! Removes a raw UTF-16 code unit from the string.
2694 //! \param pos Raw position of the code unit to remove.
2695 //! \return A reference to our current string.
2696 ustring16<TAlloc>& erase_raw(u32 pos)
2698 for (u32 i=pos; i<=used; ++i)
2700 array[i] = array[i + 1];
2708 //! Replaces a character in the string.
2709 //! \param c The new character.
2710 //! \param pos The position of the character to replace.
2711 //! \return A reference to our current string.
2712 ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2719 //! Returns an iterator to the beginning of the string.
2720 //! \return An iterator to the beginning of the string.
2723 iterator i(*this, 0);
2728 //! Returns an iterator to the beginning of the string.
2729 //! \return An iterator to the beginning of the string.
2730 const_iterator begin() const
2732 const_iterator i(*this, 0);
2737 //! Returns an iterator to the beginning of the string.
2738 //! \return An iterator to the beginning of the string.
2739 const_iterator cbegin() const
2741 const_iterator i(*this, 0);
2746 //! Returns an iterator to the end of the string.
2747 //! \return An iterator to the end of the string.
2750 iterator i(*this, 0);
2756 //! Returns an iterator to the end of the string.
2757 //! \return An iterator to the end of the string.
2758 const_iterator end() const
2760 const_iterator i(*this, 0);
2766 //! Returns an iterator to the end of the string.
2767 //! \return An iterator to the end of the string.
2768 const_iterator cend() const
2770 const_iterator i(*this, 0);
2776 //! Converts the string to a UTF-8 encoded string.
2777 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2778 //! \return A string containing the UTF-8 encoded string.
2779 core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2781 core::string<uchar8_t> ret;
2782 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2783 const_iterator iter(*this, 0);
2785 // Add the byte order mark if the user wants it.
2788 ret.append(unicode::BOM_ENCODE_UTF8[0]);
2789 ret.append(unicode::BOM_ENCODE_UTF8[1]);
2790 ret.append(unicode::BOM_ENCODE_UTF8[2]);
2793 while (!iter.atEnd())
2795 uchar32_t c = *iter;
2798 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2799 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2800 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2801 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2809 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2810 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2811 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2818 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2819 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2825 ret.append(static_cast<uchar8_t>(c));
2833 //! Converts the string to a UTF-8 encoded string array.
2834 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2835 //! \return An array containing the UTF-8 encoded string.
2836 core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2838 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2839 const_iterator iter(*this, 0);
2841 // Add the byte order mark if the user wants it.
2844 ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2845 ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2846 ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2849 while (!iter.atEnd())
2851 uchar32_t c = *iter;
2854 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2855 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2856 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2857 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2865 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2866 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2867 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2874 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2875 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2881 ret.push_back(static_cast<uchar8_t>(c));
2890 #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
2891 //! Converts the string to a UTF-16 encoded string.
2892 //! \param endian The desired endianness of the string.
2893 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2894 //! \return A string containing the UTF-16 encoded string.
2895 core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2897 core::string<char16_t> ret;
2898 ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2900 // Add the BOM if specified.
2903 if (endian == unicode::EUTFEE_NATIVE)
2904 ret[0] = unicode::BOM;
2905 else if (endian == unicode::EUTFEE_LITTLE)
2907 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ret.c_str());
2908 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2909 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2913 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ret.c_str());
2914 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2915 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2920 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2922 char16_t* ptr = ret.c_str();
2923 for (u32 i = 0; i < ret.size(); ++i)
2924 *ptr++ = unicode::swapEndian16(*ptr);
2931 //! Converts the string to a UTF-16 encoded string array.
2932 //! Unless USTRING_CPP0X_NEWLITERALS is defined, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2933 //! \param endian The desired endianness of the string.
2934 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2935 //! \return An array containing the UTF-16 encoded string.
2936 core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2938 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2939 uchar16_t* ptr = ret.pointer();
2941 // Add the BOM if specified.
2944 if (endian == unicode::EUTFEE_NATIVE)
2945 *ptr = unicode::BOM;
2946 else if (endian == unicode::EUTFEE_LITTLE)
2948 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2949 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2950 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2954 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2955 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2956 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2961 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2962 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2964 for (u32 i = 0; i <= used; ++i)
2965 ptr[i] = unicode::swapEndian16(ptr[i]);
2967 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2973 #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
2974 //! Converts the string to a UTF-32 encoded string.
2975 //! \param endian The desired endianness of the string.
2976 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2977 //! \return A string containing the UTF-32 encoded string.
2978 core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2980 core::string<char32_t> ret;
2981 ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
2982 const_iterator iter(*this, 0);
2984 // Add the BOM if specified.
2987 if (endian == unicode::EUTFEE_NATIVE)
2988 ret.append(unicode::BOM);
2997 if (endian == unicode::EUTFEE_LITTLE)
2999 t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3000 t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3001 t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3002 t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3006 t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3007 t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3008 t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3009 t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3015 while (!iter.atEnd())
3017 uchar32_t c = *iter;
3018 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3019 c = unicode::swapEndian32(c);
3028 //! Converts the string to a UTF-32 encoded string array.
3029 //! Unless USTRING_CPP0X_NEWLITERALS is defined, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
3030 //! \param endian The desired endianness of the string.
3031 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3032 //! \return An array containing the UTF-32 encoded string.
3033 core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3035 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
3036 const_iterator iter(*this, 0);
3038 // Add the BOM if specified.
3041 if (endian == unicode::EUTFEE_NATIVE)
3042 ret.push_back(unicode::BOM);
3051 if (endian == unicode::EUTFEE_LITTLE)
3053 t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3054 t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3055 t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3056 t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3060 t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3061 t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3062 t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3063 t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3065 ret.push_back(t.full);
3070 while (!iter.atEnd())
3072 uchar32_t c = *iter;
3073 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3074 c = unicode::swapEndian32(c);
3082 //! Converts the string to a wchar_t encoded string.
3083 /** The size of a wchar_t changes depending on the platform. This function will store a
3084 correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3085 //! \param endian The desired endianness of the string.
3086 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3087 //! \return A string containing the wchar_t encoded string.
3088 core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3090 if (sizeof(wchar_t) == 4)
3092 core::array<uchar32_t> a(toUTF32(endian, addBOM));
3093 core::stringw ret(a.pointer());
3096 else if (sizeof(wchar_t) == 2)
3098 if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3100 core::stringw ret(array);
3105 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3106 core::stringw ret(a.pointer());
3110 else if (sizeof(wchar_t) == 1)
3112 core::array<uchar8_t> a(toUTF8(addBOM));
3113 core::stringw ret(a.pointer());
3117 // Shouldn't happen.
3118 return core::stringw();
3122 //! Converts the string to a wchar_t encoded string array.
3123 /** The size of a wchar_t changes depending on the platform. This function will store a
3124 correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3125 //! \param endian The desired endianness of the string.
3126 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3127 //! \return An array containing the wchar_t encoded string.
3128 core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3130 if (sizeof(wchar_t) == 4)
3132 core::array<uchar32_t> a(toUTF32(endian, addBOM));
3133 core::array<wchar_t> ret(a.size());
3134 ret.set_used(a.size());
3135 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
3138 if (sizeof(wchar_t) == 2)
3140 if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3142 core::array<wchar_t> ret(used);
3144 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
3149 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3150 core::array<wchar_t> ret(a.size());
3151 ret.set_used(a.size());
3152 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
3156 if (sizeof(wchar_t) == 1)
3158 core::array<uchar8_t> a(toUTF8(addBOM));
3159 core::array<wchar_t> ret(a.size());
3160 ret.set_used(a.size());
3161 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3165 // Shouldn't happen.
3166 return core::array<wchar_t>();
3169 //! Converts the string to a properly encoded io::path string.
3170 //! \param endian The desired endianness of the string.
3171 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3172 //! \return An io::path string containing the properly encoded string.
3173 io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3175 #if defined(_IRR_WCHAR_FILESYSTEM)
3176 return toWCHAR_s(endian, addBOM);
3178 return toUTF8_s(addBOM);
3182 //! Loads an unknown stream of data.
3183 //! Will attempt to determine if the stream is unicode data. Useful for loading from files.
3184 //! \param data The data stream to load from.
3185 //! \param data_size The length of the data string.
3186 //! \return A reference to our current string.
3187 ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3189 // Clear our string.
3194 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3198 case unicode::EUTFE_UTF8:
3199 append((uchar8_t*)data, data_size);
3202 case unicode::EUTFE_UTF16:
3203 case unicode::EUTFE_UTF16_BE:
3204 case unicode::EUTFE_UTF16_LE:
3205 append((uchar16_t*)data, data_size / 2);
3208 case unicode::EUTFE_UTF32:
3209 case unicode::EUTFE_UTF32_BE:
3210 case unicode::EUTFE_UTF32_LE:
3211 append((uchar32_t*)data, data_size / 4);
3218 //! Gets the encoding of the Unicode string this class contains.
3219 //! \return An enum describing the current encoding of this string.
3220 const unicode::EUTF_ENCODE getEncoding() const
3225 //! Gets the endianness of the Unicode string this class contains.
3226 //! \return An enum describing the endianness of this string.
3227 const unicode::EUTF_ENDIAN getEndianness() const
3229 if (encoding == unicode::EUTFE_UTF16_LE ||
3230 encoding == unicode::EUTFE_UTF32_LE)
3231 return unicode::EUTFEE_LITTLE;
3232 else return unicode::EUTFEE_BIG;
3237 //! Reallocate the string, making it bigger or smaller.
3238 //! \param new_size The new size of the string.
3239 void reallocate(u32 new_size)
3241 uchar16_t* old_array = array;
3243 array = allocator.allocate(new_size + 1); //new u16[new_size];
3244 allocated = new_size + 1;
3245 if (old_array == 0) return;
3247 u32 amount = used < new_size ? used : new_size;
3248 for (u32 i=0; i<=amount; ++i)
3249 array[i] = old_array[i];
3251 if (allocated <= used)
3252 used = allocated - 1;
3256 allocator.deallocate(old_array); // delete [] old_array;
3259 //--- member variables
3262 unicode::EUTF_ENCODE encoding;
3266 //irrAllocator<uchar16_t> allocator;
3269 typedef ustring16<irrAllocator<uchar16_t> > ustring;
3272 //! Appends two ustring16s.
3273 template <typename TAlloc>
3274 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3276 ustring16<TAlloc> ret(left);
3282 //! Appends a ustring16 and a null-terminated unicode string.
3283 template <typename TAlloc, class B>
3284 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3286 ustring16<TAlloc> ret(left);
3292 //! Appends a ustring16 and a null-terminated unicode string.
3293 template <class B, typename TAlloc>
3294 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3296 ustring16<TAlloc> ret(left);
3302 //! Appends a ustring16 and an Irrlicht string.
3303 template <typename TAlloc, typename B, typename BAlloc>
3304 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3306 ustring16<TAlloc> ret(left);
3312 //! Appends a ustring16 and an Irrlicht string.
3313 template <typename TAlloc, typename B, typename BAlloc>
3314 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
3316 ustring16<TAlloc> ret(left);
3322 //! Appends a ustring16 and a std::basic_string.
3323 template <typename TAlloc, typename B, typename A, typename BAlloc>
3324 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3326 ustring16<TAlloc> ret(left);
3332 //! Appends a ustring16 and a std::basic_string.
3333 template <typename TAlloc, typename B, typename A, typename BAlloc>
3334 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3336 ustring16<TAlloc> ret(left);
3342 //! Appends a ustring16 and a char.
3343 template <typename TAlloc>
3344 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3346 ustring16<TAlloc> ret(left);
3352 //! Appends a ustring16 and a char.
3353 template <typename TAlloc>
3354 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3356 ustring16<TAlloc> ret(left);
#ifdef USTRING_CPP0X_NEWLITERALS
//! Concatenates a ustring16 with a single uchar32_t.
//! \param left The string that comes first.
//! \param right The character placed after \a left.
//! \return A new ustring16 holding \a left followed by \a right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
{
	ustring16<TAlloc> joined(left);
	joined += right;
	return joined;
}


//! Concatenates a single uchar32_t with a ustring16.
//! \param left The character that comes first.
//! \param right The string placed after \a left.
//! \return A new ustring16 holding \a left followed by \a right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
{
	ustring16<TAlloc> joined(left);
	joined += right;
	return joined;
}
#endif
3384 //! Appends a ustring16 and a short.
3385 template <typename TAlloc>
3386 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3388 ustring16<TAlloc> ret(left);
3389 ret += core::stringc(right);
3394 //! Appends a ustring16 and a short.
3395 template <typename TAlloc>
3396 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3398 ustring16<TAlloc> ret(core::stringc(left));
3404 //! Appends a ustring16 and an unsigned short.
3405 template <typename TAlloc>
3406 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3408 ustring16<TAlloc> ret(left);
3409 ret += core::stringc(right);
3414 //! Appends a ustring16 and an unsigned short.
3415 template <typename TAlloc>
3416 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3418 ustring16<TAlloc> ret(core::stringc(left));
3424 //! Appends a ustring16 and an int.
3425 template <typename TAlloc>
3426 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3428 ustring16<TAlloc> ret(left);
3429 ret += core::stringc(right);
3434 //! Appends a ustring16 and an int.
3435 template <typename TAlloc>
3436 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3438 ustring16<TAlloc> ret(core::stringc(left));
3444 //! Appends a ustring16 and an unsigned int.
3445 template <typename TAlloc>
3446 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3448 ustring16<TAlloc> ret(left);
3449 ret += core::stringc(right);
3454 //! Appends a ustring16 and an unsigned int.
3455 template <typename TAlloc>
3456 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3458 ustring16<TAlloc> ret(core::stringc(left));
3464 //! Appends a ustring16 and a long.
3465 template <typename TAlloc>
3466 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3468 ustring16<TAlloc> ret(left);
3469 ret += core::stringc(right);
3474 //! Appends a ustring16 and a long.
3475 template <typename TAlloc>
3476 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3478 ustring16<TAlloc> ret(core::stringc(left));
3484 //! Appends a ustring16 and an unsigned long.
3485 template <typename TAlloc>
3486 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3488 ustring16<TAlloc> ret(left);
3489 ret += core::stringc(right);
3494 //! Appends a ustring16 and an unsigned long.
3495 template <typename TAlloc>
3496 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3498 ustring16<TAlloc> ret(core::stringc(left));
3504 //! Appends a ustring16 and a float.
3505 template <typename TAlloc>
3506 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3508 ustring16<TAlloc> ret(left);
3509 ret += core::stringc(right);
3514 //! Appends a ustring16 and a float.
3515 template <typename TAlloc>
3516 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3518 ustring16<TAlloc> ret(core::stringc(left));
3524 //! Appends a ustring16 and a double.
3525 template <typename TAlloc>
3526 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3528 ustring16<TAlloc> ret(left);
3529 ret += core::stringc(right);
3534 //! Appends a ustring16 and a double.
3535 template <typename TAlloc>
3536 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3538 ustring16<TAlloc> ret(core::stringc(left));
3544 #ifdef USTRING_CPP0X
3545 //! Appends two ustring16s.
3546 template <typename TAlloc>
3547 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3549 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3550 right.insert(left, 0);
3551 return std::move(right);
3555 //! Appends two ustring16s.
3556 template <typename TAlloc>
3557 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3559 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3561 return std::move(left);
3565 //! Appends two ustring16s.
3566 template <typename TAlloc>
3567 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3569 //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3570 if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3571 (right.capacity() - right.size_raw() < left.size_raw()))
3574 return std::move(left);
3578 right.insert(left, 0);
3579 return std::move(right);
3584 //! Appends a ustring16 and a null-terminated unicode string.
3585 template <typename TAlloc, class B>
3586 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3588 //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3590 return std::move(left);
3594 //! Appends a ustring16 and a null-terminated unicode string.
3595 template <class B, typename TAlloc>
3596 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3598 //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3599 right.insert(left, 0);
3600 return std::move(right);
3604 //! Appends a ustring16 and an Irrlicht string.
3605 template <typename TAlloc, typename B, typename BAlloc>
3606 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3608 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3609 right.insert(left, 0);
3610 return std::move(right);
3614 //! Appends a ustring16 and an Irrlicht string.
3615 template <typename TAlloc, typename B, typename BAlloc>
3616 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3618 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3620 return std::move(left);
3624 //! Appends a ustring16 and a std::basic_string.
3625 template <typename TAlloc, typename B, typename A, typename BAlloc>
3626 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3628 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3629 right.insert(core::ustring16<TAlloc>(left), 0);
3630 return std::move(right);
3634 //! Appends a ustring16 and a std::basic_string.
3635 template <typename TAlloc, typename B, typename A, typename BAlloc>
3636 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3638 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3640 return std::move(left);
3644 //! Appends a ustring16 and a char.
3645 template <typename TAlloc>
3646 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3648 left.append((uchar32_t)right);
3649 return std::move(left);
3653 //! Appends a ustring16 and a char.
3654 template <typename TAlloc>
3655 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3657 right.insert((uchar32_t)left, 0);
3658 return std::move(right);
#ifdef USTRING_CPP0X_NEWLITERALS
//! Appends a single uchar32_t to an expiring ustring16, reusing its storage.
//! \param left An expiring string; the character is appended to it and it is moved into the result.
//! \param right The character to append.
//! \return \a left followed by \a right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
{
	left.append(right);
	return std::move(left);
}


//! Prepends a single uchar32_t to an expiring ustring16, reusing its storage.
//! \param left The character that comes first.
//! \param right An expiring string; the character is inserted at position 0 and it is moved into the result.
//! \return \a left followed by \a right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
{
	right.insert(left, 0);
	return std::move(right);
}
#endif
3682 //! Appends a ustring16 and a short.
3683 template <typename TAlloc>
3684 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3686 left.append(core::stringc(right));
3687 return std::move(left);
3691 //! Appends a ustring16 and a short.
3692 template <typename TAlloc>
3693 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3695 right.insert(core::stringc(left), 0);
3696 return std::move(right);
3700 //! Appends a ustring16 and an unsigned short.
3701 template <typename TAlloc>
3702 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3704 left.append(core::stringc(right));
3705 return std::move(left);
3709 //! Appends a ustring16 and an unsigned short.
3710 template <typename TAlloc>
3711 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3713 right.insert(core::stringc(left), 0);
3714 return std::move(right);
3718 //! Appends a ustring16 and an int.
3719 template <typename TAlloc>
3720 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3722 left.append(core::stringc(right));
3723 return std::move(left);
3727 //! Appends a ustring16 and an int.
3728 template <typename TAlloc>
3729 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3731 right.insert(core::stringc(left), 0);
3732 return std::move(right);
3736 //! Appends a ustring16 and an unsigned int.
3737 template <typename TAlloc>
3738 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3740 left.append(core::stringc(right));
3741 return std::move(left);
3745 //! Appends a ustring16 and an unsigned int.
3746 template <typename TAlloc>
3747 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3749 right.insert(core::stringc(left), 0);
3750 return std::move(right);
3754 //! Appends a ustring16 and a long.
3755 template <typename TAlloc>
3756 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3758 left.append(core::stringc(right));
3759 return std::move(left);
3763 //! Appends a ustring16 and a long.
3764 template <typename TAlloc>
3765 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3767 right.insert(core::stringc(left), 0);
3768 return std::move(right);
3772 //! Appends a ustring16 and an unsigned long.
3773 template <typename TAlloc>
3774 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3776 left.append(core::stringc(right));
3777 return std::move(left);
3781 //! Appends a ustring16 and an unsigned long.
3782 template <typename TAlloc>
3783 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3785 right.insert(core::stringc(left), 0);
3786 return std::move(right);
3790 //! Appends a ustring16 and a float.
3791 template <typename TAlloc>
3792 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3794 left.append(core::stringc(right));
3795 return std::move(left);
3799 //! Appends a ustring16 and a float.
3800 template <typename TAlloc>
3801 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3803 right.insert(core::stringc(left), 0);
3804 return std::move(right);
3808 //! Appends a ustring16 and a double.
3809 template <typename TAlloc>
3810 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3812 left.append(core::stringc(right));
3813 return std::move(left);
3817 //! Appends a ustring16 and a double.
3818 template <typename TAlloc>
3819 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3821 right.insert(core::stringc(left), 0);
3822 return std::move(right);
3827 #ifndef USTRING_NO_STL
3828 //! Writes a ustring16 to an ostream.
3829 template <typename TAlloc>
3830 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3832 out << in.toUTF8_s().c_str();
3836 //! Writes a ustring16 to a wostream.
3837 template <typename TAlloc>
3838 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3840 out << in.toWCHAR_s().c_str();
3846 #ifndef USTRING_NO_STL
3851 //! Hashing algorithm for hashing a ustring. Used for things like unordered_maps.
3852 //! Algorithm taken from std::hash<std::string>.
3853 class hash : public std::unary_function<core::ustring, size_t>
3856 size_t operator()(const core::ustring& s) const
3858 size_t ret = 2166136261U;
3860 size_t stride = 1 + s.size_raw() / 10;
3862 core::ustring::const_iterator i = s.begin();
3863 while (i != s.end())
3865 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
3866 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3874 } // end namespace unicode
3878 } // end namespace core
3879 } // end namespace irr