2 Basic Unicode string class for Irrlicht.
3 Copyright (c) 2009-2011 John Norman
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you
14 must not claim that you wrote the original software. If you use
15 this software in a product, an acknowledgment in the product
16 documentation would be appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
21 3. This notice may not be removed or altered from any source
24 The original version of this class can be located at:
25 http://irrlicht.suckerfreegames.com/
28 john@suckerfreegames.com
31 #ifndef __IRR_USTRING_H_INCLUDED__
32 #define __IRR_USTRING_H_INCLUDED__
34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
35 # define USTRING_CPP0X
36 # if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
37 # define USTRING_CPP0X_NEWLITERALS
49 #ifndef USTRING_NO_STL
56 #include "irrAllocator.h"
59 #include "irrString.h"
62 //! UTF-16 surrogate start values.
63 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
64 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
66 //! Is a UTF-16 code unit a surrogate?
67 #define UTF16_IS_SURROGATE(c) (((c) & 0xF800) == 0xD800)
68 #define UTF16_IS_SURROGATE_HI(c) (((c) & 0xFC00) == 0xD800)
69 #define UTF16_IS_SURROGATE_LO(c) (((c) & 0xFC00) == 0xDC00)
75 // Define our character types.
76 #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
77 typedef char32_t uchar32_t;
78 typedef char16_t uchar16_t;
79 typedef char uchar8_t;
81 typedef u32 uchar32_t;
82 typedef u16 uchar16_t;
92 //! The unicode replacement character. Used to replace invalid characters.
93 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
95 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
96 //! \param high The high value of the pair.
97 //! \param low The low value of the pair.
98 //! \return The UTF-32 character expressed by the surrogate pair.
99 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
101 // Convert the surrogate pair into a single UTF-32 character.
102 uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
103 uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
104 return (wu << 16) | x;
107 //! Swaps the endianness of a 16-bit value.
108 //! \return The new value.
109 inline uchar16_t swapEndian16(const uchar16_t& c)
111 return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
114 //! Swaps the endianness of a 32-bit value.
115 //! \return The new value.
116 inline uchar32_t swapEndian32(const uchar32_t& c)
118 return ((c >> 24) & 0x000000FF) |
119 ((c >> 8) & 0x0000FF00) |
120 ((c << 8) & 0x00FF0000) |
121 ((c << 24) & 0xFF000000);
124 //! The Unicode byte order mark.
125 const u16 BOM = 0xFEFF;
127 //! The size of the Unicode byte order mark in terms of the Unicode character size.
128 const u8 BOM_UTF8_LEN = 3;
129 const u8 BOM_UTF16_LEN = 1;
130 const u8 BOM_UTF32_LEN = 1;
132 //! Unicode byte order marks for file operations.
133 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
134 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
135 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
136 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
137 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
139 //! The size in bytes of the Unicode byte order marks for file operations.
140 const u8 BOM_ENCODE_UTF8_LEN = 3;
141 const u8 BOM_ENCODE_UTF16_LEN = 2;
142 const u8 BOM_ENCODE_UTF32_LEN = 4;
144 //! Unicode encoding type.
157 //! Unicode endianness.
165 //! Returns the specified unicode byte order mark in a byte array.
166 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
167 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
168 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
169 //! \return An array that contains a byte order mark.
170 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
172 #define COPY_ARRAY(source, size) \
173 memcpy(ret.pointer(), source, size); \
176 core::array<u8> ret(4);
180 COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
183 #ifdef __BIG_ENDIAN__
184 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
186 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
190 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
193 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
196 #ifdef __BIG_ENDIAN__
197 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
199 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
203 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
206 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
214 //! Detects if the given data stream starts with a unicode BOM.
215 //! \param data The data stream to check.
216 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
217 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
219 if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
220 if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
221 if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
222 if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
223 if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
227 } // end namespace unicode
230 //! UTF-16 string class.
231 template <typename TAlloc = irrAllocator<uchar16_t> >
236 ///------------------///
237 /// iterator classes ///
238 ///------------------///
240 //! Access an element in a unicode string, allowing one to change it.
241 class _ustring16_iterator_access
244 _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
246 //! Allow the class to be interpreted as a single UTF-32 character.
247 operator uchar32_t() const
252 //! Allow one to change the character in the unicode string.
253 //! \param c The new character to use.
255 _ustring16_iterator_access& operator=(const uchar32_t c)
261 //! Increments the value by 1.
263 _ustring16_iterator_access& operator++()
269 //! Increments the value by 1, returning the old value.
270 //! \return A unicode character.
271 uchar32_t operator++(int)
273 uchar32_t old = _get();
278 //! Decrements the value by 1.
280 _ustring16_iterator_access& operator--()
286 //! Decrements the value by 1, returning the old value.
287 //! \return A unicode character.
288 uchar32_t operator--(int)
290 uchar32_t old = _get();
295 //! Adds to the value by a specified amount.
296 //! \param val The amount to add to this character.
298 _ustring16_iterator_access& operator+=(int val)
304 //! Subtracts from the value by a specified amount.
305 //! \param val The amount to subtract from this character.
307 _ustring16_iterator_access& operator-=(int val)
313 //! Multiplies the value by a specified amount.
314 //! \param val The amount to multiply this character by.
316 _ustring16_iterator_access& operator*=(int val)
322 //! Divides the value by a specified amount.
323 //! \param val The amount to divide this character by.
325 _ustring16_iterator_access& operator/=(int val)
331 //! Reduces the value modulo a specified amount.
332 //! \param val The amount to modulo this character by.
334 _ustring16_iterator_access& operator%=(int val)
340 //! Adds to the value by a specified amount.
341 //! \param val The amount to add to this character.
342 //! \return A unicode character.
343 uchar32_t operator+(int val) const
348 //! Subtracts from the value by a specified amount.
349 //! \param val The amount to subtract from this character.
350 //! \return A unicode character.
351 uchar32_t operator-(int val) const
356 //! Multiplies the value by a specified amount.
357 //! \param val The amount to multiply this character by.
358 //! \return A unicode character.
359 uchar32_t operator*(int val) const
364 //! Divides the value by a specified amount.
365 //! \param val The amount to divide this character by.
366 //! \return A unicode character.
367 uchar32_t operator/(int val) const
372 //! Computes the value modulo a specified amount.
373 //! \param val The amount to modulo this character by.
374 //! \return A unicode character.
375 uchar32_t operator%(int val) const
381 //! Gets a uchar32_t from our current position.
382 uchar32_t _get() const
384 const uchar16_t* a = ref->c_str();
385 if (!UTF16_IS_SURROGATE(a[pos]))
386 return static_cast<uchar32_t>(a[pos]);
389 if (pos + 1 >= ref->size_raw())
392 return unicode::toUTF32(a[pos], a[pos + 1]);
396 //! Sets a uchar32_t at our current position.
397 void _set(uchar32_t c)
399 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
400 const uchar16_t* a = ref2->c_str();
403 // c needs two UTF-16 code units, so split it up into a high and a low surrogate.
404 uchar16_t x = static_cast<uchar16_t>(c);
405 uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
406 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
408 // If the previous position was a surrogate pair, just replace them. Else, insert the low surrogate.
409 if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
410 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
411 else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
413 ref2->replace_raw(vh, static_cast<u32>(pos));
417 // c fits in a single UTF-16 code unit.
418 uchar16_t vh = static_cast<uchar16_t>(c);
420 // If the previous position was a surrogate pair, remove the now-unneeded low surrogate.
421 if (UTF16_IS_SURROGATE_HI(a[pos]))
422 ref2->erase_raw(static_cast<u32>(pos) + 1);
424 ref2->replace_raw(vh, static_cast<u32>(pos));
428 const ustring16<TAlloc>* ref;
431 typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
434 //! Iterator to iterate through a UTF-16 string.
435 #ifndef USTRING_NO_STL
436 class _ustring16_const_iterator : public std::iterator<
437 std::bidirectional_iterator_tag, // iterator_category
438 access, // value_type
439 ptrdiff_t, // difference_type
440 const access, // pointer
441 const access // reference
444 class _ustring16_const_iterator
448 typedef _ustring16_const_iterator _Iter;
449 typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
450 typedef const access const_pointer;
451 typedef const access const_reference;
453 #ifndef USTRING_NO_STL
454 typedef typename _Base::value_type value_type;
455 typedef typename _Base::difference_type difference_type;
456 typedef typename _Base::difference_type distance_type;
457 typedef typename _Base::pointer pointer;
458 typedef const_reference reference;
460 typedef access value_type;
461 typedef u32 difference_type;
462 typedef u32 distance_type;
463 typedef const_pointer pointer;
464 typedef const_reference reference;
468 _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
469 _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
470 _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
472 if (ref->size_raw() == 0 || p == 0)
475 // Go to the appropriate position.
477 u32 sr = ref->size_raw();
478 const uchar16_t* a = ref->c_str();
479 while (i != 0 && pos < sr)
481 if (UTF16_IS_SURROGATE_HI(a[pos]))
488 //! Test for equality.
489 bool operator==(const _Iter& iter) const
491 if (ref == iter.ref && pos == iter.pos)
496 //! Test for inequality.
497 bool operator!=(const _Iter& iter) const
499 if (ref != iter.ref || pos != iter.pos)
504 //! Switch to the next full character in the string.
507 if (pos == ref->size_raw()) return *this;
508 const uchar16_t* a = ref->c_str();
509 if (UTF16_IS_SURROGATE_HI(a[pos]))
510 pos += 2; // TODO: check for valid low surrogate?
512 if (pos > ref->size_raw()) pos = ref->size_raw();
516 //! Switch to the next full character in the string, returning the previous position.
517 _Iter operator++(int)
524 //! Switch to the previous full character in the string.
527 if (pos == 0) return *this;
528 const uchar16_t* a = ref->c_str();
530 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0) // low surrogate, go back one more.
535 //! Switch to the previous full character in the string, returning the previous position.
536 _Iter operator--(int)
543 //! Advance a specified number of full characters in the string.
545 _Iter& operator+=(const difference_type v)
547 if (v == 0) return *this;
548 if (v < 0) return operator-=(v * -1);
550 if (pos >= ref->size_raw())
553 // Go to the appropriate position.
554 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
556 u32 sr = ref->size_raw();
557 const uchar16_t* a = ref->c_str();
558 while (i != 0 && pos < sr)
560 if (UTF16_IS_SURROGATE_HI(a[pos]))
571 //! Go back a specified number of full characters in the string.
573 _Iter& operator-=(const difference_type v)
575 if (v == 0) return *this;
576 if (v > 0) return operator+=(v * -1);
581 // Go to the appropriate position.
582 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
584 const uchar16_t* a = ref->c_str();
585 while (i != 0 && pos != 0)
588 if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
596 //! Return a new iterator that is a variable number of full characters forward from the current position.
597 _Iter operator+(const difference_type v) const
604 //! Return a new iterator that is a variable number of full characters backward from the current position.
605 _Iter operator-(const difference_type v) const
612 //! Returns the distance between two iterators.
613 difference_type operator-(const _Iter& iter) const
615 // Make sure we reference the same object!
617 return difference_type();
642 //! Accesses the full character at the iterator's position.
643 const_reference operator*() const
645 if (pos >= ref->size_raw())
647 const uchar16_t* a = ref->c_str();
648 u32 p = ref->size_raw();
649 if (UTF16_IS_SURROGATE_LO(a[p]))
651 reference ret(ref, p);
654 const_reference ret(ref, pos);
658 //! Accesses the full character at the iterator's position.
659 reference operator*()
661 if (pos >= ref->size_raw())
663 const uchar16_t* a = ref->c_str();
664 u32 p = ref->size_raw();
665 if (UTF16_IS_SURROGATE_LO(a[p]))
667 reference ret(ref, p);
670 reference ret(ref, pos);
674 //! Accesses the full character at the iterator's position.
675 const_pointer operator->() const
680 //! Accesses the full character at the iterator's position.
686 //! Is the iterator at the start of the string?
692 //! Is the iterator at the end of the string?
695 const uchar16_t* a = ref->c_str();
696 if (UTF16_IS_SURROGATE(a[pos]))
697 return (pos + 1) >= ref->size_raw();
698 else return pos >= ref->size_raw();
701 //! Moves the iterator to the start of the string.
707 //! Moves the iterator to the end of the string.
710 const uchar16_t* a = ref->c_str();
711 pos = ref->size_raw();
714 //! Returns the iterator's position.
715 //! \return The iterator's position.
722 const ustring16<TAlloc>* ref;
726 //! Iterator to iterate through a UTF-16 string.
727 class _ustring16_iterator : public _ustring16_const_iterator
730 typedef _ustring16_iterator _Iter;
731 typedef _ustring16_const_iterator _Base;
732 typedef typename _Base::const_pointer const_pointer;
733 typedef typename _Base::const_reference const_reference;
735 typedef typename _Base::value_type value_type;
736 typedef typename _Base::difference_type difference_type;
737 typedef typename _Base::distance_type distance_type;
738 typedef access pointer;
739 typedef access reference;
745 _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
746 _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
747 _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
749 //! Accesses the full character at the iterator's position.
750 reference operator*() const
752 if (pos >= ref->size_raw())
754 const uchar16_t* a = ref->c_str();
755 u32 p = ref->size_raw();
756 if (UTF16_IS_SURROGATE_LO(a[p]))
758 reference ret(ref, p);
761 reference ret(ref, pos);
765 //! Accesses the full character at the iterator's position.
766 reference operator*()
768 if (pos >= ref->size_raw())
770 const uchar16_t* a = ref->c_str();
771 u32 p = ref->size_raw();
772 if (UTF16_IS_SURROGATE_LO(a[p]))
774 reference ret(ref, p);
777 reference ret(ref, pos);
781 //! Accesses the full character at the iterator's position.
782 pointer operator->() const
787 //! Accesses the full character at the iterator's position.
794 typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
795 typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
797 ///----------------------///
798 /// end iterator classes ///
799 ///----------------------///
801 //! Default constructor
803 : array(0), allocated(1), used(0)
806 encoding = unicode::EUTFE_UTF16_BE;
808 encoding = unicode::EUTFE_UTF16_LE;
810 array = allocator.allocate(1); // new u16[1];
816 ustring16(const ustring16<TAlloc>& other)
817 : array(0), allocated(0), used(0)
820 encoding = unicode::EUTFE_UTF16_BE;
822 encoding = unicode::EUTFE_UTF16_LE;
828 //! Constructor from other string types
829 template <class B, class A>
830 ustring16(const string<B, A>& other)
831 : array(0), allocated(0), used(0)
834 encoding = unicode::EUTFE_UTF16_BE;
836 encoding = unicode::EUTFE_UTF16_LE;
842 #ifndef USTRING_NO_STL
843 //! Constructor from std::string
844 template <class B, class A, typename Alloc>
845 ustring16(const std::basic_string<B, A, Alloc>& other)
846 : array(0), allocated(0), used(0)
849 encoding = unicode::EUTFE_UTF16_BE;
851 encoding = unicode::EUTFE_UTF16_LE;
853 *this = other.c_str();
857 //! Constructor from iterator.
858 template <typename Itr>
859 ustring16(Itr first, Itr last)
860 : array(0), allocated(0), used(0)
863 encoding = unicode::EUTFE_UTF16_BE;
865 encoding = unicode::EUTFE_UTF16_LE;
867 reserve(std::distance(first, last));
870 for (; first != last; ++first)
871 append((uchar32_t)*first);
876 #ifndef USTRING_CPP0X_NEWLITERALS
877 //! Constructor for copying a character string from a pointer.
878 ustring16(const char* const c)
879 : array(0), allocated(0), used(0)
882 encoding = unicode::EUTFE_UTF16_BE;
884 encoding = unicode::EUTFE_UTF16_LE;
887 loadDataStream(c, strlen(c));
888 //append((uchar8_t*)c);
892 //! Constructor for copying a character string from a pointer with a given length.
893 ustring16(const char* const c, u32 length)
894 : array(0), allocated(0), used(0)
897 encoding = unicode::EUTFE_UTF16_BE;
899 encoding = unicode::EUTFE_UTF16_LE;
902 loadDataStream(c, length);
907 //! Constructor for copying a UTF-8 string from a pointer.
908 ustring16(const uchar8_t* const c)
909 : array(0), allocated(0), used(0)
912 encoding = unicode::EUTFE_UTF16_BE;
914 encoding = unicode::EUTFE_UTF16_LE;
921 //! Constructor for copying a UTF-8 string from a single char.
922 ustring16(const char c)
923 : array(0), allocated(0), used(0)
926 encoding = unicode::EUTFE_UTF16_BE;
928 encoding = unicode::EUTFE_UTF16_LE;
931 append((uchar32_t)c);
935 //! Constructor for copying a UTF-8 string from a pointer with a given length.
936 ustring16(const uchar8_t* const c, u32 length)
937 : array(0), allocated(0), used(0)
940 encoding = unicode::EUTFE_UTF16_BE;
942 encoding = unicode::EUTFE_UTF16_LE;
949 //! Constructor for copying a UTF-16 string from a pointer.
950 ustring16(const uchar16_t* const c)
951 : array(0), allocated(0), used(0)
954 encoding = unicode::EUTFE_UTF16_BE;
956 encoding = unicode::EUTFE_UTF16_LE;
963 //! Constructor for copying a UTF-16 string from a pointer with a given length
964 ustring16(const uchar16_t* const c, u32 length)
965 : array(0), allocated(0), used(0)
968 encoding = unicode::EUTFE_UTF16_BE;
970 encoding = unicode::EUTFE_UTF16_LE;
977 //! Constructor for copying a UTF-32 string from a pointer.
978 ustring16(const uchar32_t* const c)
979 : array(0), allocated(0), used(0)
982 encoding = unicode::EUTFE_UTF16_BE;
984 encoding = unicode::EUTFE_UTF16_LE;
991 //! Constructor for copying a UTF-32 string from a pointer with a given length.
992 ustring16(const uchar32_t* const c, u32 length)
993 : array(0), allocated(0), used(0)
996 encoding = unicode::EUTFE_UTF16_BE;
998 encoding = unicode::EUTFE_UTF16_LE;
1005 //! Constructor for copying a wchar_t string from a pointer.
1006 ustring16(const wchar_t* const c)
1007 : array(0), allocated(0), used(0)
1010 encoding = unicode::EUTFE_UTF16_BE;
1012 encoding = unicode::EUTFE_UTF16_LE;
1015 if (sizeof(wchar_t) == 4)
1016 append(reinterpret_cast<const uchar32_t* const>(c));
1017 else if (sizeof(wchar_t) == 2)
1018 append(reinterpret_cast<const uchar16_t* const>(c));
1019 else if (sizeof(wchar_t) == 1)
1020 append(reinterpret_cast<const uchar8_t* const>(c));
1024 //! Constructor for copying a wchar_t string from a pointer with a given length.
1025 ustring16(const wchar_t* const c, u32 length)
1026 : array(0), allocated(0), used(0)
1029 encoding = unicode::EUTFE_UTF16_BE;
1031 encoding = unicode::EUTFE_UTF16_LE;
1034 if (sizeof(wchar_t) == 4)
1035 append(reinterpret_cast<const uchar32_t* const>(c), length);
1036 else if (sizeof(wchar_t) == 2)
1037 append(reinterpret_cast<const uchar16_t* const>(c), length);
1038 else if (sizeof(wchar_t) == 1)
1039 append(reinterpret_cast<const uchar8_t* const>(c), length);
1043 #ifdef USTRING_CPP0X
1044 //! Constructor for moving a ustring16
1045 ustring16(ustring16<TAlloc>&& other)
1046 : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1048 //std::cout << "MOVE constructor" << std::endl;
1050 other.allocated = 0;
1059 allocator.deallocate(array); // delete [] array;
1063 //! Assignment operator
1064 ustring16& operator=(const ustring16<TAlloc>& other)
1069 used = other.size_raw();
1070 if (used >= allocated)
1072 allocator.deallocate(array); // delete [] array;
1073 allocated = used + 1;
1074 array = allocator.allocate(used + 1); //new u16[used];
1077 const uchar16_t* p = other.c_str();
1078 for (u32 i=0; i<=used; ++i, ++p)
1083 // Validate our new UTF-16 string.
1090 #ifdef USTRING_CPP0X
1091 //! Move assignment operator
1092 ustring16& operator=(ustring16<TAlloc>&& other)
1096 //std::cout << "MOVE operator=" << std::endl;
1097 allocator.deallocate(array);
1099 array = other.array;
1100 allocated = other.allocated;
1101 encoding = other.encoding;
1111 //! Assignment operator for other string types
1112 template <class B, class A>
1113 ustring16<TAlloc>& operator=(const string<B, A>& other)
1115 *this = other.c_str();
1120 //! Assignment operator for UTF-8 strings
1121 ustring16<TAlloc>& operator=(const uchar8_t* const c)
1125 array = allocator.allocate(1); //new u16[1];
1130 if (!c) return *this;
1132 //! Append our string now.
1138 //! Assignment operator for UTF-16 strings
1139 ustring16<TAlloc>& operator=(const uchar16_t* const c)
1143 array = allocator.allocate(1); //new u16[1];
1148 if (!c) return *this;
1150 //! Append our string now.
1156 //! Assignment operator for UTF-32 strings
1157 ustring16<TAlloc>& operator=(const uchar32_t* const c)
1161 array = allocator.allocate(1); //new u16[1];
1166 if (!c) return *this;
1168 //! Append our string now.
1174 //! Assignment operator for wchar_t strings.
1175 /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1176 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1177 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1178 ustring16<TAlloc>& operator=(const wchar_t* const c)
1180 if (sizeof(wchar_t) == 4)
1181 *this = reinterpret_cast<const uchar32_t* const>(c);
1182 else if (sizeof(wchar_t) == 2)
1183 *this = reinterpret_cast<const uchar16_t* const>(c);
1184 else if (sizeof(wchar_t) == 1)
1185 *this = reinterpret_cast<const uchar8_t* const>(c);
1191 //! Assignment operator for other strings.
1192 /** Note that this assumes that a correct unicode string is stored in the string. **/
1194 ustring16<TAlloc>& operator=(const B* const c)
1197 *this = reinterpret_cast<const uchar32_t* const>(c);
1198 else if (sizeof(B) == 2)
1199 *this = reinterpret_cast<const uchar16_t* const>(c);
1200 else if (sizeof(B) == 1)
1201 *this = reinterpret_cast<const uchar8_t* const>(c);
1207 //! Direct access operator
1208 access operator [](const u32 index)
1210 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1211 iterator iter(*this, index);
1212 return iter.operator*();
1216 //! Direct access operator
1217 const access operator [](const u32 index) const
1219 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1220 const_iterator iter(*this, index);
1221 return iter.operator*();
1225 //! Equality operator
1226 bool operator ==(const uchar16_t* const str) const
1232 for(i=0; array[i] && str[i]; ++i)
1233 if (array[i] != str[i])
1236 return !array[i] && !str[i];
1240 //! Equality operator
1241 bool operator ==(const ustring16<TAlloc>& other) const
1243 for(u32 i=0; array[i] && other.array[i]; ++i)
1244 if (array[i] != other.array[i])
1247 return used == other.used;
1251 //! Is smaller comparator
1252 bool operator <(const ustring16<TAlloc>& other) const
1254 for(u32 i=0; array[i] && other.array[i]; ++i)
1256 s32 diff = array[i] - other.array[i];
1261 return used < other.used;
1265 //! Inequality operator
1266 bool operator !=(const uchar16_t* const str) const
1268 return !(*this == str);
1272 //! Inequality operator
1273 bool operator !=(const ustring16<TAlloc>& other) const
1275 return !(*this == other);
1279 //! Returns the length of a ustring16 in full characters.
1280 //! \return Length of a ustring16 in full characters.
1283 const_iterator i(*this, 0);
1294 //! Informs if the ustring is empty or not.
1295 //! \return True if the ustring is empty, false if not.
1298 return (size_raw() == 0);
1302 //! Returns a pointer to the raw UTF-16 string data.
1303 //! \return Pointer to a C-style NUL-terminated array of UTF-16 code units.
1304 const uchar16_t* c_str() const
1310 //! Compares the first n characters of this string with another.
1311 //! \param other Other string to compare to.
1312 //! \param n Number of characters to compare.
1313 //! \return True if the n first characters of both strings are equal.
1314 bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1317 const uchar16_t* oa = other.c_str();
1318 for(i=0; array[i] && oa[i] && i < n; ++i)
1319 if (array[i] != oa[i])
1322 // if one (or both) of the strings was smaller then they
1323 // are only equal if they have the same length
1324 return (i == n) || (used == other.used);
1328 //! Compares the first n characters of this string with another.
1329 //! \param str Other string to compare to.
1330 //! \param n Number of characters to compare.
1331 //! \return True if the n first characters of both strings are equal.
1332 bool equalsn(const uchar16_t* const str, u32 n) const
1337 for(i=0; array[i] && str[i] && i < n; ++i)
1338 if (array[i] != str[i])
1341 // if one (or both) of the strings was smaller then they
1342 // are only equal if they have the same length
1343 return (i == n) || (array[i] == 0 && str[i] == 0);
1347 //! Appends a character to this ustring16
1348 //! \param character The character to append.
1349 //! \return A reference to our current string.
1350 ustring16<TAlloc>& append(uchar32_t character)
1352 if (used + 2 >= allocated)
1353 reallocate(used + 2);
1355 if (character > 0xFFFF)
1359 // character needs two UTF-16 code units, so split it up into a surrogate pair.
1360 uchar16_t x = static_cast<uchar16_t>(character);
1361 uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1362 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1369 array[used-1] = character;
1377 //! Appends a UTF-8 string to this ustring16
1378 //! \param other The UTF-8 string to append.
1379 //! \param length The length of the string to append.
1380 //! \return A reference to our current string.
1381 ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1386 // Determine if the string is long enough for a BOM.
1388 const uchar8_t* p = other;
1392 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1395 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1396 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1398 if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1399 c_bom = unicode::EUTFE_UTF8;
1402 // If a BOM was found, don't include it in the string.
1403 const uchar8_t* c2 = other;
1404 if (c_bom != unicode::EUTFE_NONE)
1406 c2 = other + unicode::BOM_UTF8_LEN;
1407 length -= unicode::BOM_UTF8_LEN;
1410 // Calculate the size of the string to read in.
1416 } while(*p++ && len < length);
1420 // If we need to grow the array, do it now.
1421 if (used + len >= allocated)
1422 reallocate(used + (len * 2));
1425 // Convert UTF-8 to UTF-16.
1427 for (u32 l = 0; l<len;)
1430 if (((c2[l] >> 6) & 0x03) == 0x02)
1431 { // Invalid continuation byte.
1432 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1435 else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1436 { // Invalid byte - overlong encoding.
1437 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1440 else if ((c2[l] & 0xF8) == 0xF0)
1441 { // 4 bytes UTF-8, 2 bytes UTF-16.
1442 // Check for a full string.
1445 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1453 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1454 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1455 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1458 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1464 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1465 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1466 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1467 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1469 // Split v up into a surrogate pair.
1470 uchar16_t x = static_cast<uchar16_t>(v);
1471 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1472 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1477 ++used; // Using two shorts this time, so increase used by 1.
1479 else if ((c2[l] & 0xF0) == 0xE0)
1480 { // 3 bytes UTF-8, 1 byte UTF-16.
1481 // Check for a full string.
1484 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1492 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1493 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1496 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1502 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1503 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1504 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1508 else if ((c2[l] & 0xE0) == 0xC0)
1509 { // 2 bytes UTF-8, 1 byte UTF-16.
1510 // Check for a full string.
1513 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1519 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1521 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1527 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1528 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1529 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1534 { // 1 byte UTF-8, 1 byte UTF-16.
1537 { // Values above 0xF4 are restricted and aren't used. By now, anything above 0x7F is invalid.
1538 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1540 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1546 // Validate our new UTF-16 string.
1553 //! Appends a UTF-16 string to this ustring16
1554 //! \param other The UTF-16 string to append.
1555 //! \param length The length of the string to append.
1556 //! \return A reference to our current string.
1557 ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1562 // Determine if the string is long enough for a BOM.
1564 const uchar16_t* p = other;
1568 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1570 // Check for the BOM to determine the string's endianness.
1571 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1572 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1573 c_end = unicode::EUTFEE_LITTLE;
1574 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1575 c_end = unicode::EUTFEE_BIG;
1577 // If a BOM was found, don't include it in the string.
1578 const uchar16_t* c2 = other;
1579 if (c_end != unicode::EUTFEE_NATIVE)
1581 c2 = other + unicode::BOM_UTF16_LEN;
1582 length -= unicode::BOM_UTF16_LEN;
1585 // Calculate the size of the string to read in.
1591 } while(*p++ && len < length);
1595 // If we need to grow the size of the array, do it now.
1596 if (used + len >= allocated)
1597 reallocate(used + (len * 2));
1601 // Copy the string now.
1602 unicode::EUTF_ENDIAN m_end = getEndianness();
1603 for (u32 l = start; l < start + len; ++l)
1605 array[l] = (uchar16_t)c2[l];
1606 if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1607 array[l] = unicode::swapEndian16(array[l]);
1612 // Validate our new UTF-16 string.
1618 //! Appends a UTF-32 string to this ustring16
1619 //! \param other The UTF-32 string to append.
1620 //! \param length The length of the string to append.
1621 //! \return A reference to our current string.
1622 ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1627 // Check for the BOM to determine the string's endianness.
1628 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1629 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1630 c_end = unicode::EUTFEE_LITTLE;
1631 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1632 c_end = unicode::EUTFEE_BIG;
1634 // If a BOM was found, don't include it in the string.
1635 const uchar32_t* c2 = other;
1636 if (c_end != unicode::EUTFEE_NATIVE)
1638 c2 = other + unicode::BOM_UTF32_LEN;
1639 length -= unicode::BOM_UTF32_LEN;
1642 // Calculate the size of the string to read in.
1644 const uchar32_t* p = c2;
1648 } while(*p++ && len < length);
1652 // If we need to grow the size of the array, do it now.
1653 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1654 if (used + (len * 2) >= allocated)
1655 reallocate(used + ((len * 2) * 2));
1658 // Convert UTF-32 to UTF-16.
1659 unicode::EUTF_ENDIAN m_end = getEndianness();
1661 for (u32 l = 0; l<len; ++l)
1665 uchar32_t ch = c2[l];
1666 if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1667 ch = unicode::swapEndian32(ch);
1671 // Split ch up into a surrogate pair as it is over 16 bits long.
1672 uchar16_t x = static_cast<uchar16_t>(ch);
1673 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1674 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1677 ++used; // Using two shorts, so increased used again.
1679 else if (ch >= 0xD800 && ch <= 0xDFFF)
1681 // Between possible UTF-16 surrogates (invalid!)
1682 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1684 else array[pos++] = static_cast<uchar16_t>(ch);
1688 // Validate our new UTF-16 string.
1695 //! Appends a ustring16 to this ustring16
1696 //! \param other The string to append to this one.
1697 //! \return A reference to our current string.
1698 ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1700 const uchar16_t* oa = other.c_str();
1702 u32 len = other.size_raw();
1704 if (used + len >= allocated)
1705 reallocate(used + len);
1707 for (u32 l=0; l<len; ++l)
1708 array[used+l] = oa[l];
1717 //! Appends a certain amount of characters of a ustring16 to this ustring16.
1718 //! \param other The string to append to this one.
1719 //! \param length How many characters of the other string to add to this one.
1720 //! \return A reference to our current string.
1721 ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1723 if (other.size() == 0)
1726 if (other.size() < length)
1732 if (used + length * 2 >= allocated)
1733 reallocate(used + length * 2);
1735 const_iterator iter(other, 0);
1737 while (!iter.atEnd() && l)
1739 uchar32_t c = *iter;
1749 //! Reserves some memory.
1750 //! \param count The amount of characters to reserve.
1751 void reserve(u32 count)
1753 if (count < allocated)
1760 //! Finds first occurrence of character.
1761 //! \param c The character to search for.
1762 //! \return Position where the character has been found, or -1 if not found.
1763 s32 findFirst(uchar32_t c) const
1765 const_iterator i(*this, 0);
1780 //! Finds first occurrence of a character of a list.
1781 //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1782 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1783 //! \return Position where one of the characters has been found, or -1 if not found.
1784 s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1789 const_iterator i(*this, 0);
1795 for (u32 j=0; j<count; ++j)
1806 //! Finds first position of a character not in a given list.
1807 //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1808 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1809 //! \return Position where the character has been found, or -1 if not found.
1810 s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1815 const_iterator i(*this, 0);
1822 for (j=0; j<count; ++j)
1835 //! Finds last position of a character not in a given list.
1836 //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1837 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1838 //! \return Position where the character has been found, or -1 if not found.
1839 s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1844 const_iterator i(end());
1847 s32 pos = size() - 1;
1848 while (!i.atStart())
1852 for (j=0; j<count; ++j)
1865 //! Finds next occurrence of character.
1866 //! \param c The character to search for.
1867 //! \param startPos The position in the string to start searching.
1868 //! \return Position where the character has been found, or -1 if not found.
1869 s32 findNext(uchar32_t c, u32 startPos) const
1871 const_iterator i(*this, startPos);
1887 //! Finds last occurrence of character.
1888 //! \param c The character to search for.
1889 //! \param start The start position of the reverse search ( default = -1, on end ).
1890 //! \return Position where the character has been found, or -1 if not found.
1891 s32 findLast(uchar32_t c, s32 start = -1) const
1894 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1896 const_iterator i(*this, start);
1898 while (!i.atStart())
1910 //! Finds last occurrence of a character in a list.
1911 //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1912 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1913 //! \return Position where one of the characters has been found, or -1 if not found.
1914 s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1919 const_iterator i(end());
1923 while (!i.atStart())
1926 for (u32 j=0; j<count; ++j)
1937 //! Finds another ustring16 in this ustring16.
1938 //! \param str The string to find.
1939 //! \param start The start position of the search.
1940 //! \return Positions where the ustring16 has been found, or -1 if not found.
1941 s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1943 u32 my_size = size();
1944 u32 their_size = str.size();
1946 if (their_size == 0 || my_size - start < their_size)
1949 const_iterator i(*this, start);
1954 const_iterator i2(i);
1955 const_iterator j(str, 0);
1956 uchar32_t t1 = (uchar32_t)*i2;
1957 uchar32_t t2 = (uchar32_t)*j;
1964 t1 = (uchar32_t)*i2;
1975 //! Finds another ustring16 in this ustring16.
1976 //! \param str The string to find.
1977 //! \param start The start position of the search.
1978 //! \return Positions where the string has been found, or -1 if not found.
1979 s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1981 const uchar16_t* data = str.c_str();
1992 for (u32 i=start; i<=used-len; ++i)
1996 while(data[j] && array[i+j] == data[j])
2008 //! Returns a substring.
2009 //! \param begin: Start of substring.
2010 //! \param length: Length of substring.
2011 //! \return A reference to our current string.
2012 ustring16<TAlloc> subString(u32 begin, s32 length) const
2015 // if start after ustring16
2016 // or no proper substring length
2017 if ((length <= 0) || (begin>=len))
2018 return ustring16<TAlloc>("");
2019 // clamp length to maximal value
2020 if ((length+begin) > len)
2023 ustring16<TAlloc> o;
2024 o.reserve((length+1) * 2);
2026 const_iterator i(*this, begin);
2027 while (!i.atEnd() && length)
2038 //! Appends a character to this ustring16.
2039 //! \param c Character to append.
2040 //! \return A reference to our current string.
2041 ustring16<TAlloc>& operator += (char c)
2043 append((uchar32_t)c);
2048 //! Appends a character to this ustring16.
2049 //! \param c Character to append.
2050 //! \return A reference to our current string.
2051 ustring16<TAlloc>& operator += (uchar32_t c)
2058 //! Appends a number to this ustring16.
2059 //! \param c Number to append.
2060 //! \return A reference to our current string.
2061 ustring16<TAlloc>& operator += (short c)
2063 append(core::stringc(c));
2068 //! Appends a number to this ustring16.
2069 //! \param c Number to append.
2070 //! \return A reference to our current string.
2071 ustring16<TAlloc>& operator += (unsigned short c)
2073 append(core::stringc(c));
2078 #ifdef USTRING_CPP0X_NEWLITERALS
2079 //! Appends a number to this ustring16.
2080 //! \param c Number to append.
2081 //! \return A reference to our current string.
2082 ustring16<TAlloc>& operator += (int c)
2084 append(core::stringc(c));
2089 //! Appends a number to this ustring16.
2090 //! \param c Number to append.
2091 //! \return A reference to our current string.
2092 ustring16<TAlloc>& operator += (unsigned int c)
2094 append(core::stringc(c));
2100 //! Appends a number to this ustring16.
2101 //! \param c Number to append.
2102 //! \return A reference to our current string.
2103 ustring16<TAlloc>& operator += (long c)
2105 append(core::stringc(c));
2110 //! Appends a number to this ustring16.
2111 //! \param c Number to append.
2112 //! \return A reference to our current string.
2113 ustring16<TAlloc>& operator += (unsigned long c)
2115 append(core::stringc(c));
2120 //! Appends a number to this ustring16.
2121 //! \param c Number to append.
2122 //! \return A reference to our current string.
2123 ustring16<TAlloc>& operator += (double c)
2125 append(core::stringc(c));
2130 //! Appends a char ustring16 to this ustring16.
2131 //! \param c Char ustring16 to append.
2132 //! \return A reference to our current string.
2133 ustring16<TAlloc>& operator += (const uchar16_t* const c)
2140 //! Appends a ustring16 to this ustring16.
2141 //! \param other ustring16 to append.
2142 //! \return A reference to our current string.
2143 ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2150 //! Replaces all characters of a given type with another one.
2151 //! \param toReplace Character to replace.
2152 //! \param replaceWith Character replacing the old one.
2153 //! \return A reference to our current string.
2154 ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2156 iterator i(*this, 0);
2159 typename ustring16<TAlloc>::access a = *i;
2160 if ((uchar32_t)a == toReplace)
2168 //! Replaces all instances of a string with another one.
2169 //! \param toReplace The string to replace.
2170 //! \param replaceWith The string replacing the old one.
2171 //! \return A reference to our current string.
2172 ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2174 if (toReplace.size() == 0)
2177 const uchar16_t* other = toReplace.c_str();
2178 const uchar16_t* replace = replaceWith.c_str();
2179 const u32 other_size = toReplace.size_raw();
2180 const u32 replace_size = replaceWith.size_raw();
2182 // Determine the delta. The algorithm will change depending on the delta.
2183 s32 delta = replace_size - other_size;
2185 // A character for character replace. The string will not shrink or grow.
2189 while ((pos = find_raw(other, pos)) != -1)
2191 for (u32 i = 0; i < replace_size; ++i)
2192 array[pos + i] = replace[i];
2198 // We are going to be removing some characters. The string will shrink.
2202 for (u32 pos = 0; pos <= used; ++i, ++pos)
2204 // Is this potentially a match?
2205 if (array[pos] == *other)
2207 // Check to see if we have a match.
2209 for (j = 0; j < other_size; ++j)
2211 if (array[pos + j] != other[j])
2215 // If we have a match, replace characters.
2216 if (j == other_size)
2218 for (j = 0; j < replace_size; ++j)
2219 array[i + j] = replace[j];
2220 i += replace_size - 1;
2221 pos += other_size - 1;
2226 // No match found, just copy characters.
2227 array[i - 1] = array[pos];
2235 // We are going to be adding characters, so the string size will increase.
2236 // Count the number of times toReplace exists in the string so we can allocate the new size.
2239 while ((pos = find_raw(other, pos)) != -1)
2245 // Re-allocate the string now, if needed.
2246 u32 len = delta * find_count;
2247 if (used + len >= allocated)
2248 reallocate(used + len);
2252 while ((pos = find_raw(other, pos)) != -1)
2254 uchar16_t* start = array + pos + other_size - 1;
2255 uchar16_t* ptr = array + used;
2256 uchar16_t* end = array + used + delta;
2258 // Shift characters to make room for the string.
2259 while (ptr != start)
2266 // Add the new string now.
2267 for (u32 i = 0; i < replace_size; ++i)
2268 array[pos + i] = replace[i];
2270 pos += replace_size;
2274 // Terminate the string and return ourself.
2280 //! Removes characters from a ustring16..
2281 //! \param c The character to remove.
2282 //! \return A reference to our current string.
2283 ustring16<TAlloc>& remove(uchar32_t c)
2287 u32 len = (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
2288 for (u32 i=0; i<=used; ++i)
2291 if (!UTF16_IS_SURROGATE_HI(array[i]))
2293 else if (i + 1 <= used)
2295 // Convert the surrogate pair into a single UTF-32 character.
2296 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2298 u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2306 array[pos++] = array[i];
2308 array[pos++] = array[++i];
2316 //! Removes a ustring16 from the ustring16.
2317 //! \param toRemove The string to remove.
2318 //! \return A reference to our current string.
2319 ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2321 u32 size = toRemove.size_raw();
2322 if (size == 0) return *this;
2324 const uchar16_t* tra = toRemove.c_str();
2327 for (u32 i=0; i<=used; ++i)
2332 if (array[i + j] != tra[j])
2343 array[pos++] = array[i];
2351 //! Removes characters from the ustring16.
2352 //! \param characters The characters to remove.
2353 //! \return A reference to our current string.
2354 ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2356 if (characters.size_raw() == 0)
2361 const_iterator iter(characters);
2362 for (u32 i=0; i<=used; ++i)
2365 if (!UTF16_IS_SURROGATE_HI(array[i]))
2367 else if (i + 1 <= used)
2369 // Convert the surrogate pair into a single UTF-32 character.
2370 uc32 = unicode::toUTF32(array[i], array[i+1]);
2372 u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2376 while (!iter.atEnd())
2378 uchar32_t c = *iter;
2381 found += (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
2390 array[pos++] = array[i];
2392 array[pos++] = array[++i];
2400 //! Trims the ustring16.
2401 //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2402 //! \param whitespace The characters that are to be considered as whitespace.
2403 //! \return A reference to our current string.
2404 ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2406 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2408 // find start and end of the substring without the specified characters
2409 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2413 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2415 return (*this = subString(begin, (end +1) - begin));
2419 //! Erases a character from the ustring16.
2420 //! May be slow, because all elements following after the erased element have to be copied.
2421 //! \param index Index of element to be erased.
2422 //! \return A reference to our current string.
2423 ustring16<TAlloc>& erase(u32 index)
2425 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2427 iterator i(*this, index);
2430 u32 len = (t > 0xFFFF ? 2 : 1);
2432 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2433 array[j - len] = array[j];
2442 //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2443 //! \return A reference to our current string.
2444 ustring16<TAlloc>& validate()
2446 // Validate all unicode characters.
2447 for (u32 i=0; i<allocated; ++i)
2449 // Terminate on existing null.
2455 if (UTF16_IS_SURROGATE(array[i]))
2457 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2458 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2459 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2460 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2463 if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2464 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2471 used = allocated - 1;
2478 //! Gets the last char of the ustring16, or 0.
2479 //! \return The last char of the ustring16, or 0.
2480 uchar32_t lastChar() const
2485 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2487 // Make sure we have a paired surrogate.
2491 // Check for an invalid surrogate.
2492 if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2495 // Convert the surrogate pair into a single UTF-32 character.
2496 return unicode::toUTF32(array[used-2], array[used-1]);
2500 return array[used-1];
2505 //! Split the ustring16 into parts.
2506 /** This method will split a ustring16 at certain delimiter characters
2507 into the container passed in as reference. The type of the container
2508 has to be given as template parameter. It must provide a push_back and
2510 \param ret The result container
2511 \param c C-style ustring16 of delimiter characters
2512 \param count Number of delimiter characters
2513 \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2514 container. If two delimiters occur without a character in between, an
2515 empty substring would be placed in the result. If this flag is set,
2516 only non-empty strings are stored.
2517 \param keepSeparators Flag which allows to add the separator to the
2518 result ustring16. If this flag is true, the concatenation of the
2519 substrings results in the original ustring16. Otherwise, only the
2520 characters between the delimiters are returned.
2521 \return The number of resulting substrings
2523 template<class container>
2524 u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2529 const_iterator i(*this);
2530 const u32 oldSize=ret.size();
2534 bool lastWasSeparator = false;
2538 bool foundSeparator = false;
2539 for (u32 j=0; j<count; ++j)
2543 if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2545 ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2546 foundSeparator = true;
2547 lastpos = (keepSeparators ? pos : pos + 1);
2548 lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2552 lastWasSeparator = foundSeparator;
2558 ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2559 return ret.size()-oldSize;
2563 //! Split the ustring16 into parts.
2564 /** This method will split a ustring16 at certain delimiter characters
2565 into the container passed in as reference. The type of the container
2566 has to be given as template parameter. It must provide a push_back and
2568 \param ret The result container
2569 \param c A unicode string of delimiter characters
2570 \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2571 container. If two delimiters occur without a character in between, an
2572 empty substring would be placed in the result. If this flag is set,
2573 only non-empty strings are stored.
2574 \param keepSeparators Flag which allows to add the separator to the
2575 result ustring16. If this flag is true, the concatenation of the
2576 substrings results in the original ustring16. Otherwise, only the
2577 characters between the delimiters are returned.
2578 \return The number of resulting substrings
2580 template<class container>
2581 u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2583 core::array<uchar32_t> v = c.toUTF32();
2584 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2588 //! Gets the size of the allocated memory buffer for the string.
2589 //! \return The size of the allocated memory buffer.
2590 u32 capacity() const
2596 //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2597 //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2598 u32 size_raw() const
2604 //! Inserts a character into the string.
2605 //! \param c The character to insert.
2606 //! \param pos The position to insert the character.
2607 //! \return A reference to our current string.
2608 ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2610 u8 len = (c > 0xFFFF ? 2 : 1);
2612 if (used + len >= allocated)
2613 reallocate(used + len);
2617 iterator iter(*this, pos);
2618 for (u32 i = used - 2; i > iter.getPos(); --i)
2619 array[i] = array[i - len];
2623 // c will be multibyte, so split it up into a surrogate pair.
2624 uchar16_t x = static_cast<uchar16_t>(c);
2625 uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2626 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2627 array[iter.getPos()] = vh;
2628 array[iter.getPos()+1] = vl;
2632 array[iter.getPos()] = static_cast<uchar16_t>(c);
2639 //! Inserts a string into the string.
2640 //! \param c The string to insert.
2641 //! \param pos The position to insert the string.
2642 //! \return A reference to our current string.
2643 ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2645 u32 len = c.size_raw();
2646 if (len == 0) return *this;
2648 if (used + len >= allocated)
2649 reallocate(used + len);
2653 iterator iter(*this, pos);
2654 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2655 array[i] = array[i - len];
2657 const uchar16_t* s = c.c_str();
2658 for (u32 i = 0; i < len; ++i)
2669 //! Inserts a character into the string.
2670 //! \param c The character to insert.
2671 //! \param pos The position to insert the character.
2672 //! \return A reference to our current string.
2673 ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2675 if (used + 1 >= allocated)
2676 reallocate(used + 1);
2680 for (u32 i = used - 1; i > pos; --i)
2681 array[i] = array[i - 1];
2689 //! Removes a character from string.
2690 //! \param pos Position of the character to remove.
2691 //! \return A reference to our current string.
2692 ustring16<TAlloc>& erase_raw(u32 pos)
2694 for (u32 i=pos; i<=used; ++i)
2696 array[i] = array[i + 1];
2704 //! Replaces a character in the string.
2705 //! \param c The new character.
2706 //! \param pos The position of the character to replace.
2707 //! \return A reference to our current string.
2708 ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2715 //! Returns an iterator to the beginning of the string.
2716 //! \return An iterator to the beginning of the string.
2719 iterator i(*this, 0);
2724 //! Returns an iterator to the beginning of the string.
2725 //! \return An iterator to the beginning of the string.
2726 const_iterator begin() const
2728 const_iterator i(*this, 0);
2733 //! Returns an iterator to the beginning of the string.
2734 //! \return An iterator to the beginning of the string.
2735 const_iterator cbegin() const
2737 const_iterator i(*this, 0);
2742 //! Returns an iterator to the end of the string.
2743 //! \return An iterator to the end of the string.
2746 iterator i(*this, 0);
2752 //! Returns an iterator to the end of the string.
2753 //! \return An iterator to the end of the string.
2754 const_iterator end() const
2756 const_iterator i(*this, 0);
2762 //! Returns an iterator to the end of the string.
2763 //! \return An iterator to the end of the string.
2764 const_iterator cend() const
2766 const_iterator i(*this, 0);
2772 //! Converts the string to a UTF-8 encoded string.
2773 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2774 //! \return A string containing the UTF-8 encoded string.
2775 core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2777 core::string<uchar8_t> ret;
2778 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2779 const_iterator iter(*this, 0);
2781 // Add the byte order mark if the user wants it.
2784 ret.append(unicode::BOM_ENCODE_UTF8[0]);
2785 ret.append(unicode::BOM_ENCODE_UTF8[1]);
2786 ret.append(unicode::BOM_ENCODE_UTF8[2]);
2789 while (!iter.atEnd())
2791 uchar32_t c = *iter;
2794 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2795 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2796 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2797 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2805 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2806 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2807 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2814 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2815 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2821 ret.append(static_cast<uchar8_t>(c));
2829 //! Converts the string to a UTF-8 encoded string array.
2830 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2831 //! \return An array containing the UTF-8 encoded string.
2832 core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2834 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2835 const_iterator iter(*this, 0);
2837 // Add the byte order mark if the user wants it.
2840 ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2841 ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2842 ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2845 while (!iter.atEnd())
2847 uchar32_t c = *iter;
2850 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2851 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2852 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2853 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2861 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2862 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2863 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2870 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2871 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2877 ret.push_back(static_cast<uchar8_t>(c));
2886 #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
2887 //! Converts the string to a UTF-16 encoded string.
2888 //! \param endian The desired endianness of the string.
2889 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2890 //! \return A string containing the UTF-16 encoded string.
2891 core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2893 core::string<char16_t> ret;
2894 ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2896 // Add the BOM if specified.
2899 if (endian == unicode::EUTFEE_NATIVE)
2900 ret[0] = unicode::BOM;
2901 else if (endian == unicode::EUTFEE_LITTLE)
2903 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ret.c_str());
2904 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2905 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2909 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ret.c_str());
2910 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2911 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2916 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2918 char16_t* ptr = ret.c_str();
2919 for (u32 i = 0; i < ret.size(); ++i)
2920 *ptr++ = unicode::swapEndian16(*ptr);
2927 //! Converts the string to a UTF-16 encoded string array.
2928 //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2929 //! \param endian The desired endianness of the string.
2930 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2931 //! \return An array containing the UTF-16 encoded string.
2932 core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2934 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2935 uchar16_t* ptr = ret.pointer();
2937 // Add the BOM if specified.
2940 if (endian == unicode::EUTFEE_NATIVE)
2941 *ptr = unicode::BOM;
2942 else if (endian == unicode::EUTFEE_LITTLE)
2944 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2945 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2946 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2950 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2951 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2952 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2957 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2958 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2960 for (u32 i = 0; i <= used; ++i)
2961 *ptr++ = unicode::swapEndian16(*ptr);
2963 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2969 #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
2970 //! Converts the string to a UTF-32 encoded string.
2971 //! \param endian The desired endianness of the string.
2972 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2973 //! \return A string containing the UTF-32 encoded string.
2974 core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2976 core::string<char32_t> ret;
2977 ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
2978 const_iterator iter(*this, 0);
2980 // Add the BOM if specified.
2983 if (endian == unicode::EUTFEE_NATIVE)
2984 ret.append(unicode::BOM);
2993 if (endian == unicode::EUTFEE_LITTLE)
2995 t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
2996 t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
2997 t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
2998 t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3002 t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3003 t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3004 t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3005 t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3011 while (!iter.atEnd())
3013 uchar32_t c = *iter;
3014 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3015 c = unicode::swapEndian32(c);
3024 //! Converts the string to a UTF-32 encoded string array.
3025 //! A toUTF32_s() counterpart returning a string is only available when USTRING_CPP0X_NEWLITERALS is defined.
3026 //! \param endian The desired endianness of the string.
3027 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3028 //! \return An array containing the UTF-32 encoded string.
3029 core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
// The constructor argument is size() plus the optional BOM length plus one extra slot.
3031 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
// Iterator positioned at index 0 of this string.
3032 const_iterator iter(*this, 0);
3034 // Add the BOM if specified.
// Native byte order: the BOM constant is pushed unchanged.
3037 if (endian == unicode::EUTFEE_NATIVE)
3038 ret.push_back(unicode::BOM);
// Explicit byte order: fill the four bytes from the matching pre-encoded table, then push them as one 32-bit value.
3047 if (endian == unicode::EUTFEE_LITTLE)
3049 t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3050 t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3051 t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3052 t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3056 t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3057 t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3058 t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3059 t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3061 ret.push_back(t.full);
// Walk the string until the iterator reports the end, reading one 32-bit value per step.
3066 while (!iter.atEnd())
3068 uchar32_t c = *iter;
// Bytes are swapped only when a specific endianness was requested and it differs from getEndianness().
3069 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3070 c = unicode::swapEndian32(c);
3078 //! Converts the string to a wchar_t encoded string.
3079 /** The size of a wchar_t changes depending on the platform. This function will store a
3080 correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3081 //! \param endian The desired endianness of the string.
3082 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3083 //! \return A string containing the wchar_t encoded string.
3084 core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
// 4-byte wchar_t: convert through toUTF32() and build the wide string from the array's raw pointer.
// NOTE(review): constructing from a bare pointer presumably relies on the array being null-terminated - confirm against toUTF32().
3086 if (sizeof(wchar_t) == 4)
3088 core::array<uchar32_t> a(toUTF32(endian, addBOM));
3089 core::stringw ret(a.pointer());
// 2-byte wchar_t: the internal buffer can be used directly when neither a byte swap nor a BOM is requested.
3092 else if (sizeof(wchar_t) == 2)
3094 if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3096 core::stringw ret(array);
// Otherwise go through toUTF16(), which receives the requested endianness and BOM flag.
3101 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3102 core::stringw ret(a.pointer());
// 1-byte wchar_t: fall back to UTF-8; the endian argument is not passed to toUTF8().
3106 else if (sizeof(wchar_t) == 1)
3108 core::array<uchar8_t> a(toUTF8(addBOM));
3109 core::stringw ret(a.pointer());
3113 // Shouldn't happen.
// Fallback for any other wchar_t size: an empty wide string.
3114 return core::stringw();
3118 //! Converts the string to a wchar_t encoded string array.
3119 /** The size of a wchar_t changes depending on the platform. This function will store a
3120 correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3121 //! \param endian The desired endianness of the string.
3122 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3123 //! \return An array containing the wchar_t encoded string.
3124 core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
// 4-byte wchar_t: convert through toUTF32(), then copy the raw bytes into a wchar_t array with the same element count.
3126 if (sizeof(wchar_t) == 4)
3128 core::array<uchar32_t> a(toUTF32(endian, addBOM));
3129 core::array<wchar_t> ret(a.size());
3130 ret.set_used(a.size());
3131 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
// 2-byte wchar_t. This is a plain `if`, not `else if` as in toWCHAR_s().
3134 if (sizeof(wchar_t) == 2)
// Fast path: no byte swap and no BOM, so `used` elements are copied straight from the internal buffer.
3136 if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3138 core::array<wchar_t> ret(used);
3140 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
// Otherwise convert through toUTF16() and copy its contents byte-for-byte.
3145 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3146 core::array<wchar_t> ret(a.size());
3147 ret.set_used(a.size());
3148 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
// 1-byte wchar_t: use the UTF-8 conversion; the endian argument is not passed to toUTF8().
3152 if (sizeof(wchar_t) == 1)
3154 core::array<uchar8_t> a(toUTF8(addBOM));
3155 core::array<wchar_t> ret(a.size());
3156 ret.set_used(a.size());
3157 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3161 // Shouldn't happen.
// Fallback for any other wchar_t size: an empty array.
3162 return core::array<wchar_t>();
3165 //! Converts the string to a properly encoded io::path string.
3166 //! \param endian The desired endianness of the string. Only forwarded on the toWCHAR_s() path; the toUTF8_s() call does not take it.
3167 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3168 //! \return An io::path string containing the properly encoded string.
3169 io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
// When _IRR_WCHAR_FILESYSTEM is defined the path is produced by the wchar_t conversion.
3171 #if defined(_IRR_WCHAR_FILESYSTEM)
3172 return toWCHAR_s(endian, addBOM);
// The other return converts to UTF-8.
3174 return toUTF8_s(addBOM);
3178 //! Loads an unknown stream of data.
3179 //! Will attempt to determine if the stream is unicode data. Useful for loading from files.
3180 //! \param data The data stream to load from.
3181 //! \param data_size The length of the data in bytes; it is divided by 2 or 4 to get the element count for UTF-16 or UTF-32 input.
3182 //! \return A reference to our current string.
3183 ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3185 // Clear our string.
// The encoding is decided from the start of the buffer; data_size is not passed to the detector.
// NOTE(review): confirm determineUnicodeBOM() is safe for buffers shorter than a BOM.
3190 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
// UTF-8: the byte count is the element count.
3194 case unicode::EUTFE_UTF8:
3195 append((uchar8_t*)data, data_size);
// All UTF-16 variants share one branch; the buffer is reinterpreted as 16-bit units.
// NOTE(review): the cast assumes data is suitably aligned for uchar16_t - confirm for callers passing arbitrary byte buffers.
3198 case unicode::EUTFE_UTF16:
3199 case unicode::EUTFE_UTF16_BE:
3200 case unicode::EUTFE_UTF16_LE:
3201 append((uchar16_t*)data, data_size / 2);
// All UTF-32 variants share one branch; the buffer is reinterpreted as 32-bit units (same alignment caveat).
3204 case unicode::EUTFE_UTF32:
3205 case unicode::EUTFE_UTF32_BE:
3206 case unicode::EUTFE_UTF32_LE:
3207 append((uchar32_t*)data, data_size / 4);
3214 //! Gets the encoding of the Unicode string this class contains.
3215 //! \return An enum describing the current encoding of this string.
// The enum is returned by value, so the leading `const` on the return type has no effect for callers.
3216 const unicode::EUTF_ENCODE getEncoding() const
3221 //! Gets the endianness of the Unicode string this class contains.
3222 //! \return An enum describing the endianness of this string.
// The result is derived from the `encoding` member; it is never EUTFEE_NATIVE.
3223 const unicode::EUTF_ENDIAN getEndianness() const
// Only the two explicit little-endian encodings report EUTFEE_LITTLE.
3225 if (encoding == unicode::EUTFE_UTF16_LE ||
3226 encoding == unicode::EUTFE_UTF32_LE)
3227 return unicode::EUTFEE_LITTLE;
// Every other encoding value is reported as big-endian.
3228 else return unicode::EUTFEE_BIG;
3233 //! Reallocate the string, making it bigger or smaller.
3234 //! \param new_size The new size of the string. One extra element is allocated on top of this value.
3235 void reallocate(u32 new_size)
// Keep the old buffer so its contents can be copied and it can be released afterwards.
3237 uchar16_t* old_array = array;
// Allocate new_size + 1 elements and record that count as the capacity.
3239 array = allocator.allocate(new_size + 1); //new u16[new_size];
3240 allocated = new_size + 1;
// First allocation: there is nothing to copy or free. No write to the new buffer is visible before this return.
3241 if (old_array == 0) return;
// Copy the smaller of the current length and the new size. The inclusive bound copies one element past
// `amount` (presumably the terminator slot when used <= new_size).
3243 u32 amount = used < new_size ? used : new_size;
3244 for (u32 i=0; i<=amount; ++i)
3245 array[i] = old_array[i];
// Shrinking: clamp the length so it fits inside the new capacity.
3247 if (allocated <= used)
3248 used = allocated - 1;
// Release the previous buffer through the same allocator.
3252 allocator.deallocate(old_array); // delete [] old_array;
3255 //--- member variables
// Encoding tag of the stored data; getEndianness() reads it to derive the byte order.
3258 unicode::EUTF_ENCODE encoding;
// Disabled declaration kept as a comment: it shows the allocator with a fixed irrAllocator<uchar16_t> type.
3262 //irrAllocator<uchar16_t> allocator;
//! Convenience alias: ustring16 instantiated with irrAllocator<uchar16_t> as its allocator.
3265 typedef ustring16<irrAllocator<uchar16_t> > ustring;
// Copying operator+ overloads. Both operands are taken as const (by reference, pointer-to-const or
// value), so neither argument is modified; each overload starts from a new ustring16 named `ret`
// that is constructed from the left operand.
3268 //! Appends two ustring16s.
3269 template <typename TAlloc>
3270 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3272 ustring16<TAlloc> ret(left);
3278 //! Appends a ustring16 and a null-terminated unicode string.
3279 template <typename TAlloc, class B>
3280 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3282 ustring16<TAlloc> ret(left);
3288 //! Prepends a null-terminated unicode string to a ustring16 (the pointer is the left operand).
3289 template <class B, typename TAlloc>
3290 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
// `ret` is built from the raw pointer here.
3292 ustring16<TAlloc> ret(left);
3298 //! Appends a ustring16 and an Irrlicht string.
3299 template <typename TAlloc, typename B, typename BAlloc>
3300 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3302 ustring16<TAlloc> ret(left);
3308 //! Prepends an Irrlicht string to a ustring16 (the Irrlicht string is the left operand).
3309 template <typename TAlloc, typename B, typename BAlloc>
3310 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
// `ret` is built from the Irrlicht string here.
3312 ustring16<TAlloc> ret(left);
3318 //! Appends a ustring16 and a std::basic_string.
3319 template <typename TAlloc, typename B, typename A, typename BAlloc>
3320 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3322 ustring16<TAlloc> ret(left);
3328 //! Prepends a std::basic_string to a ustring16 (the std::basic_string is the left operand).
3329 template <typename TAlloc, typename B, typename A, typename BAlloc>
3330 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
// `ret` is built from the std::basic_string here.
3332 ustring16<TAlloc> ret(left);
3338 //! Appends a ustring16 and a char.
3339 template <typename TAlloc>
3340 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3342 ustring16<TAlloc> ret(left);
3348 //! Prepends a char to a ustring16 (the char is the left operand).
3349 template <typename TAlloc>
3350 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
// `ret` is built from the single char here.
3352 ustring16<TAlloc> ret(left);
3358 #ifdef USTRING_CPP0X_NEWLITERALS
// uchar32_t overloads; they sit behind the USTRING_CPP0X_NEWLITERALS guard.
3359 //! Appends a ustring16 and a uchar32_t.
3360 template <typename TAlloc>
3361 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
// Work on a copy so the const left operand stays untouched.
3363 ustring16<TAlloc> ret(left);
3369 //! Prepends a uchar32_t to a ustring16 (the uchar32_t is the left operand).
3370 template <typename TAlloc>
3371 inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
// `ret` is built from the single uchar32_t here.
3373 ustring16<TAlloc> ret(left);
// Numeric operator+ overloads. The number is always turned into text with core::stringc first:
// on the right-hand side that text is added to a copy of `left` with +=, and on the left-hand
// side the result string `ret` is constructed from that text.
3380 //! Appends a ustring16 and a short.
3381 template <typename TAlloc>
3382 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3384 ustring16<TAlloc> ret(left);
3385 ret += core::stringc(right);
3390 //! Prepends the text form of a short to a ustring16 (the number is the left operand).
3391 template <typename TAlloc>
3392 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3394 ustring16<TAlloc> ret(core::stringc(left));
3400 //! Appends a ustring16 and an unsigned short.
3401 template <typename TAlloc>
3402 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3404 ustring16<TAlloc> ret(left);
3405 ret += core::stringc(right);
3410 //! Prepends the text form of an unsigned short to a ustring16 (the number is the left operand).
3411 template <typename TAlloc>
3412 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3414 ustring16<TAlloc> ret(core::stringc(left));
3420 //! Appends a ustring16 and an int.
3421 template <typename TAlloc>
3422 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3424 ustring16<TAlloc> ret(left);
3425 ret += core::stringc(right);
3430 //! Prepends the text form of an int to a ustring16 (the number is the left operand).
3431 template <typename TAlloc>
3432 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3434 ustring16<TAlloc> ret(core::stringc(left));
3440 //! Appends a ustring16 and an unsigned int.
3441 template <typename TAlloc>
3442 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3444 ustring16<TAlloc> ret(left);
3445 ret += core::stringc(right);
3450 //! Prepends the text form of an unsigned int to a ustring16 (the number is the left operand).
3451 template <typename TAlloc>
3452 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3454 ustring16<TAlloc> ret(core::stringc(left));
3460 //! Appends a ustring16 and a long.
3461 template <typename TAlloc>
3462 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3464 ustring16<TAlloc> ret(left);
3465 ret += core::stringc(right);
3470 //! Prepends the text form of a long to a ustring16 (the number is the left operand).
3471 template <typename TAlloc>
3472 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3474 ustring16<TAlloc> ret(core::stringc(left));
3480 //! Appends a ustring16 and an unsigned long.
3481 template <typename TAlloc>
3482 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3484 ustring16<TAlloc> ret(left);
3485 ret += core::stringc(right);
3490 //! Prepends the text form of an unsigned long to a ustring16 (the number is the left operand).
3491 template <typename TAlloc>
3492 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3494 ustring16<TAlloc> ret(core::stringc(left));
3500 //! Appends a ustring16 and a float.
3501 template <typename TAlloc>
3502 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3504 ustring16<TAlloc> ret(left);
3505 ret += core::stringc(right);
3510 //! Prepends the text form of a float to a ustring16 (the number is the left operand).
3511 template <typename TAlloc>
3512 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3514 ustring16<TAlloc> ret(core::stringc(left));
3520 //! Appends a ustring16 and a double.
3521 template <typename TAlloc>
3522 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3524 ustring16<TAlloc> ret(left);
3525 ret += core::stringc(right);
3530 //! Prepends the text form of a double to a ustring16 (the number is the left operand).
3531 template <typename TAlloc>
3532 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3534 ustring16<TAlloc> ret(core::stringc(left));
3540 #ifdef USTRING_CPP0X
3541 //! Appends two ustring16s.
3542 template <typename TAlloc>
3543 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3545 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3546 right.insert(left, 0);
3547 return std::move(right);
3551 //! Appends two ustring16s.
3552 template <typename TAlloc>
3553 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3555 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3557 return std::move(left);
3561 //! Appends two ustring16s.
3562 template <typename TAlloc>
3563 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3565 //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3566 if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3567 (right.capacity() - right.size_raw() < left.size_raw()))
3570 return std::move(left);
3574 right.insert(left, 0);
3575 return std::move(right);
3580 //! Appends a ustring16 and a null-terminated unicode string.
3581 template <typename TAlloc, class B>
3582 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3584 //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3586 return std::move(left);
3590 //! Appends a ustring16 and a null-terminated unicode string.
3591 template <class B, typename TAlloc>
3592 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3594 //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3595 right.insert(left, 0);
3596 return std::move(right);
3600 //! Appends a ustring16 and an Irrlicht string.
3601 template <typename TAlloc, typename B, typename BAlloc>
3602 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3604 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3605 right.insert(left, 0);
3606 return std::move(right);
3610 //! Appends a ustring16 and an Irrlicht string.
3611 template <typename TAlloc, typename B, typename BAlloc>
3612 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3614 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3616 return std::move(left);
3620 //! Appends a ustring16 and a std::basic_string.
3621 template <typename TAlloc, typename B, typename A, typename BAlloc>
3622 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3624 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3625 right.insert(core::ustring16<TAlloc>(left), 0);
3626 return std::move(right);
3630 //! Appends a ustring16 and a std::basic_string.
3631 template <typename TAlloc, typename B, typename A, typename BAlloc>
3632 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3634 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3636 return std::move(left);
3640 //! Appends a ustring16 and a char.
3641 template <typename TAlloc>
3642 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3644 left.append((uchar32_t)right);
3645 return std::move(left);
3649 //! Appends a ustring16 and a char.
3650 template <typename TAlloc>
3651 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3653 right.insert((uchar32_t)left, 0);
3654 return std::move(right);
3658 #ifdef USTRING_CPP0X_NEWLITERALS
// uchar32_t overloads taking a temporary ustring16; they sit behind the USTRING_CPP0X_NEWLITERALS guard.
3659 //! Appends a ustring16 and a uchar32_t.
3660 template <typename TAlloc>
3661 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
// The temporary itself is moved into the by-value result.
3664 return std::move(left);
3668 //! Prepends a uchar32_t to a temporary ustring16 (the uchar32_t is the left operand).
3669 template <typename TAlloc>
3670 inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
// Insert at position 0 of the temporary, then move it into the by-value result.
3672 right.insert(left, 0);
3673 return std::move(right);
3678 //! Appends a ustring16 and a short.
3679 template <typename TAlloc>
3680 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3682 left.append(core::stringc(right));
3683 return std::move(left);
3687 //! Appends a ustring16 and a short.
3688 template <typename TAlloc>
3689 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3691 right.insert(core::stringc(left), 0);
3692 return std::move(right);
3696 //! Appends a ustring16 and an unsigned short.
3697 template <typename TAlloc>
3698 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3700 left.append(core::stringc(right));
3701 return std::move(left);
3705 //! Appends a ustring16 and an unsigned short.
3706 template <typename TAlloc>
3707 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3709 right.insert(core::stringc(left), 0);
3710 return std::move(right);
3714 //! Appends a ustring16 and an int.
3715 template <typename TAlloc>
3716 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3718 left.append(core::stringc(right));
3719 return std::move(left);
3723 //! Appends a ustring16 and an int.
3724 template <typename TAlloc>
3725 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3727 right.insert(core::stringc(left), 0);
3728 return std::move(right);
3732 //! Appends a ustring16 and an unsigned int.
3733 template <typename TAlloc>
3734 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3736 left.append(core::stringc(right));
3737 return std::move(left);
3741 //! Appends a ustring16 and an unsigned int.
3742 template <typename TAlloc>
3743 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3745 right.insert(core::stringc(left), 0);
3746 return std::move(right);
3750 //! Appends a ustring16 and a long.
3751 template <typename TAlloc>
3752 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3754 left.append(core::stringc(right));
3755 return std::move(left);
3759 //! Appends a ustring16 and a long.
3760 template <typename TAlloc>
3761 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3763 right.insert(core::stringc(left), 0);
3764 return std::move(right);
3768 //! Appends a ustring16 and an unsigned long.
3769 template <typename TAlloc>
3770 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3772 left.append(core::stringc(right));
3773 return std::move(left);
3777 //! Appends a ustring16 and an unsigned long.
3778 template <typename TAlloc>
3779 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3781 right.insert(core::stringc(left), 0);
3782 return std::move(right);
3786 //! Appends a ustring16 and a float.
3787 template <typename TAlloc>
3788 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3790 left.append(core::stringc(right));
3791 return std::move(left);
3795 //! Appends a ustring16 and a float.
3796 template <typename TAlloc>
3797 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3799 right.insert(core::stringc(left), 0);
3800 return std::move(right);
3804 //! Appends a ustring16 and a double.
3805 template <typename TAlloc>
3806 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3808 left.append(core::stringc(right));
3809 return std::move(left);
3813 //! Appends a ustring16 and a double.
3814 template <typename TAlloc>
3815 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3817 right.insert(core::stringc(left), 0);
3818 return std::move(right);
3823 #ifndef USTRING_NO_STL
3824 //! Writes a ustring16 to an ostream.
//! \param out Narrow output stream to write to.
//! \param in String to write; it is not modified.
3825 template <typename TAlloc>
3826 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
// The text is converted with toUTF8_s() using its default arguments and streamed through c_str().
3828 out << in.toUTF8_s().c_str();
3832 //! Writes a ustring16 to a wostream.
//! \param out Wide output stream to write to.
//! \param in String to write; it is not modified.
3833 template <typename TAlloc>
3834 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
// The text is converted with toWCHAR_s() using its default arguments (native endianness, no BOM) and streamed through c_str().
3836 out << in.toWCHAR_s().c_str();
3842 #ifndef USTRING_NO_STL
3847 //! Hashing algorithm for hashing a ustring. Used for things like unordered_maps.
3848 //! Algorithm taken from std::hash<std::string>.
// NOTE(review): std::unary_function was deprecated in C++11 and removed in C++17; this base class
// will not compile against standard libraries that dropped it.
3849 class hash : public std::unary_function<core::ustring, size_t>
//! \param s The string to hash.
//! \return The accumulated hash value.
3852 size_t operator()(const core::ustring& s) const
// Seed value; together with the multiplier below it matches the 32-bit FNV offset basis and prime.
3854 size_t ret = 2166136261U;
// The step grows with the raw length (1 + size_raw() / 10), so long strings are sampled, not fully read.
3856 size_t stride = 1 + s.size_raw() / 10;
3858 core::ustring::const_iterator i = s.begin();
3859 while (i != s.end())
3861 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
// `*` binds tighter than `^`: the running hash is multiplied first, then XORed with the character at `index`.
3862 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3870 } // end namespace unicode
3874 } // end namespace core
3875 } // end namespace irr