From 43dab2ffc899133fbd8bb4bf4d209a3bbc70b901 Mon Sep 17 00:00:00 2001 From: est31 Date: Wed, 17 Jun 2015 22:10:22 +0200 Subject: [PATCH] Make wrap_rows not wrap inside utf-8 multibyte sequences Also count multibyte sequences as "one" character. Adds unittest for the bug reporter's case. Fixes #2796. --- src/unittest/test_utilities.cpp | 8 ++++++++ src/util/string.h | 13 +++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/unittest/test_utilities.cpp b/src/unittest/test_utilities.cpp index 6a731c662..9678a81eb 100644 --- a/src/unittest/test_utilities.cpp +++ b/src/unittest/test_utilities.cpp @@ -242,6 +242,14 @@ void TestUtilities::testUTF8() void TestUtilities::testWrapRows() { UASSERT(wrap_rows("12345678",4) == "1234\n5678"); + // test that wrap_rows doesn't wrap inside multibyte sequences + const unsigned char s[] = { + 0x2f, 0x68, 0x6f, 0x6d, 0x65, 0x2f, 0x72, 0x61, 0x70, 0x74, 0x6f, + 0x72, 0x2f, 0xd1, 0x82, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0x2f, + 0x6d, 0x69, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x74, 0x2f, 0x62, 0x69, + 0x6e, 0x2f, 0x2e, 0x2e, 0}; + std::string str((char *)s); + UASSERT(utf8_to_wide(wrap_rows(str, 20)) != L""); } diff --git a/src/util/string.h b/src/util/string.h index 5bf2b5b7c..72d3c6075 100644 --- a/src/util/string.h +++ b/src/util/string.h @@ -32,6 +32,9 @@ with this program; if not, write to the Free Software Foundation, Inc., #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) +// Checks whether a byte is an inner (continuation) byte of a utf-8 multibyte sequence, i.e. 10xxxxxx (0x80..0xbf) +#define IS_UTF8_MULTB_INNER(x) (((unsigned char)(x) >= 0x80) && ((unsigned char)(x) <= 0xbf)) + typedef std::map StringMap; struct FlagDesc { @@ -411,7 +414,10 @@ inline bool string_allowed_blacklist(const std::string &str, * every \p row_len characters whether it breaks a word or not. It is * intended to be used for, for example, showing paths in the GUI. * - * @param from The string to be wrapped into rows. 
+ * @note This function doesn't wrap inside utf-8 multibyte sequences and also + * counts multibyte sequences correctly as single characters. + * + * @param from The (utf-8) string to be wrapped into rows. * @param row_len The row length (in characters). * @return A new string with the wrapping applied. */ @@ -420,9 +426,12 @@ inline std::string wrap_rows(const std::string &from, { std::string to; + size_t character_idx = 0; for (size_t i = 0; i < from.size(); i++) { - if (i != 0 && i % row_len == 0) + if (!IS_UTF8_MULTB_INNER(from[i]) && character_idx > 0 && character_idx % row_len == 0) to += '\n'; + if (!IS_UTF8_MULTB_INNER(from[i])) + character_idx++; to += from[i]; } -- 2.25.1