From d8c28232c34d4be9984eda52d69d0920678d937d Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Fri, 31 Aug 2018 21:31:27 +0200 Subject: [PATCH] lib: charset: utility functions for Unicode utf8_get() - get next UTF-8 code point from buffer utf8_put() - write UTF-8 code point to buffer utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16 utf8_utf16_strncpy() - copy a utf-8 string to utf-16 utf16_get() - get next UTF-16 code point from buffer utf16_put() - write UTF-16 code point to buffer utf16_strnlen() - number of codes points in a utf-16 string utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8 utf16_utf8_strncpy() - copy a utf-16 string to utf-8 Signed-off-by: Heinrich Schuchardt Signed-off-by: Alexander Graf --- include/charset.h | 130 +++++++++++++++++++++++++ lib/charset.c | 236 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 363 insertions(+), 3 deletions(-) diff --git a/include/charset.h b/include/charset.h index 2c6deb8034..cf41eb5e5f 100644 --- a/include/charset.h +++ b/include/charset.h @@ -8,10 +8,140 @@ #ifndef __CHARSET_H_ #define __CHARSET_H_ +#include #include #define MAX_UTF8_PER_UTF16 3 +/** + * utf8_get() - get next UTF-8 code point from buffer + * + * @src: pointer to current byte, updated to point to next byte + * Return: code point, or 0 for end of string, or -1 if no legal + * code point is found. In case of an error src points to + * the incorrect byte. + */ +s32 utf8_get(const char **src); + +/** + * utf8_put() - write UTF-8 code point to buffer + * + * @code: code point + * @dst: pointer to destination buffer, updated to next position + * Return: -1 if the input parameters are invalid + */ +int utf8_put(s32 code, char **dst); + +/** + * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion + * to utf-16 + * + * @src: utf-8 string + * @count: maximum number of code points to convert + * Return: length in bytes after conversion to utf-16 without the + * trailing \0. If an invalid UTF-8 sequence is hit one + * word will be reserved for a replacement character. + */ +size_t utf8_utf16_strnlen(const char *src, size_t count); + +/** + * utf8_utf16_strlen() - length of a utf-8 string after conversion to utf-16 + * + * @src: utf-8 string + * Return: length in bytes after conversion to utf-16 without the + * trailing \0. -1 if the utf-8 string is not valid. + */ +#define utf8_utf16_strlen(a) utf8_utf16_strnlen((a), SIZE_MAX) + +/** + * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string + * + * @dst: destination buffer + * @src: source buffer + * @count: maximum number of code points to copy + * Return: -1 if the input parameters are invalid + */ +int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count); + +/** + * utf8_utf16_strcpy() - copy utf-8 string to utf-16 string + * + * @dst: destination buffer + * @src: source buffer + * Return: -1 if the input parameters are invalid + */ +#define utf8_utf16_strcpy(d, s) utf8_utf16_strncpy((d), (s), SIZE_MAX) + +/** + * utf16_get() - get next UTF-16 code point from buffer + * + * @src: pointer to current word, updated to point to next word + * Return: code point, or 0 for end of string, or -1 if no legal + * code point is found. In case of an error src points to + * the incorrect word. + */ +s32 utf16_get(const u16 **src); + +/** + * utf16_put() - write UTF-16 code point to buffer + * + * @code: code point + * @dst: pointer to destination buffer, updated to next position + * Return: -1 if the input parameters are invalid + */ +int utf16_put(s32 code, u16 **dst); + +/** + * utf16_strnlen() - length of a truncated utf-16 string + * + * @src: utf-16 string + * @count: maximum number of code points to convert + * Return: length in code points. If an invalid UTF-16 sequence is + * hit one position will be reserved for a replacement + * character. + */ +size_t utf16_strnlen(const u16 *src, size_t count); + +/** + * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion + * to utf-8 + * + * @src: utf-16 string + * @count: maximum number of code points to convert + * Return: length in bytes after conversion to utf-8 without the + * trailing \0. If an invalid UTF-16 sequence is hit one + * byte will be reserved for a replacement character. + */ +size_t utf16_utf8_strnlen(const u16 *src, size_t count); + +/** + * utf16_utf8_strlen() - length of a utf-16 string after conversion to utf-8 + * + * @src: utf-16 string + * Return: length in bytes after conversion to utf-8 without the + * trailing \0. -1 if the utf-16 string is not valid. + */ +#define utf16_utf8_strlen(a) utf16_utf8_strnlen((a), SIZE_MAX) + +/** + * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string + * + * @dst: destination buffer + * @src: source buffer + * @count: maximum number of code points to copy + * Return: -1 if the input parameters are invalid + */ +int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count); + +/** + * utf16_utf8_strcpy() - copy utf-16 string to utf-8 string + * + * @dst: destination buffer + * @src: source buffer + * Return: -1 if the input parameters are invalid + */ +#define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX) + /** * u16_strlen - count non-zero words * diff --git a/lib/charset.c b/lib/charset.c index 8ff8d59957..e82622a7f8 100644 --- a/lib/charset.c +++ b/lib/charset.c @@ -8,9 +8,239 @@ #include #include -/* - * utf8/utf16 conversion mostly lifted from grub - */ +s32 utf8_get(const char **src) +{ + s32 code = 0; + unsigned char c; + + if (!src || !*src) + return -1; + if (!**src) + return 0; + c = **src; + if (c >= 0x80) { + ++*src; + if (!**src) + return -1; + /* + * We do not expect a continuation byte (0x80 - 0xbf). + * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2 + * here. + * The highest code point is 0x10ffff which is coded as + * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4. + */ + if (c < 0xc2 || code > 0xf4) + return -1; + if (c >= 0xe0) { + if (c >= 0xf0) { + /* 0xf0 - 0xf4 */ + c &= 0x07; + code = c << 18; + c = **src; + ++*src; + if (!**src) + return -1; + if (c < 0x80 || c > 0xbf) + return -1; + c &= 0x3f; + } else { + /* 0xe0 - 0xef */ + c &= 0x0f; + } + code += c << 12; + if ((code >= 0xD800 && code <= 0xDFFF) || + code >= 0x110000) + return -1; + c = **src; + ++*src; + if (!**src) + return -1; + if (c < 0x80 || c > 0xbf) + return -1; + } + /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ + c &= 0x3f; + code += c << 6; + c = **src; + if (c < 0x80 || c > 0xbf) + return -1; + c &= 0x3f; + } + code += c; + ++*src; + return code; +} + +int utf8_put(s32 code, char **dst) +{ + if (!dst || !*dst) + return -1; + if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) + return -1; + if (code <= 0x007F) { + **dst = code; + } else { + if (code <= 0x07FF) { + **dst = code >> 6 | 0xC0; + } else { + if (code < 0x10000) { + **dst = code >> 12 | 0xE0; + } else { + **dst = code >> 18 | 0xF0; + ++*dst; + **dst = (code >> 12 & 0x3F) | 0x80; + } + ++*dst; + **dst = (code >> 6 & 0x3F) | 0x80; + } + ++*dst; + **dst = (code & 0x3F) | 0x80; + } + ++*dst; + return 0; +} + +size_t utf8_utf16_strnlen(const char *src, size_t count) +{ + size_t len = 0; + + for (; *src && count; --count) { + s32 code = utf8_get(&src); + + if (!code) + break; + if (code < 0) { + /* Reserve space for a replacement character */ + len += 1; + } else if (code < 0x10000) { + len += 1; + } else { + len += 2; + } + } + return len; +} + +int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) +{ + if (!src || !dst || !*dst) + return -1; + + for (; count && *src; --count) { + s32 code = utf8_get(&src); + + if (code < 0) + code = '?'; + utf16_put(code, dst); + } + **dst = 0; + return 0; +} + +s32 utf16_get(const u16 **src) +{ + s32 code, code2; + + if (!src || !*src) + return -1; + if (!**src) + return 0; + code = **src; + ++*src; + if (code >= 0xDC00 && code <= 0xDFFF) + return -1; + if (code >= 0xD800 && code <= 0xDBFF) { + if (!**src) + return -1; + code &= 0x3ff; + code <<= 10; + code += 0x10000; + code2 = **src; + ++*src; + if (code2 <= 0xDC00 || code2 >= 0xDFFF) + return -1; + code2 &= 0x3ff; + code += code2; + } + return code; +} + +int utf16_put(s32 code, u16 **dst) +{ + if (!dst || !*dst) + return -1; + if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) + return -1; + if (code < 0x10000) { + **dst = code; + } else { + code -= 0x10000; + **dst = code >> 10 | 0xD800; + ++*dst; + **dst = (code & 0x3ff) | 0xDC00; + } + ++*dst; + return 0; +} + +size_t utf16_strnlen(const u16 *src, size_t count) +{ + size_t len = 0; + + for (; *src && count; --count) { + s32 code = utf16_get(&src); + + if (!code) + break; + /* + * In case of an illegal sequence still reserve space for a + * replacement character. + */ + ++len; + } + return len; +} + +size_t utf16_utf8_strnlen(const u16 *src, size_t count) +{ + size_t len = 0; + + for (; *src && count; --count) { + s32 code = utf16_get(&src); + + if (!code) + break; + if (code < 0) + /* Reserve space for a replacement character */ + len += 1; + else if (code < 0x80) + len += 1; + else if (code < 0x800) + len += 2; + else if (code < 0x10000) + len += 3; + else + len += 4; + } + return len; +} + +int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) +{ + if (!src || !dst || !*dst) + return -1; + + for (; count && *src; --count) { + s32 code = utf16_get(&src); + + if (code < 0) + code = '?'; + utf8_put(code, dst); + } + **dst = 0; + return 0; +} + size_t u16_strlen(const u16 *in) { -- 2.25.1