X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=libbb%2Funicode.c;h=99dc1dfa6afee28e7834b0f98fdd32a0ab79b6ac;hb=b2320370be14811459718b9fe418efed75ea3615;hp=d1c6167c78fa5506f11a20c13ac77dc180d3a385;hpb=a659b81dfa435aa19130a8c7dd1bfe8fa9a22131;p=oweals%2Fbusybox.git diff --git a/libbb/unicode.c b/libbb/unicode.c index d1c6167c7..99dc1dfa6 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c @@ -4,7 +4,7 @@ * * Copyright (C) 2009 Denys Vlasenko * - * Licensed under GPL version 2, see file LICENSE in this tarball for details. + * Licensed under GPLv2, see file LICENSE in this source tree. */ #include "libbb.h" #include "unicode.h" @@ -23,15 +23,24 @@ uint8_t unicode_status; /* Unicode support using libc locale support. */ -void FAST_FUNC init_unicode(void) +void FAST_FUNC reinit_unicode(const char *LANG) { - /* In unicode, this is a one character string */ static const char unicode_0x394[] = { 0xce, 0x94, 0 }; + size_t width; - if (unicode_status != UNICODE_UNKNOWN) - return; +//TODO: avoid repeated calls by caching last string? + setlocale(LC_ALL, (LANG && LANG[0]) ? LANG : "C"); + + /* In unicode, this is a one character string */ +// can use unicode_strlen(string) too, but otherwise unicode_strlen() is unused + width = mbstowcs(NULL, unicode_0x394, INT_MAX); + unicode_status = (width == 1 ? UNICODE_ON : UNICODE_OFF); +} - unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF; +void FAST_FUNC init_unicode(void) +{ + if (unicode_status == UNICODE_UNKNOWN) + reinit_unicode(getenv("LANG")); } #else @@ -39,19 +48,19 @@ void FAST_FUNC init_unicode(void) /* Homegrown Unicode support. It knows only C and Unicode locales. */ # if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV -void FAST_FUNC init_unicode(void) +void FAST_FUNC reinit_unicode(const char *LANG) { - char *lang; - - if (unicode_status != UNICODE_UNKNOWN) - return; - unicode_status = UNICODE_OFF; - lang = getenv("LANG"); - if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF"))) + if (!LANG || !(strstr(LANG, ".utf") || strstr(LANG, ".UTF"))) return; unicode_status = UNICODE_ON; } + +void FAST_FUNC init_unicode(void) +{ + if (unicode_status == UNICODE_UNKNOWN) + reinit_unicode(getenv("LANG")); +} # endif static size_t wcrtomb_internal(char *s, wchar_t wc) @@ -129,7 +138,7 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n) size_t len = wcrtomb_internal(tbuf, wc); if (len > n) - len = n; + break; memcpy(dest, tbuf, len); if (wc == L'\0') return org_n - n; @@ -240,7 +249,7 @@ int FAST_FUNC iswpunct(wint_t wc) } -# if LAST_SUPPORTED_WCHAR >= 0x300 +# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300 struct interval { uint16_t first; uint16_t last; @@ -418,9 +427,9 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max) * This implementation assumes that wchar_t characters are encoded * in ISO 10646. */ -static int wcwidth(unsigned ucs) +int FAST_FUNC wcwidth(unsigned ucs) { -# if LAST_SUPPORTED_WCHAR >= 0x300 +# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300 /* sorted list of non-overlapping intervals of non-spacing characters */ /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ # define BIG_(a,b) { a, b }, @@ -579,14 +588,14 @@ static int wcwidth(unsigned ucs) if ((ucs & ~0x80) < 0x20 || ucs == 0x7f) return -1; /* Quick abort if it is an obviously invalid char */ - if (ucs > LAST_SUPPORTED_WCHAR) + if (ucs > CONFIG_LAST_SUPPORTED_WCHAR) return -1; /* Optimization: no combining chars below 0x300 */ - if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300) + if (CONFIG_LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300) return 1; -# if LAST_SUPPORTED_WCHAR >= 0x300 +# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300 /* Binary search in table of non-spacing characters */ if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) return 0; @@ -594,25 +603,25 @@ static int wcwidth(unsigned ucs) return 0; /* Optimization: all chars below 0x1100 are not double-width */ - if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100) + if (CONFIG_LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100) return 1; -# if LAST_SUPPORTED_WCHAR >= 0x1100 +# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x1100 /* Invalid code points: */ /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */ /* Private Use Area (e000..f8ff) */ /* Noncharacters fdd0..fdef */ - if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff) - || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef) + if ((CONFIG_LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff) + || (CONFIG_LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef) ) { return -1; } /* 0xfffe and 0xffff in every plane are invalid */ - if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) { + if (CONFIG_LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) { return -1; } -# if LAST_SUPPORTED_WCHAR >= 0x10000 +# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x10000 if (ucs >= 0x10000) { /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */ static const struct interval combining0x10000[] = { @@ -625,7 +634,7 @@ static int wcwidth(unsigned ucs) if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) return 0; /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ - if (LAST_SUPPORTED_WCHAR >= 0xE0001 + if (CONFIG_LAST_SUPPORTED_WCHAR >= 0xE0001 && ( ucs == 0xE0001 || (ucs >= 0xE0020 && ucs <= 0xE007F) || (ucs >= 0xE0100 && ucs <= 0xE01EF) @@ -644,7 +653,7 @@ static int wcwidth(unsigned ucs) || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */ || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */ || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */ -# if LAST_SUPPORTED_WCHAR >= 0xac00 +# if CONFIG_LAST_SUPPORTED_WCHAR >= 0xac00 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */ || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */ || (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */ @@ -954,6 +963,7 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc) /* The rest is mostly same for libc and for "homegrown" support */ +#if 0 // UNUSED size_t FAST_FUNC unicode_strlen(const char *string) { size_t width = mbstowcs(NULL, string, INT_MAX); @@ -961,6 +971,14 @@ size_t FAST_FUNC unicode_strlen(const char *string) return strlen(string); return width; } +#endif + +size_t FAST_FUNC unicode_strwidth(const char *string) +{ + uni_stat_t uni_stat; + printable_string(&uni_stat, string); + return uni_stat.unicode_width; +} static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char *src, unsigned width, int flags) { @@ -994,8 +1012,11 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char d++; } } - if (stats) - stats->byte_count = stats->unicode_count = (d - dst); + if (stats) { + stats->byte_count = (d - dst); + stats->unicode_count = (d - dst); + stats->unicode_width = (d - dst); + } return dst; } @@ -1093,16 +1114,17 @@ char* FAST_FUNC unicode_conv_to_printable(uni_stat_t *stats, const char *src) { return unicode_conv_to_printable2(stats, src, INT_MAX, 0); } -char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth) +char* FAST_FUNC unicode_conv_to_printable_fixedwidth(/*uni_stat_t *stats,*/ const char *src, unsigned width) { - return unicode_conv_to_printable2(stats, src, maxwidth, 0); + return unicode_conv_to_printable2(/*stats:*/ NULL, src, width, UNI_FLAG_PAD); } -char* FAST_FUNC unicode_conv_to_printable_fixedwidth(uni_stat_t *stats, const char *src, unsigned width) + +#ifdef UNUSED +char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth) { - return unicode_conv_to_printable2(stats, src, width, UNI_FLAG_PAD); + return unicode_conv_to_printable2(stats, src, maxwidth, 0); } -#ifdef UNUSED unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src) { if (unicode_status != UNICODE_ON) {