*
* Copyright (C) 2009 Denys Vlasenko
*
- * Licensed under GPL version 2, see file LICENSE in this tarball for details.
+ * Licensed under GPLv2, see file LICENSE in this source tree.
*/
#include "libbb.h"
#include "unicode.h"
/* Unicode support using libc locale support. */
-void FAST_FUNC init_unicode(void)
+void FAST_FUNC reinit_unicode(const char *LANG)
{
- /* In unicode, this is a one character string */
static const char unicode_0x394[] = { 0xce, 0x94, 0 };
+ size_t width;
+
+ /* (Re)detect whether the locale named by LANG is a Unicode one
+ * and record the answer in unicode_status. Unlike init_unicode(),
+ * this is done on every call.
+ *
+ * When LANG is NULL we pass "" rather than "C", because some libcs
+ * select a non-ASCII default locale on a setlocale("") call
+ * (this lets users of such a libc have a Unicode-enabled
+ * system without having to mess with the environment).
+ *
+ * We set only LC_CTYPE because (a) we may be called with the
+ * $LC_CTYPE value in LANG rather than with $LC_ALL, and
+ * (b) internationalized LC_NUMERIC and LC_TIME are more of
+ * a PITA than a benefit (for one, some utilities have a hard
+ * time with a comma used as the fractional separator).
+ */
+//TODO: avoid repeated setlocale() calls by caching the last string?
+ setlocale(LC_CTYPE, LANG ? LANG : "");
- if (unicode_status != UNICODE_UNKNOWN)
- return;
+ /* Probe: 0xce 0x94 is the UTF-8 encoding of U+0394; in a Unicode locale this is a one-character string */
+ width = unicode_strlen(unicode_0x394);
+ unicode_status = (width == 1 ? UNICODE_ON : UNICODE_OFF);
+}
- unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
+/* One-time Unicode detection driven by the environment.
+ * Does nothing once unicode_status is no longer UNICODE_UNKNOWN.
+ */
+void FAST_FUNC init_unicode(void)
+{
+	const char *locale;
+
+	if (unicode_status != UNICODE_UNKNOWN)
+		return;
+
+	/* Some people set only $LC_CTYPE, not $LC_ALL, because they want
+	 * only Unicode to be activated on their system, not the whole
+	 * shebang of wrong decimal points, strange date formats and so on.
+	 * Hence the lookup order: $LC_ALL, then $LC_CTYPE, then $LANG.
+	 */
+	locale = getenv("LC_ALL");
+	if (locale == NULL)
+		locale = getenv("LC_CTYPE");
+	if (locale == NULL)
+		locale = getenv("LANG");
+
+	/* locale may still be NULL here; reinit_unicode() is handed it as is */
+	reinit_unicode(locale);
}
#else
/* Homegrown Unicode support. It knows only C and Unicode locales. */
# if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
-void FAST_FUNC init_unicode(void)
+/* Decide Unicode mode from a locale string: unicode_status becomes
+ * UNICODE_ON only if LANG is non-NULL and contains ".utf" or ".UTF",
+ * and UNICODE_OFF otherwise. The status is recomputed on every call,
+ * whatever its previous value was.
+ */
+void FAST_FUNC reinit_unicode(const char *LANG)
{
- char *lang;
-
- if (unicode_status != UNICODE_UNKNOWN)
- return;
-
unicode_status = UNICODE_OFF;
- lang = getenv("LANG");
- if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF")))
+ if (!LANG || !(strstr(LANG, ".utf") || strstr(LANG, ".UTF")))
return;
unicode_status = UNICODE_ON;
}
+
+/* One-time Unicode detection driven by the environment.
+ * Does nothing once unicode_status is no longer UNICODE_UNKNOWN.
+ * The locale string is taken from $LC_ALL, else $LC_CTYPE, else $LANG,
+ * and handed to reinit_unicode() (NULL if none of them is usable).
+ */
+void FAST_FUNC init_unicode(void)
+{
+	if (unicode_status == UNICODE_UNKNOWN) {
+		/* POSIX: a locale variable which is set but empty counts
+		 * as unset, so "LC_ALL= LANG=xx.UTF-8" must fall through
+		 * to $LANG instead of switching Unicode off.
+		 */
+		char *s = getenv("LC_ALL");
+		if (!s || !s[0]) s = getenv("LC_CTYPE");
+		if (!s || !s[0]) s = getenv("LANG");
+		reinit_unicode(s);
+	}
+}
# endif
static size_t wcrtomb_internal(char *s, wchar_t wc)
size_t len = wcrtomb_internal(tbuf, wc);
if (len > n)
- len = n;
+ break;
memcpy(dest, tbuf, len);
if (wc == L'\0')
return org_n - n;
}
-# if LAST_SUPPORTED_WCHAR >= 0x300
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
struct interval {
uint16_t first;
uint16_t last;
* This implementation assumes that wchar_t characters are encoded
* in ISO 10646.
*/
-static int wcwidth(unsigned ucs)
+int FAST_FUNC wcwidth(unsigned ucs)
{
-# if LAST_SUPPORTED_WCHAR >= 0x300
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
/* sorted list of non-overlapping intervals of non-spacing characters */
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
# define BIG_(a,b) { a, b },
if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
return -1;
/* Quick abort if it is an obviously invalid char */
- if (ucs > LAST_SUPPORTED_WCHAR)
+ if (ucs > CONFIG_LAST_SUPPORTED_WCHAR)
return -1;
/* Optimization: no combining chars below 0x300 */
- if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
+ if (CONFIG_LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
return 1;
-# if LAST_SUPPORTED_WCHAR >= 0x300
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
/* Binary search in table of non-spacing characters */
if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
return 0;
return 0;
/* Optimization: all chars below 0x1100 are not double-width */
- if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
+ if (CONFIG_LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
return 1;
-# if LAST_SUPPORTED_WCHAR >= 0x1100
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x1100
/* Invalid code points: */
/* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
/* Private Use Area (e000..f8ff) */
/* Noncharacters fdd0..fdef */
- if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
- || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
+ if ((CONFIG_LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
+ || (CONFIG_LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
) {
return -1;
}
/* 0xfffe and 0xffff in every plane are invalid */
- if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
+ if (CONFIG_LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
return -1;
}
-# if LAST_SUPPORTED_WCHAR >= 0x10000
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x10000
if (ucs >= 0x10000) {
/* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
static const struct interval combining0x10000[] = {
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
return 0;
/* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
- if (LAST_SUPPORTED_WCHAR >= 0xE0001
+ if (CONFIG_LAST_SUPPORTED_WCHAR >= 0xE0001
&& ( ucs == 0xE0001
|| (ucs >= 0xE0020 && ucs <= 0xE007F)
|| (ucs >= 0xE0100 && ucs <= 0xE01EF)
|| ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
|| ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
|| (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
-# if LAST_SUPPORTED_WCHAR >= 0xac00
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0xac00
|| (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
|| (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
|| (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
return width;
}
+/* Return the unicode_width statistic which printable_string()
+ * reports for string.
+ * NOTE(review): the value returned by printable_string() is ignored
+ * here; if it is an allocated buffer it is never freed - confirm.
+ */
+size_t FAST_FUNC unicode_strwidth(const char *string)
+{
+	uni_stat_t st;
+
+	printable_string(&st, string);
+	return st.unicode_width;
+}
+
static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char *src, unsigned width, int flags)
{
char *dst;
d++;
}
}
- if (stats)
- stats->byte_count = stats->unicode_count = (d - dst);
+ if (stats) {
+ stats->byte_count = (d - dst);
+ stats->unicode_count = (d - dst);
+ stats->unicode_width = (d - dst);
+ }
return dst;
}
{
return unicode_conv_to_printable2(stats, src, INT_MAX, 0);
}
-char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth)
+/* Thin wrapper: converts src via unicode_conv_to_printable2() with the
+ * given width and UNI_FLAG_PAD set. No statistics are collected
+ * (NULL is passed for stats; the parameter is commented out).
+ * Returns whatever unicode_conv_to_printable2() returns.
+ */
+char* FAST_FUNC unicode_conv_to_printable_fixedwidth(/*uni_stat_t *stats,*/ const char *src, unsigned width)
{
- return unicode_conv_to_printable2(stats, src, maxwidth, 0);
+ return unicode_conv_to_printable2(/*stats:*/ NULL, src, width, UNI_FLAG_PAD);
}
-char* FAST_FUNC unicode_conv_to_printable_fixedwidth(uni_stat_t *stats, const char *src, unsigned width)
+
+#ifdef UNUSED
+/* Thin wrapper: converts src via unicode_conv_to_printable2() with
+ * maxwidth as the width argument and no flags (0), forwarding stats.
+ * Compiled only when UNUSED is defined.
+ */
+char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth)
{
- return unicode_conv_to_printable2(stats, src, width, UNI_FLAG_PAD);
+ return unicode_conv_to_printable2(stats, src, maxwidth, 0);
}
-#ifdef UNUSED
unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
{
if (unicode_status != UNICODE_ON) {