libarchive: open_zipped() does not need to check extensions for e.g. gzip

[oweals/busybox.git] / libbb / unicode.c
diff --git a/libbb/unicode.c b/libbb/unicode.c

index d1c6167c78fa5506f11a20c13ac77dc180d3a385..9c4da50d3b20639b8f489bb69dc41b60583dc1b2 100644 (file)
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -4,7 +4,7 @@
   *
   * Copyright (C) 2009 Denys Vlasenko
   *
- * Licensed under GPL version 2, see file LICENSE in this tarball for details.
+ * Licensed under GPLv2, see file LICENSE in this source tree.
   */
  #include "libbb.h"
  #include "unicode.h"
@@ -23,15 +23,42 @@ uint8_t unicode_status;
  
  /* Unicode support using libc locale support. */
  
-void FAST_FUNC init_unicode(void)
+void FAST_FUNC reinit_unicode(const char *LANG)
  {
-       /* In unicode, this is a one character string */
         static const char unicode_0x394[] = { 0xce, 0x94, 0 };
+       size_t width;
+
+       /* When LANG is NULL we pass "" instead of "C", because some
+        * libcs have a non-ASCII default locale for setlocale("")
+        * (this lets users of such a libc have a Unicode-enabled
+        * system without having to mess with env).
+        *
+        * We set LC_CTYPE because (a) we may be called with $LC_CTYPE
+        * value in LANG, not with $LC_ALL, (b) internationalized
+        * LC_NUMERIC and LC_TIME are more of a PITA than a benefit
+        * (for one, some utilities have a hard time with a comma
+        * used as a fractional separator).
+        */
+//TODO: avoid repeated calls by caching last string?
+       setlocale(LC_CTYPE, LANG ? LANG : "");
  
-       if (unicode_status != UNICODE_UNKNOWN)
-               return;
+       /* In unicode, this is a one character string */
+       width = unicode_strlen(unicode_0x394);
+       unicode_status = (width == 1 ? UNICODE_ON : UNICODE_OFF);
+}
  
-       unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
+void FAST_FUNC init_unicode(void)
+{
+       /* Some people set only $LC_CTYPE, not $LC_ALL, because they want
+        * only Unicode to be activated on their system, not the whole
+        * shebang of wrong decimal points, strange date formats and so on.
+        */
+       if (unicode_status == UNICODE_UNKNOWN) {
+               char *s = getenv("LC_ALL");
+               if (!s) s = getenv("LC_CTYPE");
+               if (!s) s = getenv("LANG");
+               reinit_unicode(s);
+       }
  }
  
  #else
@@ -39,19 +66,23 @@ void FAST_FUNC init_unicode(void)
  /* Homegrown Unicode support. It knows only C and Unicode locales. */
  
  # if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
-void FAST_FUNC init_unicode(void)
+void FAST_FUNC reinit_unicode(const char *LANG)
  {
-       char *lang;
-
-       if (unicode_status != UNICODE_UNKNOWN)
-               return;
-
         unicode_status = UNICODE_OFF;
-       lang = getenv("LANG");
-       if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF")))
+       if (!LANG || !(strstr(LANG, ".utf") || strstr(LANG, ".UTF")))
                 return;
         unicode_status = UNICODE_ON;
  }
+
+void FAST_FUNC init_unicode(void)
+{
+       if (unicode_status == UNICODE_UNKNOWN) {
+               char *s = getenv("LC_ALL");
+               if (!s) s = getenv("LC_CTYPE");
+               if (!s) s = getenv("LANG");
+               reinit_unicode(s);
+       }
+}
  # endif
  
  static size_t wcrtomb_internal(char *s, wchar_t wc)
@@ -129,7 +160,7 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
                 size_t len = wcrtomb_internal(tbuf, wc);
  
                 if (len > n)
-                       len = n;
+                       break;
                 memcpy(dest, tbuf, len);
                 if (wc == L'\0')
                         return org_n - n;
@@ -240,7 +271,7 @@ int FAST_FUNC iswpunct(wint_t wc)
  }
  
  
-# if LAST_SUPPORTED_WCHAR >= 0x300
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
  struct interval {
         uint16_t first;
         uint16_t last;
@@ -418,9 +449,9 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
   * This implementation assumes that wchar_t characters are encoded
   * in ISO 10646.
   */
-static int wcwidth(unsigned ucs)
+int FAST_FUNC wcwidth(unsigned ucs)
  {
-# if LAST_SUPPORTED_WCHAR >= 0x300
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
         /* sorted list of non-overlapping intervals of non-spacing characters */
         /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
  #  define BIG_(a,b) { a, b },
@@ -579,14 +610,14 @@ static int wcwidth(unsigned ucs)
         if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
                 return -1;
         /* Quick abort if it is an obviously invalid char */
-       if (ucs > LAST_SUPPORTED_WCHAR)
+       if (ucs > CONFIG_LAST_SUPPORTED_WCHAR)
                 return -1;
  
         /* Optimization: no combining chars below 0x300 */
-       if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
+       if (CONFIG_LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
                 return 1;
  
-# if LAST_SUPPORTED_WCHAR >= 0x300
+# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
         /* Binary search in table of non-spacing characters */
         if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
                 return 0;
@@ -594,25 +625,25 @@ static int wcwidth(unsigned ucs)
                 return 0;
  
         /* Optimization: all chars below 0x1100 are not double-width */
-       if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
+       if (CONFIG_LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
                 return 1;
  
-#  if LAST_SUPPORTED_WCHAR >= 0x1100
+#  if CONFIG_LAST_SUPPORTED_WCHAR >= 0x1100
         /* Invalid code points: */
         /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
         /* Private Use Area (e000..f8ff) */
         /* Noncharacters fdd0..fdef */
-       if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
-        || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
+       if ((CONFIG_LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
+        || (CONFIG_LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
         ) {
                 return -1;
         }
         /* 0xfffe and 0xffff in every plane are invalid */
-       if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
+       if (CONFIG_LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
                 return -1;
         }
  
-#   if LAST_SUPPORTED_WCHAR >= 0x10000
+#   if CONFIG_LAST_SUPPORTED_WCHAR >= 0x10000
         if (ucs >= 0x10000) {
                 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
                 static const struct interval combining0x10000[] = {
@@ -625,7 +656,7 @@ static int wcwidth(unsigned ucs)
                 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
                         return 0;
                 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
-               if (LAST_SUPPORTED_WCHAR >= 0xE0001
+               if (CONFIG_LAST_SUPPORTED_WCHAR >= 0xE0001
                  && (  ucs == 0xE0001
                     || (ucs >= 0xE0020 && ucs <= 0xE007F)
                     || (ucs >= 0xE0100 && ucs <= 0xE01EF)
@@ -644,7 +675,7 @@ static int wcwidth(unsigned ucs)
                 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
                 || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
                 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
-#   if LAST_SUPPORTED_WCHAR >= 0xac00
+#   if CONFIG_LAST_SUPPORTED_WCHAR >= 0xac00
                 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
                 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
                 || (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
@@ -962,6 +993,13 @@ size_t FAST_FUNC unicode_strlen(const char *string)
         return width;
  }
  
+size_t FAST_FUNC unicode_strwidth(const char *string)
+{
+       uni_stat_t uni_stat;
+       printable_string(&uni_stat, string);
+       return uni_stat.unicode_width;
+}
+
  static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char *src, unsigned width, int flags)
  {
         char *dst;
@@ -994,8 +1032,11 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
                                 d++;
                         }
                 }
-               if (stats)
-                       stats->byte_count = stats->unicode_count = (d - dst);
+               if (stats) {
+                       stats->byte_count = (d - dst);
+                       stats->unicode_count = (d - dst);
+                       stats->unicode_width = (d - dst);
+               }
                 return dst;
         }
  
@@ -1093,16 +1134,17 @@ char* FAST_FUNC unicode_conv_to_printable(uni_stat_t *stats, const char *src)
  {
         return unicode_conv_to_printable2(stats, src, INT_MAX, 0);
  }
-char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth)
+char* FAST_FUNC unicode_conv_to_printable_fixedwidth(/*uni_stat_t *stats,*/ const char *src, unsigned width)
  {
-       return unicode_conv_to_printable2(stats, src, maxwidth, 0);
+       return unicode_conv_to_printable2(/*stats:*/ NULL, src, width, UNI_FLAG_PAD);
  }
-char* FAST_FUNC unicode_conv_to_printable_fixedwidth(uni_stat_t *stats, const char *src, unsigned width)
+
+#ifdef UNUSED
+char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth)
  {
-       return unicode_conv_to_printable2(stats, src, width, UNI_FLAG_PAD);
+       return unicode_conv_to_printable2(stats, src, maxwidth, 0);
  }
  
-#ifdef UNUSED
  unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
  {
         if (unicode_status != UNICODE_ON) {