unicode: exclude FDD0..FDEF range too
author Denys Vlasenko <vda.linux@googlemail.com>
Sun, 31 Jan 2010 15:34:37 +0000 (16:34 +0100)
committer Denys Vlasenko <vda.linux@googlemail.com>
Sun, 31 Jan 2010 15:34:37 +0000 (16:34 +0100)
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
libbb/unicode_wcwidth.c
testsuite/ls.tests

index 410c741ac929f5d5e3ace9a8eeccd014e4057e7d..c7cc524a6757b2be3234485dc1b042bfa7701c55 100644 (file)
  * until Unicode committee assigns something there.
  */
 
-#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000
-# define LAST_SUPPORTED_WCHAR 0x30000
+#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000
+# define LAST_SUPPORTED_WCHAR 0x2ffff
 #else
 # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
 #endif
 
-#if LAST_SUPPORTED_WCHAR >= 0x0300
+#if LAST_SUPPORTED_WCHAR >= 0x300
 struct interval {
        uint16_t first;
        uint16_t last;
@@ -185,7 +185,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
  */
 static int wcwidth(unsigned ucs)
 {
-#if LAST_SUPPORTED_WCHAR >= 0x0300
+#if LAST_SUPPORTED_WCHAR >= 0x300
        /* sorted list of non-overlapping intervals of non-spacing characters */
        /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
        static const struct interval combining[] = {
@@ -460,75 +460,75 @@ static int wcwidth(unsigned ucs)
 #undef BIG_
 #undef PAIR
        };
-# if LAST_SUPPORTED_WCHAR >= 0x10000
-       /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
-       static const struct interval combining0x10000[] = {
-               { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
-               { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
-               { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
-               { 0xD242, 0xD244 }
-       };
-# endif
 #endif
 
        if (ucs == 0)
                return 0;
-       /* test for 8-bit control characters (00-1f, 80-9f, 7f) */
+
+       /* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
        if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
                return -1;
-       if (ucs < 0x0300) /* optimization */
+       /* Quick abort if it is an obviously invalid char */
+       if (ucs > LAST_SUPPORTED_WCHAR)
+               return -1;
+
+       /* Optimization: no combining chars below 0x300 */
+       if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
                return 1;
 
-#if LAST_SUPPORTED_WCHAR < 0x0300
-       return -1;
-#else
-       /* binary search in table of non-spacing characters */
+#if LAST_SUPPORTED_WCHAR >= 0x300
+       /* Binary search in table of non-spacing characters */
        if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
                return 0;
        if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
                return 0;
 
-       if (ucs < 0x1100) /* optimization */
+       /* Optimization: all chars below 0x1100 are not double-width */
+       if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
                return 1;
 
-# if LAST_SUPPORTED_WCHAR < 0x1100
-       return -1;
-# else
-       if (ucs >= LAST_SUPPORTED_WCHAR)
-               return -1;
-
-       /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */
-       /* We also exclude Private Use Area (e000..f8ff) */
-       if (LAST_SUPPORTED_WCHAR >= 0xd800
-        && (ucs >= 0xd800 || ucs <= 0xf8ff)
+# if LAST_SUPPORTED_WCHAR >= 0x1100
+       /* Invalid code points: */
+       /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
+       /* Private Use Area (e000..f8ff) */
+       /* Noncharacters fdd0..fdef */
+       if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
+        || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
        ) {
                return -1;
        }
-
        /* 0xfffe and 0xffff in every plane are invalid */
-       if (LAST_SUPPORTED_WCHAR >= 0xfffe
-        && (ucs & 0xfffe) == 0xfffe
-       ) {
+       if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
                return -1;
        }
 
 #  if LAST_SUPPORTED_WCHAR >= 0x10000
-       /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */
-       if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
-               return 0;
-#  endif
-       /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
-       if (LAST_SUPPORTED_WCHAR >= 0xE0001
-        && (  ucs == 0xE0001
-           || (ucs >= 0xE0020 && ucs <= 0xE007F)
-           || (ucs >= 0xE0100 && ucs <= 0xE01EF)
-           )
-       ) {
-               return 0;
+       if (ucs >= 0x10000) {
+               /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
+               static const struct interval combining0x10000[] = {
+                       { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
+                       { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
+                       { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
+                       { 0xD242, 0xD244 }
+               };
+               /* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
+               if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
+                       return 0;
+               /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
+               if (LAST_SUPPORTED_WCHAR >= 0xE0001
+                && (  ucs == 0xE0001
+                   || (ucs >= 0xE0020 && ucs <= 0xE007F)
+                   || (ucs >= 0xE0100 && ucs <= 0xE01EF)
+                   )
+               ) {
+                       return 0;
+               }
        }
+#  endif
 
-       /* if we arrive here, ucs is not a combining or C0/C1 control character */
-
+       /* If we arrive here, ucs is not a combining or C0/C1 control character.
+        * Check whether it's 1 char or 2-char wide.
+        */
        return 1 +
                (  (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
                || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
index e08249ea601488f3662a322a2415916fbb65ced3..169313a631d129a925815ed1a97a8c10abf24de4 100755 (executable)
@@ -13,7 +13,7 @@ mkdir ls.testdir || exit 1
 
 # With Unicode provided by libc locale, I'm not sure this test can pass.
 # I suspect we might fail to skip exactly correct number of bytes
-# over broken unicode sequences.
+# over broken unicode sequences.
 test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
 && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
 && test x"$CONFIG_SUBST_WCHAR" = x"63" \
@@ -144,7 +144,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
 0003_2.1__First_possible_sequence_of_a_certain_length_____________________|
 0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________|
 0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________|
-0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________|
+0006_2.1.4__4_bytes__U-00010000_:________"𐀀"______________________________|
 0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________|
 0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________|
 0009_2.2__Last_possible_sequence_of_a_certain_length______________________|
@@ -155,9 +155,9 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
 0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________|
 0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________|
 0016_2.3__Other_boundary_conditions_______________________________________|
-0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________|
+0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_""___________________________________|
 0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________|
-0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________|
+0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_""___________________________________|
 0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________|
 0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________|
 0022_3__Malformed_sequences_______________________________________________|