lineedit: invalid unicode characters are replaced with CONFIG_SUBST_WCHAR
author    Tomas Heinrich <heinrich.tomas@gmail.com>
          Tue, 9 Mar 2010 13:09:24 +0000 (14:09 +0100)
committer Denys Vlasenko <vda.linux@googlemail.com>
          Tue, 9 Mar 2010 13:09:24 +0000 (14:09 +0100)
function                                             old     new   delta
read_key_ungets                                        -      50     +50
lineedit_read_key                                    223     252     +29

Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
include/libbb.h
libbb/lineedit.c
libbb/read_key.c
testsuite/ash.tests [new file with mode: 0755]

diff --git a/include/libbb.h b/include/libbb.h
index ead1020dde2fdc6091686474b8c790db5f21941e..fccc816cb4dd6fb570e498f7cb700b335ff71661 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1277,6 +1277,7 @@ enum {
  * on first call.
  */
 int64_t read_key(int fd, char *buffer) FAST_FUNC;
+void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC;
 
 
 #if ENABLE_FEATURE_EDITING
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index c50b31d67eba88686ebf1973b1e630d17c9cc60a..8e339da5304fc00f87214c0edf4cc2307e371ff8 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -1700,18 +1700,34 @@ static int lineedit_read_key(char *read_key_buffer)
 #endif
 
 #if ENABLE_FEATURE_ASSUME_UNICODE
-               {
+               if (unicode_status == UNICODE_ON) {
                        wchar_t wc;
 
                        if ((int32_t)ic < 0) /* KEYCODE_xxx */
                                return ic;
+                       // TODO: imagine sequence like: 0xff, <left-arrow>: we are currently losing 0xff...
+
                        unicode_buf[unicode_idx++] = ic;
                        unicode_buf[unicode_idx] = '\0';
-                       if (mbstowcs(&wc, unicode_buf, 1) != 1 && unicode_idx < MB_CUR_MAX) {
-                               delay = 50;
-                               goto poll_again;
+                       if (mbstowcs(&wc, unicode_buf, 1) != 1) {
+                               /* Not (yet?) a valid unicode char */
+                               if (unicode_idx < MB_CUR_MAX) {
+                                       delay = 50;
+                                       goto poll_again;
+                               }
+                               /* Invalid sequence. Save all "bad bytes" except first */
+                               read_key_ungets(read_key_buffer, unicode_buf + 1, MB_CUR_MAX - 1);
+                               /*
+                                * ic = unicode_buf[0] sounds even better, but currently
+                                * this does not work: wchar_t[] -> char[] conversion
+                                * when lineedit finishes mangles such "raw bytes"
+                                * (by misinterpreting them as unicode chars):
+                                */
+                               ic = CONFIG_SUBST_WCHAR;
+                       } else {
+                               /* Valid unicode char, return its code */
+                               ic = wc;
                        }
-                       ic = wc;
                }
 #endif
        } while (errno == EAGAIN);
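
For reference, here is a minimal standalone sketch (not part of this commit) of the property the new code relies on: mbstowcs() reports a truncated multibyte sequence and an outright invalid one the same way, so lineedit can only tell them apart by how many bytes it has already collected (unicode_idx versus MB_CUR_MAX). The sketch assumes a UTF-8 locale.

/* decode_sketch.c: probe mbstowcs() on complete, truncated and invalid input */
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>     /* mbstowcs() */
#include <wchar.h>

static void try_decode(const char *label, const char *bytes)
{
	wchar_t wc;

	if (mbstowcs(&wc, bytes, 1) == 1)
		printf("%-6s -> decoded U+%04lX\n", label, (unsigned long)wc);
	else
		printf("%-6s -> no char: wait for more bytes, or give up and substitute\n", label);
}

int main(void)
{
	setlocale(LC_ALL, "");           /* assumes a UTF-8 locale */
	try_decode("c3",    "\xc3");     /* first byte of U+00E9 only: not decodable yet */
	try_decode("c3 a9", "\xc3\xa9"); /* complete U+00E9: decodes */
	try_decode("ff",    "\xff");     /* can never start a valid UTF-8 character */
	return 0;
}
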
diff --git a/libbb/read_key.c b/libbb/read_key.c
index a2253ce3e8f3c6493dd0a939d857ea796fd7cd93..98b3131de7b9a779e0ebb92cc6e2ca763651980f 100644
--- a/libbb/read_key.c
+++ b/libbb/read_key.c
@@ -246,3 +246,12 @@ int64_t FAST_FUNC read_key(int fd, char *buffer)
        buffer[-1] = 0;
        goto start_over;
 }
+
+void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len)
+{
+       unsigned cur_len = (unsigned char)buffer[0];
+       if (len > KEYCODE_BUFFER_SIZE-1 - cur_len)
+               len = KEYCODE_BUFFER_SIZE-1 - cur_len;
+       memcpy(buffer + 1 + cur_len, str, len);
+       buffer[0] += len;
+}
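
The new helper implies a simple pushback-queue layout: buffer[0] holds the count of queued bytes and buffer[1..] the bytes themselves, oldest first, capped at KEYCODE_BUFFER_SIZE-1. Below is a tiny standalone model of that layout (not busybox code); get_one() is purely hypothetical, added only to show the queue round-tripping, and is not how read_key() actually drains the buffer.

/* pushback_sketch.c: model of the "count byte + payload" queue */
#include <stdio.h>
#include <string.h>

#define KEYCODE_BUFFER_SIZE 16          /* small assumed value, just for the sketch */

static void ungets(char *buffer, const char *str, unsigned len)
{
	unsigned cur_len = (unsigned char)buffer[0];
	if (len > KEYCODE_BUFFER_SIZE-1 - cur_len)
		len = KEYCODE_BUFFER_SIZE-1 - cur_len;  /* excess bytes are dropped */
	memcpy(buffer + 1 + cur_len, str, len);
	buffer[0] += len;
}

/* Hypothetical consumer: pop the oldest queued byte, or -1 if empty */
static int get_one(char *buffer)
{
	unsigned cur_len = (unsigned char)buffer[0];
	int c;

	if (cur_len == 0)
		return -1;
	c = (unsigned char)buffer[1];
	memmove(buffer + 1, buffer + 2, cur_len - 1);
	buffer[0] = cur_len - 1;
	return c;
}

int main(void)
{
	char buf[KEYCODE_BUFFER_SIZE] = { 0 };
	int c;

	ungets(buf, "\xfe\xfd", 2);             /* push back two raw bytes */
	while ((c = get_one(buf)) >= 0)
		printf("0x%02x\n", c);          /* prints 0xfe then 0xfd */
	return 0;
}
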
diff --git a/testsuite/ash.tests b/testsuite/ash.tests
new file mode 100755
index 0000000..4b6efe4
--- /dev/null
+++ b/testsuite/ash.tests
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# These are not ash tests, we use ash as a way to test lineedit!
+#
+# Copyright 2010 by Denys Vlasenko
+# Licensed under GPL v2, see file LICENSE for details.
+
+. ./testing.sh
+
+# testing "test name" "options" "expected result" "file input" "stdin"
+
+testing "One byte which is not valid unicode char followed by valid input" \
+       "script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
+       "\
+00000000  3f 2d 0a                                          |?-.|
+00000003
+" \
+       "" \
+       "echo \xff- | hexdump -C >output; exit; exit; exit; exit\n" \
+
+testing "30 bytes which are not valid unicode chars followed by valid input" \
+       "script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
+       "\
+00000000  3f 3f 3f 3f 3f 3f 3f 3f  3f 3f 3f 3f 3f 3f 3f 3f  |????????????????|
+00000010  3f 3f 3f 3f 3f 3f 3f 3f  3f 3f 3f 3f 3f 3f 2d 0a  |??????????????-.|
+00000020
+" \
+       "" \
+       "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >output; exit; exit; exit; exit\n" \
+
+# Not sure this behavior is perfect: we lose all invalid input which precedes
+# arrow keys and such. In this example, \xff\xff are lost
+testing "2 bytes which are not valid unicode chars followed by left arrow key" \
+       "script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
+       "\
+00000000  3d 2d 0a                                          |=-.|
+00000003
+" \
+       "" \
+       "echo =+\xff\xff\x1b\x5b\x44- | hexdump -C >output; exit; exit; exit; exit\n" \
+
+exit $FAILCOUNT
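
The expected dumps above assume CONFIG_SUBST_WCHAR is left at its default of 63, i.e. '?' (0x3f). A hypothetical by-hand run of the first case, assuming a freshly built ./busybox in the current directory plus script(1) and hexdump on the host, might look like this (\377 is the octal spelling of the 0xff byte):

printf 'echo \377- | hexdump -C >output; exit\n' \
	| script -q -c './busybox ash' /dev/null >/dev/null
cat output      # expect: 00000000  3f 2d 0a   |?-.|
rm output
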