With this option on, more extensive (and bigger) table
of neutral chars will be used.
+config UNICODE_PRESERVE_BROKEN
+ bool "Make it possible to enter sequences of chars which are not Unicode"
+ default n
+ depends on UNICODE_SUPPORT
+ help
+ With this option on, invalid UTF-8 bytes are not substituted
+ with the selected substitution character.
+ For example, this means that entering 'l', 's', ' ', 0xff, [Enter]
+ at shell prompt will list file named 0xff (single char name
+ with char value 255), not file named '?'.
+
config LONG_OPTS
bool "Support for --long-options"
default y
#undef CHAR_T
#if ENABLE_UNICODE_SUPPORT
-# define BB_NUL L'\0'
+# define BB_NUL ((wchar_t)0)
# define CHAR_T wchar_t
static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
# if ENABLE_FEATURE_EDITING_VI
#endif
+# if ENABLE_UNICODE_PRESERVE_BROKEN
+# define unicode_mark_inv_wchar(wc) ((wc) | 0x20000000)
+# define unicode_is_inv_wchar(wc) ((wc) & 0x20000000)
+# else
+# define unicode_is_inv_wchar(wc) 0
+# endif
+
+
enum {
/* We use int16_t for positions, need to limit line len */
MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
if (len < 0)
len = 0;
- command_ps[len] = L'\0';
+ command_ps[len] = 0;
return len;
}
-static size_t save_string(char *dst, int maxsize)
+static unsigned save_string(char *dst, unsigned maxsize)
{
+#if !ENABLE_UNICODE_PRESERVE_BROKEN
ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
if (len < 0)
len = 0;
dst[len] = '\0';
return len;
+#else
+ unsigned dstpos = 0;
+ unsigned srcpos = 0;
+
+ maxsize--;
+ while (dstpos < maxsize) {
+ wchar_t wc;
+ int n = srcpos;
+ while ((wc = command_ps[srcpos]) != 0
+ && !unicode_is_inv_wchar(wc)
+ ) {
+ srcpos++;
+ }
+ command_ps[srcpos] = 0;
+ n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos);
+ if (n < 0) /* should not happen */
+ break;
+ dstpos += n;
+ if (wc == 0) /* usually is */
+ break;
+ /* We do have invalid byte here! */
+ command_ps[srcpos] = wc; /* restore it */
+ srcpos++;
+ if (dstpos == maxsize)
+ break;
+ dst[dstpos++] = (char) wc;
+ }
+ dst[dstpos] = '\0';
+ return dstpos;
+#endif
}
/* I thought just fputwc(c, stdout) would work. But no... */
static void BB_PUTCHAR(wchar_t c)
{
char buf[MB_CUR_MAX + 1];
mbstate_t mbst = { 0 };
- ssize_t len = wcrtomb(buf, c, &mbst);
+ ssize_t len;
+ if (unicode_is_inv_wchar(c))
+ c = CONFIG_SUBST_WCHAR;
+ len = wcrtomb(buf, c, &mbst);
if (len > 0) {
buf[len] = '\0';
fputs(buf, stdout);
return strlen(command_ps);
}
# if ENABLE_FEATURE_TAB_COMPLETION
-static void save_string(char *dst, int maxsize)
+static void save_string(char *dst, unsigned maxsize)
{
safe_strncpy(dst, command_ps, maxsize);
}
pushback:
/* Invalid sequence. Save all "bad bytes" except first */
read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
- /*
- * ic = unicode_buf[0] sounds even better, but currently
- * this does not work: wchar_t[] -> char[] conversion
- * when lineedit finishes mangles such "raw bytes"
- * (by misinterpreting them as unicode chars):
- */
+# if !ENABLE_UNICODE_PRESERVE_BROKEN
ic = CONFIG_SUBST_WCHAR;
+# else
+ ic = unicode_mark_inv_wchar(unicode_buf[0]);
+# endif
} else {
/* Valid unicode char, return its code */
ic = wc;
# if LAST_SUPPORTED_WCHAR >= 0x300
/* sorted list of non-overlapping intervals of non-spacing characters */
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
- static const struct interval combining[] = {
# define BIG_(a,b) { a, b },
# define PAIR(a,b)
# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
BIG_(0xFE20, 0xFE23) \
BIG_(0xFEFF, 0xFEFF) \
BIG_(0xFFF9, 0xFFFB)
- ARRAY
+ static const struct interval combining[] = { ARRAY };
# undef BIG_
# undef PAIR
- };
# define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t combining1[] = { ARRAY };
* http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
* Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
*/
- static const struct interval rtl_b[] = {
# define BIG_(a,b) { a, b },
# define PAIR(a,b)
# define ARRAY \
{0x10E7F, 0x10FFF},
{0x1E800, 0x1EFFF}
*/
- ARRAY
+ static const struct interval rtl_b[] = { ARRAY };
# undef BIG_
# undef PAIR
- };
# define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t rtl_p[] = { ARRAY };
* White_Space, Other_Neutral, European_Number, European_Separator,
* European_Terminator, Arabic_Number, Common_Separator
*/
- static const struct interval neutral_b[] = {
# define BIG_(a,b) { a, b },
# define PAIR(a,b)
# define ARRAY \
{0x1F030, 0x1F093},
{0x1F100, 0x1F10A}
*/
- ARRAY
+ static const struct interval neutral_b[] = { ARRAY };
# undef BIG_
# undef PAIR
- };
# define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t neutral_p[] = { ARRAY };
. ./testing.sh
+test -f "$bindir/.config" && . "$bindir/.config"
+
# testing "test name" "options" "expected result" "file input" "stdin"
+if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then
+testing "One byte which is not valid unicode char followed by valid input" \
+ "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
+ "\
+00000000 ff 2d 0a |.-.|
+00000003
+" \
+ "" \
+ "echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
+
+testing "30 bytes which are not valid unicode chars followed by valid input" \
+ "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
+ "\
+00000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+00000010 ff ff ff ff ff ff ff ff ff ff ff ff ff ff 2d 0a |..............-.|
+00000020
+" \
+ "" \
+ "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
+else
testing "One byte which is not valid unicode char followed by valid input" \
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
"\
" \
"" \
"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
+fi
+
# Not sure this behavior is perfect: we lose all invalid input which precedes
# arrow keys and such. In this example, \xff\xff are lost