brctl: fixing stp parameters incompatibility
[oweals/busybox.git] / libbb / unicode.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * Unicode support routines.
4  *
5  * Copyright (C) 2009 Denys Vlasenko
6  *
7  * Licensed under GPL version 2, see file LICENSE in this tarball for details.
8  */
9 #include "libbb.h"
10 #include "unicode.h"
11
/* unicode.h may turn unicode_status into a #defined constant;
 * real storage is provided here only when it has not done so. */
#if !defined(unicode_status)
uint8_t unicode_status;
#endif
16
17 /* This file is compiled only if FEATURE_ASSUME_UNICODE is on.
18  * We check other options and decide whether to use libc support
19  * via locale, or use our own logic:
20  */
21
22 #if ENABLE_LOCALE_SUPPORT
23
24 /* Unicode support using libc locale support. */
25
26 void FAST_FUNC init_unicode(void)
27 {
28         /* In unicode, this is a one character string */
29         static const char unicode_0x394[] = { 0xce, 0x94, 0 };
30
31         if (unicode_status != UNICODE_UNKNOWN)
32                 return;
33
34         unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
35 }
36
37 #else
38
39 /* Homegrown Unicode support. It knows only C and Unicode locales. */
40
41 # if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
42 void FAST_FUNC init_unicode(void)
43 {
44         char *lang;
45
46         if (unicode_status != UNICODE_UNKNOWN)
47                 return;
48
49         unicode_status = UNICODE_OFF;
50         lang = getenv("LANG");
51         if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF")))
52                 return;
53         unicode_status = UNICODE_ON;
54 }
55 # endif
56
/*
 * Encode one wide character as UTF-8 into s and return the number
 * of bytes written (1..6). No terminating NUL is stored.
 *
 * RFC 3629 stops at 10FFFF, but the whole 32-bit range is encoded:
 *   80-7FF           -> 110yyyxx 10xxxxxx
 *   800-FFFF         -> 1110yyyy 10yyyyxx 10xxxxxx
 *   10000-1FFFFF     -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   200000-3FFFFFF   -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 */
static size_t wcrtomb_internal(char *s, wchar_t wc)
{
        uint32_t code = wc;
        uint32_t probe;
        int len, pos;

        /* Plain ASCII is stored as is */
        if (code < 0x80) {
                s[0] = code;
                return 1;
        }

        /* Each extra byte buys 5 more payload bits; cap the length at 6
         * (0x80000000 and above would otherwise ask for 7 bytes) */
        len = 2;
        for (probe = code; probe >= 0x800 && len < 6; probe >>= 5)
                len++;

        /* Continuation bytes, last one first: 6 payload bits each */
        for (pos = len - 1; pos > 0; pos--) {
                s[pos] = (code & 0x3f) | 0x80;
                code >>= 6;
        }

        /* Lead byte: 'len' high one-bits followed by the leftover payload */
        s[0] = code | (uint8_t)(0x3f00 >> len);
        return len;
}
93 size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
94 {
95         if (unicode_status != UNICODE_ON) {
96                 *s = wc;
97                 return 1;
98         }
99
100         return wcrtomb_internal(s, wc);
101 }
102 size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
103 {
104         size_t org_n = n;
105
106         if (unicode_status != UNICODE_ON) {
107                 while (n) {
108                         wchar_t c = *src++;
109                         *dest++ = c;
110                         if (c == 0)
111                                 break;
112                         n--;
113                 }
114                 return org_n - n;
115         }
116
117         while (n >= MB_CUR_MAX) {
118                 wchar_t wc = *src++;
119                 size_t len = wcrtomb_internal(dest, wc);
120
121                 if (wc == L'\0')
122                         return org_n - n;
123                 dest += len;
124                 n -= len;
125         }
126         while (n) {
127                 char tbuf[MB_CUR_MAX];
128                 wchar_t wc = *src++;
129                 size_t len = wcrtomb_internal(tbuf, wc);
130
131                 if (len > n)
132                         len = n;
133                 memcpy(dest, tbuf, len);
134                 if (wc == L'\0')
135                         return org_n - n;
136                 dest += len;
137                 n -= len;
138         }
139         return org_n - n;
140 }
141
#define ERROR_WCHAR (~(wchar_t)0)

/*
 * Decode one UTF-8 sequence starting at src.
 *
 * On success *res receives the code point and the returned pointer is
 * just past the sequence. On failure *res is set to ERROR_WCHAR and the
 * returned pointer is just past the bytes examined so far: past a bare
 * continuation byte, at the byte which should have been a continuation
 * byte, or past a complete but overlong sequence.
 *
 * Accepted layouts (the full 32-bit range, not only up to 10FFFF):
 *   80-7FF           -> 110yyyxx 10xxxxxx
 *   800-FFFF         -> 1110yyyy 10yyyyxx 10xxxxxx
 *   10000-1FFFFF     -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   200000-3FFFFFF   -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 */
static const char *mbstowc_internal(wchar_t *res, const char *src)
{
        /* Smallest value which genuinely needs a sequence of the given
         * total length in bytes (index 0 and 1 are unused) */
        static const uint32_t min_for_len[] = {
                0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
        };
        int bytes;
        int seq_len;
        unsigned c = (unsigned char) *src++;

        if (c <= 0x7f) {
                *res = c;
                return src;
        }

        /* Count the leading one-bits of the first byte (at most 6):
         * that is the total length of the sequence */
        bytes = 0;
        do {
                c <<= 1;
                bytes++;
        } while ((c & 0x80) && bytes < 6);
        if (bytes == 1) {
                /* A bare "continuation" byte. Say, 80 */
                *res = ERROR_WCHAR;
                return src;
        }
        /* Undo the shifts, dropping the length marker bits */
        c = (uint8_t)(c) >> bytes;
        seq_len = bytes;

        while (--bytes) {
                unsigned ch = (unsigned char) *src;
                if ((ch & 0xc0) != 0x80) {
                        /* Missing "continuation" byte. Example: e0 80 */
                        *res = ERROR_WCHAR;
                        return src;
                }
                c = (c << 6) + (ch & 0x3f);
                src++;
        }

        /* Reject overlong encodings: the value must not fit into a
         * shorter sequence. Examples:
         * 11000000 10000000 would decode to NUL,
         * 11110000 10000000 10000100 10000000 would decode to 0x100
         * (correct encoding: 11000100 10000000) */
        if (c < min_for_len[seq_len]) {
                *res = ERROR_WCHAR;
                return src;
        }

        *res = c;
        return src;
}
195 size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
196 {
197         size_t org_n = n;
198
199         if (unicode_status != UNICODE_ON) {
200                 while (n) {
201                         unsigned char c = *src++;
202
203                         if (dest)
204                                 *dest++ = c;
205                         if (c == 0)
206                                 break;
207                         n--;
208                 }
209                 return org_n - n;
210         }
211
212         while (n) {
213                 wchar_t wc;
214                 src = mbstowc_internal(&wc, src);
215                 if (wc == ERROR_WCHAR) /* error */
216                         return (size_t) -1L;
217                 if (dest)
218                         *dest++ = wc;
219                 if (wc == 0) /* end-of-string */
220                         break;
221                 n--;
222         }
223
224         return org_n - n;
225 }
226
227 int FAST_FUNC iswspace(wint_t wc)
228 {
229         return (unsigned)wc <= 0x7f && isspace(wc);
230 }
231
232 int FAST_FUNC iswalnum(wint_t wc)
233 {
234         return (unsigned)wc <= 0x7f && isalnum(wc);
235 }
236
237 int FAST_FUNC iswpunct(wint_t wc)
238 {
239         return (unsigned)wc <= 0x7f && ispunct(wc);
240 }
241
242 #include "unicode_wcwidth.c"
243
244 # if ENABLE_UNICODE_BIDI_SUPPORT
245 int FAST_FUNC unicode_isrtl(wint_t wc)
246 {
247         /* ranges taken from
248          * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
249          * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
250          */
251         static const struct interval rtl_b[] = {
252 #  define BIG_(a,b) { a, b },
253 #  define PAIR(a,b)
254                 PAIR(0x0590, 0x0590)
255                 PAIR(0x05BE, 0x05BE)
256                 PAIR(0x05C0, 0x05C0)
257                 PAIR(0x05C3, 0x05C3)
258                 PAIR(0x05C6, 0x05C6)
259                 BIG_(0x05C8, 0x05FF)
260                 PAIR(0x0604, 0x0605)
261                 PAIR(0x0608, 0x0608)
262                 PAIR(0x060B, 0x060B)
263                 PAIR(0x060D, 0x060D)
264                 BIG_(0x061B, 0x064A)
265                 PAIR(0x065F, 0x065F)
266                 PAIR(0x066D, 0x066F)
267                 BIG_(0x0671, 0x06D5)
268                 PAIR(0x06E5, 0x06E6)
269                 PAIR(0x06EE, 0x06EF)
270                 BIG_(0x06FA, 0x070E)
271                 PAIR(0x0710, 0x0710)
272                 BIG_(0x0712, 0x072F)
273                 BIG_(0x074B, 0x07A5)
274                 BIG_(0x07B1, 0x07EA)
275                 PAIR(0x07F4, 0x07F5)
276                 BIG_(0x07FA, 0x0815)
277                 PAIR(0x081A, 0x081A)
278                 PAIR(0x0824, 0x0824)
279                 PAIR(0x0828, 0x0828)
280                 BIG_(0x082E, 0x08FF)
281                 PAIR(0x200F, 0x200F)
282                 PAIR(0x202B, 0x202B)
283                 PAIR(0x202E, 0x202E)
284                 BIG_(0xFB1D, 0xFB1D)
285                 BIG_(0xFB1F, 0xFB28)
286                 BIG_(0xFB2A, 0xFD3D)
287                 BIG_(0xFD40, 0xFDCF)
288                 BIG_(0xFDC8, 0xFDCF)
289                 BIG_(0xFDF0, 0xFDFC)
290                 BIG_(0xFDFE, 0xFDFF)
291                 BIG_(0xFE70, 0xFEFE)
292                 /* Probably not necessary
293                 {0x10800, 0x1091E},
294                 {0x10920, 0x10A00},
295                 {0x10A04, 0x10A04},
296                 {0x10A07, 0x10A0B},
297                 {0x10A10, 0x10A37},
298                 {0x10A3B, 0x10A3E},
299                 {0x10A40, 0x10A7F},
300                 {0x10B36, 0x10B38},
301                 {0x10B40, 0x10E5F},
302                 {0x10E7F, 0x10FFF},
303                 {0x1E800, 0x1EFFF}
304                 */
305 #  undef BIG_
306 #  undef PAIR
307         };
308
309         static const uint16_t rtl_p[] = {
310 #  define BIG_(a,b)
311 #  define PAIR(a,b) (a << 2) | (b-a),
312                 /* Exact copy-n-paste of the above: */
313                 PAIR(0x0590, 0x0590)
314                 PAIR(0x05BE, 0x05BE)
315                 PAIR(0x05C0, 0x05C0)
316                 PAIR(0x05C3, 0x05C3)
317                 PAIR(0x05C6, 0x05C6)
318                 BIG_(0x05C8, 0x05FF)
319                 PAIR(0x0604, 0x0605)
320                 PAIR(0x0608, 0x0608)
321                 PAIR(0x060B, 0x060B)
322                 PAIR(0x060D, 0x060D)
323                 BIG_(0x061B, 0x064A)
324                 PAIR(0x065F, 0x065F)
325                 PAIR(0x066D, 0x066F)
326                 BIG_(0x0671, 0x06D5)
327                 PAIR(0x06E5, 0x06E6)
328                 PAIR(0x06EE, 0x06EF)
329                 BIG_(0x06FA, 0x070E)
330                 PAIR(0x0710, 0x0710)
331                 BIG_(0x0712, 0x072F)
332                 BIG_(0x074B, 0x07A5)
333                 BIG_(0x07B1, 0x07EA)
334                 PAIR(0x07F4, 0x07F5)
335                 BIG_(0x07FA, 0x0815)
336                 PAIR(0x081A, 0x081A)
337                 PAIR(0x0824, 0x0824)
338                 PAIR(0x0828, 0x0828)
339                 BIG_(0x082E, 0x08FF)
340                 PAIR(0x200F, 0x200F)
341                 PAIR(0x202B, 0x202B)
342                 PAIR(0x202E, 0x202E)
343                 BIG_(0xFB1D, 0xFB1D)
344                 BIG_(0xFB1F, 0xFB28)
345                 BIG_(0xFB2A, 0xFD3D)
346                 BIG_(0xFD40, 0xFDCF)
347                 BIG_(0xFDC8, 0xFDCF)
348                 BIG_(0xFDF0, 0xFDFC)
349                 BIG_(0xFDFE, 0xFDFF)
350                 BIG_(0xFE70, 0xFEFE)
351                 /* Probably not necessary
352                 {0x10800, 0x1091E},
353                 {0x10920, 0x10A00},
354                 {0x10A04, 0x10A04},
355                 {0x10A07, 0x10A0B},
356                 {0x10A10, 0x10A37},
357                 {0x10A3B, 0x10A3E},
358                 {0x10A40, 0x10A7F},
359                 {0x10B36, 0x10B38},
360                 {0x10B40, 0x10E5F},
361                 {0x10E7F, 0x10FFF},
362                 {0x1E800, 0x1EFFF}
363                 */
364 #  undef BIG_
365 #  undef PAIR
366         };
367
368         if (in_interval_table(wc, rtl_b, ARRAY_SIZE(rtl_b) - 1))
369                 return 1;
370         if (in_uint16_table(wc, rtl_p, ARRAY_SIZE(rtl_p) - 1))
371                 return 1;
372         return 0;
373 }
374 # endif /* UNICODE_BIDI_SUPPORT */
375
376 #endif /* Homegrown Unicode support */
377
378
379 /* The rest is mostly same for libc and for "homegrown" support */
380
381 size_t FAST_FUNC unicode_strlen(const char *string)
382 {
383         size_t width = mbstowcs(NULL, string, INT_MAX);
384         if (width == (size_t)-1L)
385                 return strlen(string);
386         return width;
387 }
388
389 static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char *src, unsigned width, int flags)
390 {
391         char *dst;
392         unsigned dst_len;
393         unsigned uni_count;
394         unsigned uni_width;
395
396         if (unicode_status != UNICODE_ON) {
397                 char *d;
398                 if (flags & UNI_FLAG_PAD) {
399                         d = dst = xmalloc(width + 1);
400                         while ((int)--width >= 0) {
401                                 unsigned char c = *src;
402                                 if (c == '\0') {
403                                         do
404                                                 *d++ = ' ';
405                                         while ((int)--width >= 0);
406                                         break;
407                                 }
408                                 *d++ = (c >= ' ' && c < 0x7f) ? c : '?';
409                                 src++;
410                         }
411                         *d = '\0';
412                 } else {
413                         d = dst = xstrndup(src, width);
414                         while (*d) {
415                                 unsigned char c = *d;
416                                 if (c < ' ' || c >= 0x7f)
417                                         *d = '?';
418                                 d++;
419                         }
420                 }
421                 if (stats)
422                         stats->byte_count = stats->unicode_count = (d - dst);
423                 return dst;
424         }
425
426         dst = NULL;
427         uni_count = uni_width = 0;
428         dst_len = 0;
429         while (1) {
430                 int w;
431                 wchar_t wc;
432
433 #if ENABLE_LOCALE_SUPPORT
434                 {
435                         mbstate_t mbst = { 0 };
436                         ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
437                         /* If invalid sequence is seen: -1 is returned,
438                          * src points to the invalid sequence, errno = EILSEQ.
439                          * Else number of wchars (excluding terminating L'\0')
440                          * written to dest is returned.
441                          * If len (here: 1) non-L'\0' wchars stored at dest,
442                          * src points to the next char to be converted.
443                          * If string is completely converted: src = NULL.
444                          */
445                         if (rc == 0) /* end-of-string */
446                                 break;
447                         if (rc < 0) { /* error */
448                                 src++;
449                                 goto subst;
450                         }
451                         if (!iswprint(wc))
452                                 goto subst;
453                 }
454 #else
455                 src = mbstowc_internal(&wc, src);
456                 /* src is advanced to next mb char
457                  * wc == ERROR_WCHAR: invalid sequence is seen
458                  * else: wc is set
459                  */
460                 if (wc == ERROR_WCHAR) /* error */
461                         goto subst;
462                 if (wc == 0) /* end-of-string */
463                         break;
464 #endif
465                 if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
466                         goto subst;
467                 w = wcwidth(wc);
468                 if ((ENABLE_UNICODE_COMBINING_WCHARS && w < 0) /* non-printable wchar */
469                  || (!ENABLE_UNICODE_COMBINING_WCHARS && w <= 0)
470                  || (!ENABLE_UNICODE_WIDE_WCHARS && w > 1)
471                 ) {
472  subst:
473                         wc = CONFIG_SUBST_WCHAR;
474                         w = 1;
475                 }
476                 width -= w;
477                 /* Note: if width == 0, we still may add more chars,
478                  * they may be zero-width or combining ones */
479                 if ((int)width < 0) {
480                         /* can't add this wc, string would become longer than width */
481                         width += w;
482                         break;
483                 }
484
485                 uni_count++;
486                 uni_width += w;
487                 dst = xrealloc(dst, dst_len + MB_CUR_MAX);
488 #if ENABLE_LOCALE_SUPPORT
489                 {
490                         mbstate_t mbst = { 0 };
491                         dst_len += wcrtomb(&dst[dst_len], wc, &mbst);
492                 }
493 #else
494                 dst_len += wcrtomb_internal(&dst[dst_len], wc);
495 #endif
496         }
497
498         /* Pad to remaining width */
499         if (flags & UNI_FLAG_PAD) {
500                 dst = xrealloc(dst, dst_len + width + 1);
501                 uni_count += width;
502                 uni_width += width;
503                 while ((int)--width >= 0) {
504                         dst[dst_len++] = ' ';
505                 }
506         }
507         dst[dst_len] = '\0';
508         if (stats) {
509                 stats->byte_count = dst_len;
510                 stats->unicode_count = uni_count;
511                 stats->unicode_width = uni_width;
512         }
513
514         return dst;
515 }
516 char* FAST_FUNC unicode_conv_to_printable(uni_stat_t *stats, const char *src)
517 {
518         return unicode_conv_to_printable2(stats, src, INT_MAX, 0);
519 }
520 char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth)
521 {
522         return unicode_conv_to_printable2(stats, src, maxwidth, 0);
523 }
524 char* FAST_FUNC unicode_conv_to_printable_fixedwidth(uni_stat_t *stats, const char *src, unsigned width)
525 {
526         return unicode_conv_to_printable2(stats, src, width, UNI_FLAG_PAD);
527 }
528
529 #ifdef UNUSED
530 unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
531 {
532         if (unicode_status != UNICODE_ON) {
533                 return width - strnlen(src, width);
534         }
535
536         while (1) {
537                 int w;
538                 wchar_t wc;
539
540 #if ENABLE_LOCALE_SUPPORT
541                 {
542                         mbstate_t mbst = { 0 };
543                         ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
544                         if (rc <= 0) /* error, or end-of-string */
545                                 return width;
546                 }
547 #else
548                 src = mbstowc_internal(&wc, src);
549                 if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */
550                         return width;
551 #endif
552                 w = wcwidth(wc);
553                 if (w < 0) /* non-printable wchar */
554                         return width;
555                 width -= w;
556                 if ((int)width <= 0) /* string is longer than width */
557                         return 0;
558         }
559 }
560 #endif