Merge tag 'fsl-qoriq-for-v2018.11-rc1' of git://git.denx.de/u-boot-fsl-qoriq
[oweals/u-boot.git] / lib / charset.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  *  charset conversion utils
4  *
5  *  Copyright (c) 2017 Rob Clark
6  */
7
8 #include <common.h>
9 #include <charset.h>
10 #include <capitalization.h>
11 #include <malloc.h>
12
13 static struct capitalization_table capitalization_table[] =
14 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
15         UNICODE_CAPITALIZATION_TABLE;
16 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
17         CP1250_CAPITALIZATION_TABLE;
18 #else
19         CP437_CAPITALIZATION_TABLE;
20 #endif
21
22 /**
23  * get_code() - read Unicode code point from UTF-8 stream
24  *
25  * @read_u8:    - stream reader
26  * @src:        - string buffer passed to stream reader, optional
27  * Return:      - Unicode code point
28  */
29 static int get_code(u8 (*read_u8)(void *data), void *data)
30 {
31         s32 ch = 0;
32
33         ch = read_u8(data);
34         if (!ch)
35                 return 0;
36         if (ch >= 0xc2 && ch <= 0xf4) {
37                 int code = 0;
38
39                 if (ch >= 0xe0) {
40                         if (ch >= 0xf0) {
41                                 /* 0xf0 - 0xf4 */
42                                 ch &= 0x07;
43                                 code = ch << 18;
44                                 ch = read_u8(data);
45                                 if (ch < 0x80 || ch > 0xbf)
46                                         goto error;
47                                 ch &= 0x3f;
48                         } else {
49                                 /* 0xe0 - 0xef */
50                                 ch &= 0x0f;
51                         }
52                         code += ch << 12;
53                         if ((code >= 0xD800 && code <= 0xDFFF) ||
54                             code >= 0x110000)
55                                 goto error;
56                         ch = read_u8(data);
57                         if (ch < 0x80 || ch > 0xbf)
58                                 goto error;
59                 }
60                 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
61                 ch &= 0x3f;
62                 code += ch << 6;
63                 ch = read_u8(data);
64                 if (ch < 0x80 || ch > 0xbf)
65                         goto error;
66                 ch &= 0x3f;
67                 ch += code;
68         } else if (ch >= 0x80) {
69                 goto error;
70         }
71         return ch;
72 error:
73         return '?';
74 }
75
76 /**
77  * read_string() - read byte from character string
78  *
79  * @data:       - pointer to string
80  * Return:      - byte read
81  *
82  * The string pointer is incremented if it does not point to '\0'.
83  */
84 static u8 read_string(void *data)
85
86 {
87         const char **src = (const char **)data;
88         u8 c;
89
90         if (!src || !*src || !**src)
91                 return 0;
92         c = **src;
93         ++*src;
94         return c;
95 }
96
97 /**
98  * read_console() - read byte from console
99  *
100  * @src         - not used, needed to match interface
101  * Return:      - byte read
102  */
103 static u8 read_console(void *data)
104 {
105         return getc();
106 }
107
108 int console_read_unicode(s32 *code)
109 {
110         if (!tstc()) {
111                 /* No input available */
112                 return 1;
113         }
114
115         /* Read Unicode code */
116         *code = get_code(read_console, NULL);
117         return 0;
118 }
119
120 s32 utf8_get(const char **src)
121 {
122         return get_code(read_string, src);
123 }
124
125 int utf8_put(s32 code, char **dst)
126 {
127         if (!dst || !*dst)
128                 return -1;
129         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
130                 return -1;
131         if (code <= 0x007F) {
132                 **dst = code;
133         } else {
134                 if (code <= 0x07FF) {
135                         **dst = code >> 6 | 0xC0;
136                 } else {
137                         if (code < 0x10000) {
138                                 **dst = code >> 12 | 0xE0;
139                         } else {
140                                 **dst = code >> 18 | 0xF0;
141                                 ++*dst;
142                                 **dst = (code >> 12 & 0x3F) | 0x80;
143                         }
144                         ++*dst;
145                         **dst = (code >> 6 & 0x3F) | 0x80;
146                 }
147                 ++*dst;
148                 **dst = (code & 0x3F) | 0x80;
149         }
150         ++*dst;
151         return 0;
152 }
153
154 size_t utf8_utf16_strnlen(const char *src, size_t count)
155 {
156         size_t len = 0;
157
158         for (; *src && count; --count)  {
159                 s32 code = utf8_get(&src);
160
161                 if (!code)
162                         break;
163                 if (code < 0) {
164                         /* Reserve space for a replacement character */
165                         len += 1;
166                 } else if (code < 0x10000) {
167                         len += 1;
168                 } else {
169                         len += 2;
170                 }
171         }
172         return len;
173 }
174
175 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
176 {
177         if (!src || !dst || !*dst)
178                 return -1;
179
180         for (; count && *src; --count) {
181                 s32 code = utf8_get(&src);
182
183                 if (code < 0)
184                         code = '?';
185                 utf16_put(code, dst);
186         }
187         **dst = 0;
188         return 0;
189 }
190
191 s32 utf16_get(const u16 **src)
192 {
193         s32 code, code2;
194
195         if (!src || !*src)
196                 return -1;
197         if (!**src)
198                 return 0;
199         code = **src;
200         ++*src;
201         if (code >= 0xDC00 && code <= 0xDFFF)
202                 return -1;
203         if (code >= 0xD800 && code <= 0xDBFF) {
204                 if (!**src)
205                         return -1;
206                 code &= 0x3ff;
207                 code <<= 10;
208                 code += 0x10000;
209                 code2 = **src;
210                 ++*src;
211                 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
212                         return -1;
213                 code2 &= 0x3ff;
214                 code += code2;
215         }
216         return code;
217 }
218
219 int utf16_put(s32 code, u16 **dst)
220 {
221         if (!dst || !*dst)
222                 return -1;
223         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
224                 return -1;
225         if (code < 0x10000) {
226                 **dst = code;
227         } else {
228                 code -= 0x10000;
229                 **dst = code >> 10 | 0xD800;
230                 ++*dst;
231                 **dst = (code & 0x3ff) | 0xDC00;
232         }
233         ++*dst;
234         return 0;
235 }
236
237 size_t utf16_strnlen(const u16 *src, size_t count)
238 {
239         size_t len = 0;
240
241         for (; *src && count; --count)  {
242                 s32 code = utf16_get(&src);
243
244                 if (!code)
245                         break;
246                 /*
247                  * In case of an illegal sequence still reserve space for a
248                  * replacement character.
249                  */
250                 ++len;
251         }
252         return len;
253 }
254
255 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
256 {
257         size_t len = 0;
258
259         for (; *src && count; --count)  {
260                 s32 code = utf16_get(&src);
261
262                 if (!code)
263                         break;
264                 if (code < 0)
265                         /* Reserve space for a replacement character */
266                         len += 1;
267                 else if (code < 0x80)
268                         len += 1;
269                 else if (code < 0x800)
270                         len += 2;
271                 else if (code < 0x10000)
272                         len += 3;
273                 else
274                         len += 4;
275         }
276         return len;
277 }
278
279 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
280 {
281         if (!src || !dst || !*dst)
282                 return -1;
283
284         for (; count && *src; --count) {
285                 s32 code = utf16_get(&src);
286
287                 if (code < 0)
288                         code = '?';
289                 utf8_put(code, dst);
290         }
291         **dst = 0;
292         return 0;
293 }
294
295 s32 utf_to_lower(const s32 code)
296 {
297         struct capitalization_table *pos = capitalization_table;
298         s32 ret = code;
299
300         if (code <= 0x7f) {
301                 if (code >= 'A' && code <= 'Z')
302                         ret += 0x20;
303                 return ret;
304         }
305         for (; pos->upper; ++pos) {
306                 if (pos->upper == code) {
307                         ret = pos->lower;
308                         break;
309                 }
310         }
311         return ret;
312 }
313
314 s32 utf_to_upper(const s32 code)
315 {
316         struct capitalization_table *pos = capitalization_table;
317         s32 ret = code;
318
319         if (code <= 0x7f) {
320                 if (code >= 'a' && code <= 'z')
321                         ret -= 0x20;
322                 return ret;
323         }
324         for (; pos->lower; ++pos) {
325                 if (pos->lower == code) {
326                         ret = pos->upper;
327                         break;
328                 }
329         }
330         return ret;
331 }
332
333 size_t u16_strlen(const u16 *in)
334 {
335         size_t i;
336         for (i = 0; in[i]; i++);
337         return i;
338 }
339
340 size_t u16_strnlen(const u16 *in, size_t count)
341 {
342         size_t i;
343         for (i = 0; count-- && in[i]; i++);
344         return i;
345 }
346
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * @dest:       output buffer; up to three bytes are written per input unit
 * @src:        UTF-16 input
 * @size:       number of 16 bit units to convert
 * Return:      pointer to the byte following the last byte written
 *
 * Exactly @size units are read; a 0 unit does not end the conversion and no
 * terminator is appended. Each unpaired surrogate is replaced by '?'.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
        uint32_t code_high = 0;

        while (size--) {
                uint32_t code = *src++;

                if (code_high) {
                        if (code >= 0xDC00 && code <= 0xDFFF) {
                                /* Surrogate pair.  */
                                code = ((code_high - 0xD800) << 10) +
                                       (code - 0xDC00) + 0x10000;

                                *dest++ = (code >> 18) | 0xF0;
                                *dest++ = ((code >> 12) & 0x3F) | 0x80;
                                *dest++ = ((code >> 6) & 0x3F) | 0x80;
                                *dest++ = (code & 0x3F) | 0x80;
                        } else {
                                /* Unpaired high surrogate */
                                *dest++ = '?';
                                /*
                                 * The current unit may be valid on its own:
                                 * hand it back for the next iteration. The
                                 * remaining count has to be restored as well,
                                 * otherwise the last input unit is lost.
                                 */
                                src--;
                                size++;
                        }

                        code_high = 0;
                } else if (code <= 0x007F) {
                        *dest++ = code;
                } else if (code <= 0x07FF) {
                        *dest++ = (code >> 6) | 0xC0;
                        *dest++ = (code & 0x3F) | 0x80;
                } else if (code >= 0xD800 && code <= 0xDBFF) {
                        /* Wait for the matching low surrogate */
                        code_high = code;
                } else if (code >= 0xDC00 && code <= 0xDFFF) {
                        /* Low surrogate without preceding high surrogate */
                        *dest++ = '?';
                } else {
                        /* A 16 bit unit never exceeds 0xFFFF: three bytes */
                        *dest++ = (code >> 12) | 0xE0;
                        *dest++ = ((code >> 6) & 0x3F) | 0x80;
                        *dest++ = (code & 0x3F) | 0x80;
                }
        }

        /* Input ended in the middle of a surrogate pair */
        if (code_high)
                *dest++ = '?';

        return dest;
}