19 #define SHIFT_JIS 0321
24 /* FIXME: these are not implemented yet
26 * GBK: 81-FE 40-7E,80-FE
27 * Big5: A1-FE 40-7E,A1-FE
30 /* Definitions of charmaps. Each charmap consists of:
31 * 1. Empty-string-terminated list of null-terminated aliases.
32 * 2. Special type code or number of elided entries.
33 * 3. Character table (size determined by field 2). */
/* Built-in charmap entries. In each visible entry the byte that follows the
 * empty alias is the type code (field 2 in the layout described above); for
 * example the shiftjis entry ends in \321, the value #defined as SHIFT_JIS. */
35 static const unsigned char charmaps[] =
38 "ucs2\0ucs2be\0\0\304"
40 "utf16\0utf16be\0\0\302"
42 "ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
43 "ucs4le\0utf32le\0\0\303"
44 "ascii\0usascii\0iso646\0iso646us\0\0\307"
46 "shiftjis\0sjis\0\0\321"
50 #include "codepages.h"
/* Table of 16-bit values; iconv() indexes it with the 10-bit numbers it
 * unpacks from a charmap's character table. */
53 static const unsigned short legacy_chars[] = {
54 #include "legacychars.h"
/* 84 x 94 table. iconv() rejects a pair with c >= 84 or d >= 94, presumably
 * just before indexing this table -- the lookup itself is not visible here. */
57 static const unsigned short jis0208[84][94] = {
/* 126 x 190 table. iconv() indexes it and also walks every cell with a
 * 126 x 190 nested loop. */
61 static const unsigned short gb18030[126][190] = {
/* Loose comparison of a name `a` against a stored alias `b`.
 * Characters of `a` are folded with |32 before being compared with `b`, and
 * characters of `a` that fail both range tests below are skipped.
 * Returns 1 as soon as a folded character of `a` differs from `b`; the
 * final return value is on a line not visible in this excerpt. */
65 static int fuzzycmp(const unsigned char *a, const unsigned char *b)
67 for (; *a && *b; a++, b++) {
/* Skip characters that are neither letters nor digits.
 * NOTE(review): `>26` and `>10U` also accept the value one past the end of
 * each range ('[' and '{' fold to 'a'+26, ':' is '0'+10), so those three
 * characters are not skipped; `>=` looks intended -- confirm. */
68 while (*a && (*a|32U)-'a'>26 && *a-'0'>10U) a++;
69 if ((*a|32U) != *b) return 1;
/* Looks `name` up among the aliases stored in charmaps, using fuzzycmp.
 * The return statements are not visible in this excerpt; iconv_open compares
 * the result against -1 for failure and otherwise uses it as an index into
 * charmaps. */
74 static size_t find_charmap(const void *name)
76 const unsigned char *s;
/* Walk the alias strings until an empty string ends the whole table. */
77 for (s=charmaps; *s; ) {
78 if (!fuzzycmp(name, s)) {
/* Match: step over the remaining aliases of this entry. */
79 for (; *s; s+=strlen((void *)s)+1);
/* No match: advance to the next null-terminated alias. */
82 s += strlen((void *)s)+1;
/* Step over the field that follows the alias list. A byte above 0200 is a
 * bare type code (2 bytes to skip); otherwise it is a count of elided
 * entries and the table occupies 5 bytes per 4 of the remaining 128-count
 * entries. NOTE(review): the condition guarding these two lines is on an
 * elided line -- presumably "end of alias list reached"; confirm. */
84 if (s[1] > 0200) s+=2;
85 else s+=2+(128U-s[1])/4*5;
/* Creates a conversion descriptor from charset `from` to charset `to`.
 * The visible return builds the descriptor from the two find_charmap
 * results alone, packing them into the pointer value (source index in the
 * upper bits, destination index in the low 16 bits). */
91 iconv_t iconv_open(const char *to, const char *from)
/* Take the failure path (body elided) when either name is unknown or when
 * the byte at the destination index is 0320 or above, a range that includes
 * SHIFT_JIS (0321). */
95 if ((t = find_charmap(to))==-1
96 || (f = find_charmap(from))==-1
97 || (charmaps[t] >= 0320)) {
102 return (void *)(f<<16 | t);
/* Closes a conversion descriptor. The body is not visible in this excerpt.
 * NOTE(review): the visible return in iconv_open encodes the descriptor
 * directly in the pointer value, so presumably there is nothing to release
 * here -- confirm against the body. */
105 int iconv_close(iconv_t cd)
/* Reads a 16-bit value from the two bytes at `s`.
 * With e == 0 the first byte is the high byte; with e == 1 the second is.
 * NOTE(review): callers pass a charmap type code as `e`, so it is presumably
 * reduced to 0/1 on an elided line before the return -- confirm. */
110 static unsigned get_16(const unsigned char *s, int e)
113 return s[e]<<8 | s[1-e];
/* Stores the 16-bit value `c` at `s`; iconv() passes the destination type
 * code as `e`. Body not visible in this excerpt -- presumably the byte-order
 * counterpart of get_16; confirm. */
116 static void put_16(unsigned char *s, unsigned c, int e)
/* Reads a 32-bit value from the four bytes at `s`.
 * The byte at index e is the most significant and the byte at index e^3 the
 * least, so e == 0 reads most-significant-first and e == 3 the reverse.
 * NOTE(review): callers pass a charmap type code as `e`, so it is presumably
 * masked to that range on an elided line -- confirm. */
123 static unsigned get_32(const unsigned char *s, int e)
/* The +0U makes the top byte unsigned before it is shifted left by 24. */
126 return s[e]+0U<<24 | s[e^1]<<16 | s[e^2]<<8 | s[e^3];
/* Stores the 32-bit value `c` at `s`; iconv() passes the destination type
 * code as `e`. Body not visible in this excerpt -- presumably the byte-order
 * counterpart of get_32; confirm. */
129 static void put_32(unsigned char *s, unsigned c, int e)
138 /* Adapt as needed */
139 #define mbrtowc_utf8 mbrtowc
140 #define wctomb_utf8 wctomb
/* Converts characters from the buffer at *in (*inb bytes remaining) into the
 * buffer at *out (*outb bytes remaining), advancing the pointers and counts
 * as it goes. The descriptor is an integer: its low 16 bits index the
 * destination entry in charmaps and the bits above index the source entry.
 * Many lines of this function (case labels, the error labels ilseq / toobig
 * / starved, and the return) are not visible in this excerpt. */
142 size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
145 unsigned long cd = (unsigned long)cd0;
146 unsigned to = cd & 0xffff;
147 unsigned from = cd >> 16;
148 const unsigned char *map = charmaps+from+1;
149 const unsigned char *tomap = charmaps+to+1;
/* The byte just before each table pointer is that charset's type code or
 * elided-entry count. */
155 unsigned char type = map[-1];
156 unsigned char totype = tomap[-1];
/* No input buffer or no input bytes: nothing to convert. */
158 if (!in || !*in || !*inb) return 0;
/* One character per iteration; l is the number of input bytes consumed. */
160 for (; *inb; *in+=l, *inb-=l) {
161 c = *(unsigned char *)*in;
/* Bytes below 128 skip decoding unless the source type is one of the seven
 * codes starting at UTF_32BE. */
164 if (c >= 128 || type-UTF_32BE < 7U) switch (type) {
/* Multibyte decode through mbrtowc_utf8; -1 is an invalid sequence and -2 an
 * incomplete one. */
166 l = mbrtowc_utf8(&wc, *in, *inb, &st);
168 else if (l == (size_t)-1) goto ilseq;
169 else if (l == (size_t)-2) goto starved;
176 if (*inb < l) goto starved;
/* 32-bit input: needs four bytes; surrogate values and anything at or above
 * 0x110000 are rejected. */
182 if (*inb < 4) goto starved;
183 c = get_32((void *)*in, type);
185 if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
/* 16-bit input: a unit in 0xdc00..0xdfff on its own is invalid. A unit in
 * 0xd800..0xdbff is invalid for the two UCS2 types; otherwise it must be
 * followed by a unit in 0xdc00..0xdfff and the pair is combined. */
192 if (*inb < 2) goto starved;
193 c = get_16((void *)*in, type);
194 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
195 if ((unsigned)(c-0xd800) < 0x400) {
196 if (type-UCS2BE < 2U) goto ilseq;
198 if (*inb < 4) goto starved;
199 d = get_16((void *)(*in + 2), type);
200 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
/* 0xd7c0 is 0xd800 - 0x40, so this also adds the 0x10000 offset. */
201 c = ((c-0xd7c0)<<10) + (d-0xdc00);
/* NOTE(review): presumably the SHIFT_JIS case (label elided). Bytes
 * 0xa1..0xdf are handled on their own; lead bytes 129..159 and 224..239 take
 * a second byte in 64..158 (127 excluded) or 159..252. */
205 if (c-0xa1 <= 0xdf-0xa1) {
210 if (*inb < 2) goto starved;
211 d = *((unsigned char *)*in + 1);
212 if (c-129 <= 159-129) c -= 129;
213 else if (c-224 <= 239-224) c -= 193;
216 if (d-64 <= 158-64) {
217 if (d==127) goto ilseq;
220 } else if (d-159 <= 252-159) {
/* NOTE(review): another two-byte charset whose case label is elided. The
 * final bounds check matches the dimensions of jis0208[84][94]. */
229 if (*inb < 2) goto starved;
230 d = *((unsigned char *)*in + 1);
233 if (c-0xa1 > 0xdf-0xa1) goto ilseq;
239 if (c >= 84 || d >= 94) goto ilseq;
/* GB2312 / GB18030 family (both names are tested below). A second byte
 * outside 0x40..0xfe, or equal to 127, is only accepted when the type is
 * GB18030 and the byte is a decimal digit, which starts a four-byte form. */
244 if (c < 0xa1) goto ilseq;
248 if (c >= 126) goto ilseq;
250 if (*inb < 2) goto starved;
251 d = *((unsigned char *)*in + 1);
252 if (d < 0xa1 && type == GB2312) goto ilseq;
253 if (d-0x40>=191 || d==127) {
254 if (d-'0'>9 || type != GB18030)
257 if (*inb < 4) goto starved;
258 c = (10*c + d-'0') * 1260;
259 d = *((unsigned char *)*in + 2);
260 if (d-0x81>126) goto ilseq;
262 d = *((unsigned char *)*in + 3);
263 if (d-'0'>9) goto ilseq;
/* Full scan of the gb18030 table; the loop body is elided. */
268 for (int i=0; i<126; i++)
269 for (int j=0; j<190; j++)
270 if (gb18030[i][j]-d <= c-d)
/* 8-bit charset with a packed table: here `type` is the count of elided
 * entries, so bytes below 128+type are taken as they are. */
282 if (c < 128+type) break;
/* Table entries are 10 bits wide, four per five bytes; the extracted value
 * indexes legacy_chars. */
284 c = legacy_chars[ map[c*5/4]>>2*c%8 |
285 map[c*5/4+1]<<8-2*c%8 & 1023 ];
/* Lookup result 0 means "use the input byte itself"; 1 marks an invalid
 * byte. */
286 if (!c) c = *(unsigned char *)*in;
287 if (c==1) goto ilseq;
/* Output as a raw wchar_t. */
292 if (*outb < sizeof(wchar_t)) goto toobig;
293 *(wchar_t *)*out = c;
294 *out += sizeof(wchar_t);
295 *outb -= sizeof(wchar_t);
/* Output through wctomb_utf8: either via the temporary `tmp` followed by a
 * size check and copy, or straight into *out. The condition selecting
 * between the two is elided. */
300 k = wctomb_utf8(tmp, c);
301 if (*outb < k) goto toobig;
302 memcpy(*out, tmp, k);
303 } else k = wctomb_utf8(*out, c);
/* Substitution: the character becomes '*' and x is incremented. Reached by
 * fall-through for c > 0x7f; the `subst` label can also be jumped to. */
308 if (c > 0x7f) subst: x++, c='*';
/* 8-bit output: values below 128+totype are emitted unchanged (body
 * elided). */
310 if (*outb < 1) goto toobig;
311 if (c < 128+totype) {
/* Reverse lookup of d among the table entries.
 * NOTE(review): the loop bound uses totype, but the table read is `map`
 * (the SOURCE charset's table) rather than `tomap`. For a destination
 * lookup `tomap` looks intended -- confirm and fix. */
318 for (c=0; c<128-totype; c++) {
319 if (d == legacy_chars[ map[c*5/4]>>2*c%8 |
320 map[c*5/4+1]<<8-2*c%8 & 1023 ]) {
/* 16-bit output. Values at or above 0x10000 are replaced by 0xFFFD when the
 * UCS2 test holds; otherwise they are written as two 16-bit units.
 * NOTE(review): the UCS2 test reads `type` (source) although everything else
 * in this output branch uses `totype`; as written a UCS2 destination could
 * receive surrogate pairs. `totype-UCS2BE < 2U` looks intended -- confirm. */
330 if (c < 0x10000 || type-UCS2BE < 2U) {
331 if (c >= 0x10000) c = 0xFFFD;
332 if (*outb < 2) goto toobig;
333 put_16((void *)*out, c, totype);
338 if (*outb < 4) goto toobig;
/* NOTE(review): for these two writes to form a valid pair, c must already
 * have had 0x10000 subtracted -- presumably on the elided line just above;
 * confirm. */
340 put_16((void *)*out, (c>>10)|0xd800, totype);
341 put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
/* 32-bit output. */
347 if (*outb < 4) goto toobig;
348 put_32((void *)*out, c, totype);