From 1a63a9fc30e7a1f1239e3cedcb5041e5ec1c5351 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 23 Apr 2012 19:19:26 -0400 Subject: [PATCH] sync case mappings with unicode 6.1 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit also special-case ß (U+00DF) as lowercase even though it does not have a mapping to uppercase. unicode added an uppercase version of this character but does not map it, presumably because the uppercase version is not actually used except for some obscure purpose... --- src/ctype/iswlower.c | 2 +- src/ctype/towctrans.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/ctype/iswlower.c b/src/ctype/iswlower.c index 0a568e77..438fe26a 100644 --- a/src/ctype/iswlower.c +++ b/src/ctype/iswlower.c @@ -2,5 +2,5 @@ int iswlower(wint_t wc) { - return towupper(wc) != wc; + return towupper(wc) != wc || wc == 0xdf; } diff --git a/src/ctype/towctrans.c b/src/ctype/towctrans.c index 0b1eed04..2842d690 100644 --- a/src/ctype/towctrans.c +++ b/src/ctype/towctrans.c @@ -30,6 +30,7 @@ static const struct { CASELACE(0x4c1,0x4cd), CASELACE(0x4d0,0x50e), + CASELACE(0x514,0x526), CASEMAP(0x531,0x556,0x561), CASELACE(0x01a0,0x01a4), @@ -69,12 +70,19 @@ static const struct { CASEMAP(0x2c00,0x2c2e,0x2c30), CASELACE(0x2c67,0x2c6b), CASELACE(0x2c80,0x2ce2), + CASELACE(0x2ceb,0x2ced), + + CASELACE(0xa640,0xa66c), + CASELACE(0xa680,0xa696), CASELACE(0xa722,0xa72e), CASELACE(0xa732,0xa76e), CASELACE(0xa779,0xa77b), CASELACE(0xa77e,0xa786), + CASELACE(0xa790,0xa792), + CASELACE(0xa7a0,0xa7a8), + CASEMAP(0xff21,0xff3a,0xff41), { 0,0,0 } }; @@ -144,6 +152,8 @@ static const unsigned short pairs[][2] = { { 0x03f7, 0x03f8 }, { 0x03fa, 0x03fb }, { 0x1e60, 0x1e9b }, + { 0xdf, 0xdf }, + { 0x1e9e, 0xdf }, { 0x1f59, 0x1f51 }, { 0x1f5b, 0x1f53 }, @@ -181,10 +191,20 @@ static const unsigned short pairs[][2] = { { 0x2c6d, 0x251 }, { 0x2c6e, 0x271 }, { 0x2c6f, 0x250 }, + { 0x2c70, 0x252 }, { 0x2c72, 0x2c73 }, { 0x2c75, 0x2c76 }, + { 0x2c7e, 0x23f }, + { 0x2c7f, 0x240 }, + { 0x2cf2, 0x2cf3 }, { 0xa77d, 0x1d79 }, + { 0xa78b, 0xa78c }, + { 0xa78d, 0x265 }, + { 0xa7aa, 0x266 }, + + { 0x10c7, 0x2d27 }, + { 0x10cd, 0x2d2d }, /* bogus greek 'symbol' letters */ { 0x376, 0x377 }, @@ -207,17 +227,19 @@ static wchar_t __towcase(wchar_t wc, int lower) int i; int lmul = 2*lower-1; int lmask = lower-1; - if ((unsigned)wc - 0x10400 < 0x50) - return wc + lmul*0x28; /* no letters with case in these large ranges */ if (!iswalpha(wc) || (unsigned)wc - 0x0600 <= 0x0fff-0x0600 - || (unsigned)wc - 0x2e00 <= 0xa6ff-0x2e00 + || (unsigned)wc - 0x2e00 <= 0xa63f-0x2e00 || (unsigned)wc - 0xa800 <= 0xfeff-0xa800) return wc; /* special case because the diff between upper/lower is too big */ - if ((unsigned)wc - 0x10a0 < 0x26 || (unsigned)wc - 0x2d00 < 0x26) - return wc + lmul*(0x2d00-0x10a0); + if (lower && (unsigned)wc - 0x10a0 < 0x2e) + if (wc>0x10c5 && wc != 0x10c7 && wc != 0x10cd) return wc; + else return wc + 0x2d00 - 0x10a0; + if (!lower && (unsigned)wc - 0x2d00 < 0x26) + if (wc>0x2d25 && wc != 0x2d27 && wc != 0x2d2d) return wc; + else return wc + 0x10a0 - 0x2d00; for (i=0; casemaps[i].len; i++) { int base = casemaps[i].upper + (lmask & casemaps[i].lower); if ((unsigned)wc-base < casemaps[i].len) { @@ -230,8 +252,8 @@ static wchar_t __towcase(wchar_t wc, int lower) if (pairs[i][1-lower] == wc) return pairs[i][lower]; } - if ((unsigned)wc - 0x10428 + (lower<<5) + (lower<<3) < 0x28) - return wc - 0x28 + (lower<<10) + (lower<<6); + if ((unsigned)wc - (0x10428 - 0x28*lower) < 0x28) + return wc - 0x28 + 0x50*lower; return wc; } -- 2.25.1