cde/programs/nsgmls/UTF8CodingSystem.C

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
 * License along with these libraries and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /* $XConsortium: UTF8CodingSystem.C /main/1 1996/07/29 17:07:15 cde-hp $ */
  24 // Copyright (c) 1994 James Clark
  25 // See the file COPYING for copying permission.
  26
  27 #include "splib.h"
  28
  29 #ifdef SP_MULTI_BYTE
  30
  31 #include "UTF8CodingSystem.h"
  32 #include "constant.h"
  33 #include <iostream.h>
  34
  35 #ifdef SP_NAMESPACE
  36 namespace SP_NAMESPACE {
  37 #endif
  38
  39 enum {
  40   // cmaskN is mask for first byte to test for N byte sequence
  41   cmask1 = 0x80,
  42   cmask2 = 0xe0,
  43   cmask3 = 0xf0,
  44   cmask4 = 0xf8,
  45   cmask5 = 0xfc,
  46   cmask6 = 0xfe,
  47   // cvalN is value of masked first byte of N byte sequence
  48   cval1 = 0x00,
  49   cval2 = 0xc0,
  50   cval3 = 0xe0,
  51   cval4 = 0xf0,
  52   cval5 = 0xf8,
  53   cval6 = 0xfc,
  54   // vmaskN is mask to get value from first byte in N byte sequence
  55   vmask2 = 0x1f,
  56   vmask3 = 0xf,
  57   vmask4 = 0x7,
  58   vmask5 = 0x3,
  59   vmask6 = 0x1,
  60   // minN is minimum legal resulting value for N byte sequence
  61   min2 = 0x80,
  62   min3 = 0x800,
  63   min4 = 0x10000,
  64   min5 = 0x200000,
  65   min6 = 0x4000000,
  66   max6 = 0x7fffffff
  67 };
  68
  69 class UTF8Decoder : public Decoder {
  70 public:
  71   UTF8Decoder();
  72   size_t decode(Char *, const char *, size_t, const char **);
  73 private:
  74   // value for encoding error
  75   enum { invalid = 0xfffd };
  76   Boolean recovering_;
  77 };
  78
  79 class UTF8Encoder : public Encoder {
  80 public:
  81   UTF8Encoder();
  82   void output(const Char *, size_t, streambuf *);
  83 };
  84
  85 Decoder *UTF8CodingSystem::makeDecoder() const
  86 {
  87   return new UTF8Decoder;
  88 }
  89
  90 Encoder *UTF8CodingSystem::makeEncoder() const
  91 {
  92   return new UTF8Encoder;
  93 }
  94
  95
  96 UTF8Decoder::UTF8Decoder()
  97 : recovering_(0)
  98 {
  99 }
 100
 101 size_t UTF8Decoder::decode(Char *to, const char *s,
 102                           size_t slen, const char **result)
 103 {
 104   Char *start = to;
 105   const unsigned char *us = (const unsigned char *)s;
 106   if (recovering_) {
 107     recovering_ = 0;
 108     goto recover;
 109   }
 110   while (slen > 0) {
 111     unsigned c0;
 112     c0 = us[0];
 113     if ((c0 & cmask1) == cval1) {
 114       *to++ = c0;
 115       us++;
 116       slen--;
 117     }
 118     else if ((c0 & cmask2) == cval2) {
 119       if (slen < 2)
 120         goto done;
 121       unsigned c1 = us[1] ^ 0x80;
 122       if (c1 & 0xc0)
 123         goto error;
 124       unsigned c = ((c0 & vmask2) << 6) | c1;
 125       if (c < min2)
 126         c = invalid;
 127       *to++ = c;
 128       slen -= 2;
 129       us += 2;
 130     }
 131     else if ((c0 & cmask3) == cval3) {
 132       if (slen < 3)
 133         goto done;
 134       unsigned c1 = us[1] ^ 0x80;
 135       unsigned c2 = us[2] ^ 0x80;
 136       if ((c1 | c2) & 0xc0)
 137         goto error;
 138       unsigned c = ((((c0 & vmask3) << 6) | c1) << 6) | c2;
 139       if (c < min3)
 140         c = invalid;
 141       *to++ = c;
 142       slen -= 3;
 143       us += 3;
 144     }
 145     else if ((c0 & cmask4) == cval4) {
 146       if (slen < 4)
 147         goto done;
 148       if (charMax < min5 - 1)
 149         *to++ = invalid;
 150       unsigned c1 = us[1] ^ 0x80;
 151       unsigned c2 = us[2] ^ 0x80;
 152       unsigned c3 = us[3] ^ 0x80;
 153       if ((c1 | c2 | c3) & 0xc0)
 154         goto error;
 155       else {
 156         unsigned long c = ((((c0 & vmask4) << 6) | c1) << 6) | c2;
 157         c = (c << 6) | c3;
 158         if (c < min4)
 159           c = invalid;
 160         *to++ = c;
 161       }
 162       slen -= 4;
 163       us += 4;
 164     }
 165     else if ((c0 & cmask5) == cval5) {
 166       if (slen < 5)
 167         goto done;
 168       unsigned c1 = us[1] ^ 0x80;
 169       unsigned c2 = us[2] ^ 0x80;
 170       unsigned c3 = us[3] ^ 0x80;
 171       unsigned c4 = us[4] ^ 0x80;
 172       if ((c1 | c2 | c3 | c4) & 0xc0)
 173         goto error;
 174       if (charMax < min6 - 1)
 175         *to++ = invalid;
 176       else {
 177         unsigned long c = ((((c0 & vmask5) << 6) | c1) << 6) | c2;
 178         c = (((c << 6) | c3) << 6) | c4;
 179         if (c < min5)
 180           c = invalid;
 181         *to++ = c;
 182       }
 183       slen -= 5;
 184       us += 5;
 185     }
 186     else if ((c0 & cmask6) == cval6) {
 187       if (slen < 6)
 188         goto done;
 189       unsigned c1 = us[1] ^ 0x80;
 190       unsigned c2 = us[2] ^ 0x80;
 191       unsigned c3 = us[3] ^ 0x80;
 192       unsigned c4 = us[4] ^ 0x80;
 193       unsigned c5 = us[5] ^ 0x80;
 194       if ((c1 | c2 | c3 | c4 | c5) & 0xc0)
 195         goto error;
 196       if (charMax < max6)
 197         *to++ = invalid;
 198       else {
 199         unsigned long c = ((((c0 & vmask6) << 6) | c1) << 6) | c2;
 200         c = (((((c << 6) | c3) << 6) | c4) << 6) | c5;
 201         if (c < min6)
 202           c = invalid;
 203         *to++ = c;
 204       }
 205       slen -= 6;
 206       us += 6;
 207     }
 208     else {
 209     error:
 210       us++;
 211       slen--;
 212       *to++ = invalid;
 213     recover:
 214       for (;;) {
 215         if (slen == 0) {
 216           recovering_ = 1;
 217           goto done;
 218         }
 219         if ((*us & 0xc0) != 0x80)
 220           break;
 221         us++;
 222         slen--;
 223       }
 224     }
 225   }
 226  done:
 227   *result = (char *)us;
 228   return to - start;
 229 }
 230
 231 UTF8Encoder::UTF8Encoder()
 232 {
 233 }
 234
 235 // FIXME handle errors from streambuf::sputc
 236
 237 void UTF8Encoder::output(const Char *s, size_t n, streambuf *sb)
 238 {
 239   for (; n > 0; s++, n--) {
 240     Char c = *s;
 241     if (c < min2)
 242       sb->sputc(char(c));
 243     else if (c < min3) {
 244       sb->sputc((c >> 6) | cval2);
 245       sb->sputc((c & 0x3f) | 0x80);
 246     }
 247     else if (c < min4) {
 248       sb->sputc((c >> 12) | cval3);
 249       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 250       sb->sputc((c & 0x3f) | 0x80);
 251     }
 252     else if (c < min5) {
 253       sb->sputc((c >> 18) | cval4);
 254       sb->sputc(((c >> 12) & 0x3f) | 0x80);
 255       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 256       sb->sputc((c & 0x3f) | 0x80);
 257     }
 258     else if (c < min6) {
 259       sb->sputc((c >> 24) | cval5);
 260       sb->sputc(((c >> 18) & 0x3f) | 0x80);
 261       sb->sputc(((c >> 12) & 0x3f) | 0x80);
 262       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 263       sb->sputc((c & 0x3f) | 0x80);
 264     }
 265     else if (c <= max6) {
 266       sb->sputc((c >> 30) | cval6);
 267       sb->sputc(((c >> 24) & 0x3f) | 0x80);
 268       sb->sputc(((c >> 18) & 0x3f) | 0x80);
 269       sb->sputc(((c >> 12) & 0x3f) | 0x80);
 270       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 271       sb->sputc((c & 0x3f) | 0x80);
 272     }
 273   }
 274 }
 275 #ifdef SP_NAMESPACE
 276 }
 277 #endif
 278
 279 #else /* not SP_MULTI_BYTE */
 280
 281 #ifndef __GNUG__
// Without SP_MULTI_BYTE none of the code above is compiled; for compilers
// other than g++ this dummy object keeps the translation unit from being
// empty (sigh).
static char non_empty_translation_unit;
 283 #endif
 284
 285 #endif /* not SP_MULTI_BYTE */