cde/programs/nsgmls/UTF8CodingSystem.C

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these libraries and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /* $XConsortium: UTF8CodingSystem.C /main/1 1996/07/29 17:07:15 cde-hp $ */
  24 // Copyright (c) 1994 James Clark
  25 // See the file COPYING for copying permission.
  26
  27 #include "splib.h"
  28
  29 #ifdef SP_MULTI_BYTE
  30
  31 #include "UTF8CodingSystem.h"
  32 #include "constant.h"
  33 #if defined(linux)
  34 #include <iostream>
  35 #else
  36 #include <iostream.h>
  37 #endif
  38
  39 #ifdef SP_NAMESPACE
  40 namespace SP_NAMESPACE {
  41 #endif
  42
  43 enum {
  44   // cmaskN is mask for first byte to test for N byte sequence
  45   cmask1 = 0x80,
  46   cmask2 = 0xe0,
  47   cmask3 = 0xf0,
  48   cmask4 = 0xf8,
  49   cmask5 = 0xfc,
  50   cmask6 = 0xfe,
  51   // cvalN is value of masked first byte of N byte sequence
  52   cval1 = 0x00,
  53   cval2 = 0xc0,
  54   cval3 = 0xe0,
  55   cval4 = 0xf0,
  56   cval5 = 0xf8,
  57   cval6 = 0xfc,
  58   // vmaskN is mask to get value from first byte in N byte sequence
  59   vmask2 = 0x1f,
  60   vmask3 = 0xf,
  61   vmask4 = 0x7,
  62   vmask5 = 0x3,
  63   vmask6 = 0x1,
  64   // minN is minimum legal resulting value for N byte sequence
  65   min2 = 0x80,
  66   min3 = 0x800,
  67   min4 = 0x10000,
  68   min5 = 0x200000,
  69   min6 = 0x4000000,
  70   max6 = 0x7fffffff
  71 };
  72
  73 class UTF8Decoder : public Decoder {
  74 public:
  75   UTF8Decoder();
  76   size_t decode(Char *, const char *, size_t, const char **);
  77 private:
  78   // value for encoding error
  79   enum { invalid = 0xfffd };
  80   Boolean recovering_;
  81 };
  82
  83 class UTF8Encoder : public Encoder {
  84 public:
  85   UTF8Encoder();
  86   void output(const Char *, size_t, streambuf *);
  87 };
  88
  89 Decoder *UTF8CodingSystem::makeDecoder() const
  90 {
  91   return new UTF8Decoder;
  92 }
  93
  94 Encoder *UTF8CodingSystem::makeEncoder() const
  95 {
  96   return new UTF8Encoder;
  97 }
  98
  99
 100 UTF8Decoder::UTF8Decoder()
 101 : recovering_(0)
 102 {
 103 }
 104
 105 size_t UTF8Decoder::decode(Char *to, const char *s,
 106                           size_t slen, const char **result)
 107 {
 108   Char *start = to;
 109   const unsigned char *us = (const unsigned char *)s;
 110   if (recovering_) {
 111     recovering_ = 0;
 112     goto recover;
 113   }
 114   while (slen > 0) {
 115     unsigned c0;
 116     c0 = us[0];
 117     if ((c0 & cmask1) == cval1) {
 118       *to++ = c0;
 119       us++;
 120       slen--;
 121     }
 122     else if ((c0 & cmask2) == cval2) {
 123       if (slen < 2)
 124         goto done;
 125       unsigned c1 = us[1] ^ 0x80;
 126       if (c1 & 0xc0)
 127         goto error;
 128       unsigned c = ((c0 & vmask2) << 6) | c1;
 129       if (c < min2)
 130         c = invalid;
 131       *to++ = c;
 132       slen -= 2;
 133       us += 2;
 134     }
 135     else if ((c0 & cmask3) == cval3) {
 136       if (slen < 3)
 137         goto done;
 138       unsigned c1 = us[1] ^ 0x80;
 139       unsigned c2 = us[2] ^ 0x80;
 140       if ((c1 | c2) & 0xc0)
 141         goto error;
 142       unsigned c = ((((c0 & vmask3) << 6) | c1) << 6) | c2;
 143       if (c < min3)
 144         c = invalid;
 145       *to++ = c;
 146       slen -= 3;
 147       us += 3;
 148     }
 149     else if ((c0 & cmask4) == cval4) {
 150       if (slen < 4)
 151         goto done;
 152       if (charMax < min5 - 1)
 153         *to++ = invalid;
 154       unsigned c1 = us[1] ^ 0x80;
 155       unsigned c2 = us[2] ^ 0x80;
 156       unsigned c3 = us[3] ^ 0x80;
 157       if ((c1 | c2 | c3) & 0xc0)
 158         goto error;
 159       else {
 160         unsigned long c = ((((c0 & vmask4) << 6) | c1) << 6) | c2;
 161         c = (c << 6) | c3;
 162         if (c < min4)
 163           c = invalid;
 164         *to++ = c;
 165       }
 166       slen -= 4;
 167       us += 4;
 168     }
 169     else if ((c0 & cmask5) == cval5) {
 170       if (slen < 5)
 171         goto done;
 172       unsigned c1 = us[1] ^ 0x80;
 173       unsigned c2 = us[2] ^ 0x80;
 174       unsigned c3 = us[3] ^ 0x80;
 175       unsigned c4 = us[4] ^ 0x80;
 176       if ((c1 | c2 | c3 | c4) & 0xc0)
 177         goto error;
 178       if (charMax < min6 - 1)
 179         *to++ = invalid;
 180       else {
 181         unsigned long c = ((((c0 & vmask5) << 6) | c1) << 6) | c2;
 182         c = (((c << 6) | c3) << 6) | c4;
 183         if (c < min5)
 184           c = invalid;
 185         *to++ = c;
 186       }
 187       slen -= 5;
 188       us += 5;
 189     }
 190     else if ((c0 & cmask6) == cval6) {
 191       if (slen < 6)
 192         goto done;
 193       unsigned c1 = us[1] ^ 0x80;
 194       unsigned c2 = us[2] ^ 0x80;
 195       unsigned c3 = us[3] ^ 0x80;
 196       unsigned c4 = us[4] ^ 0x80;
 197       unsigned c5 = us[5] ^ 0x80;
 198       if ((c1 | c2 | c3 | c4 | c5) & 0xc0)
 199         goto error;
 200       if (charMax < max6)
 201         *to++ = invalid;
 202       else {
 203         unsigned long c = ((((c0 & vmask6) << 6) | c1) << 6) | c2;
 204         c = (((((c << 6) | c3) << 6) | c4) << 6) | c5;
 205         if (c < min6)
 206           c = invalid;
 207         *to++ = c;
 208       }
 209       slen -= 6;
 210       us += 6;
 211     }
 212     else {
 213     error:
 214       us++;
 215       slen--;
 216       *to++ = invalid;
 217     recover:
 218       for (;;) {
 219         if (slen == 0) {
 220           recovering_ = 1;
 221           goto done;
 222         }
 223         if ((*us & 0xc0) != 0x80)
 224           break;
 225         us++;
 226         slen--;
 227       }
 228     }
 229   }
 230  done:
 231   *result = (char *)us;
 232   return to - start;
 233 }
 234
 235 UTF8Encoder::UTF8Encoder()
 236 {
 237 }
 238
 239 // FIXME handle errors from streambuf::sputc
 240
 241 void UTF8Encoder::output(const Char *s, size_t n, streambuf *sb)
 242 {
 243   for (; n > 0; s++, n--) {
 244     Char c = *s;
 245     if (c < min2)
 246       sb->sputc(char(c));
 247     else if (c < min3) {
 248       sb->sputc((c >> 6) | cval2);
 249       sb->sputc((c & 0x3f) | 0x80);
 250     }
 251     else if (c < min4) {
 252       sb->sputc((c >> 12) | cval3);
 253       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 254       sb->sputc((c & 0x3f) | 0x80);
 255     }
 256     else if (c < min5) {
 257       sb->sputc((c >> 18) | cval4);
 258       sb->sputc(((c >> 12) & 0x3f) | 0x80);
 259       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 260       sb->sputc((c & 0x3f) | 0x80);
 261     }
 262     else if (c < min6) {
 263       sb->sputc((c >> 24) | cval5);
 264       sb->sputc(((c >> 18) & 0x3f) | 0x80);
 265       sb->sputc(((c >> 12) & 0x3f) | 0x80);
 266       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 267       sb->sputc((c & 0x3f) | 0x80);
 268     }
 269     else if (c <= max6) {
 270       sb->sputc((c >> 30) | cval6);
 271       sb->sputc(((c >> 24) & 0x3f) | 0x80);
 272       sb->sputc(((c >> 18) & 0x3f) | 0x80);
 273       sb->sputc(((c >> 12) & 0x3f) | 0x80);
 274       sb->sputc(((c >> 6) & 0x3f) | 0x80);
 275       sb->sputc((c & 0x3f) | 0x80);
 276     }
 277   }
 278 }
 279 #ifdef SP_NAMESPACE
 280 }
 281 #endif
 282
 283 #else /* not SP_MULTI_BYTE */
 284
 285 #ifndef __GNUG__
  286 static char non_empty_translation_unit; // dummy object compiled only when SP_MULTI_BYTE is off and the compiler is not GNU C++; presumably keeps this file from being an empty translation unit
 287 #endif
 288
 289 #endif /* not SP_MULTI_BYTE */