From 8ac568acb018702b2d8234b74773877e470e4ae5 Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Fri, 12 Nov 2010 22:48:17 +0000 Subject: [PATCH] libs/web: fix sanitize_utf8(), passes all testcases now --- libs/web/src/template_utils.c | 126 ++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 58 deletions(-) diff --git a/libs/web/src/template_utils.c b/libs/web/src/template_utils.c index c560b7a02..f17d3b3e9 100644 --- a/libs/web/src/template_utils.c +++ b/libs/web/src/template_utils.c @@ -181,70 +181,67 @@ static int _validate_utf8(unsigned char **s, int l, struct template_buffer *buf) unsigned char *ptr = *s; unsigned int o = 0, v, n; - //for (o = 0; o < l; o++) + /* ascii byte without null */ + if ((*(ptr+0) >= 0x01) && (*(ptr+0) <= 0x7F)) { - /* ascii byte without null */ - if ((*(ptr+0) >= 0x01) && (*(ptr+0) <= 0x7F)) - { - if (!buf_putchar(buf, *ptr++)) - return 0; + if (!buf_putchar(buf, *ptr++)) + return 0; - o = 1; - } + o = 1; + } - /* multi byte sequence */ - else if ((n = mb_num_chars(*ptr)) > 1) + /* multi byte sequence */ + else if ((n = mb_num_chars(*ptr)) > 1) + { + /* count valid chars */ + for (v = 1; (v <= n) && ((o+v) < l) && mb_is_cont(*(ptr+v)); v++); + + switch (n) { - /* count valid chars */ - for (v = 1; (v <= n) && ((o+v) < l) && mb_is_cont(*(ptr+v)); v++); - - switch (n) - { - case 6: - case 5: - /* five and six byte sequences are always invalid */ + case 6: + case 5: + /* five and six byte sequences are always invalid */ + if (!buf_putchar(buf, '?')) + return 0; + + break; + + default: + /* if the number of valid continuation bytes matches the + * expected number and if the sequence is legal, copy + * the bytes to the destination buffer */ + if ((v == n) && mb_is_shortest(ptr, n) && + !mb_is_surrogate(ptr, n) && !mb_is_illegal(ptr, n)) + { + /* copy sequence */ + if (!buf_append(buf, ptr, n)) + return 0; + } + + /* the found sequence is illegal, skip it */ + else + { + /* invalid sequence */ if (!buf_putchar(buf, '?')) return 0; + } - break; - - default: - /* if the number of valid continuation bytes matches the - * expected number and if the sequence is legal, copy - * the bytes to the destination buffer */ - if ((v == n) && mb_is_shortest(ptr, n) && - !mb_is_surrogate(ptr, n) && !mb_is_illegal(ptr, n)) - { - /* copy sequence */ - if (!buf_append(buf, ptr, n)) - return 0; - } - - /* the found sequence is illegal, skip it */ - else - { - /* invalid sequence */ - if (!buf_putchar(buf, '?')) - return 0; - } - - break; - } - - /* advance beyound the last found valid continuation char */ - o = v; - ptr += v; + break; } - /* invalid byte (0x00) */ - else - { - if (!buf_putchar(buf, '?')) /* or 0xEF, 0xBF, 0xBD */ - return 0; + /* advance beyound the last found valid continuation char */ + o = v; + ptr += v; + } - o = 1; - ptr++; - } + /* invalid byte (0x00) */ + else + { + if (!buf_putchar(buf, '?')) /* or 0xEF, 0xBF, 0xBD */ + return 0; + + o = 1; + ptr++; } *s = ptr; @@ -256,15 +253,28 @@ char * sanitize_utf8(const char *s, unsigned int l) { struct template_buffer *buf = buf_init(); unsigned char *ptr = (unsigned char *)s; + unsigned int v, o; if (!buf) return NULL; - if (!_validate_utf8(&ptr, l, buf)) + for (o = 0; o < l; o++) { - free(buf->data); - free(buf); - return NULL; + /* ascii char */ + if ((*ptr >= 0x01) && (*ptr <= 0x7F)) + { + if (!buf_putchar(buf, *ptr++)) + break; + } + + /* invalid byte or multi byte sequence */ + else + { + if (!(v = _validate_utf8(&ptr, l - o, buf))) + break; + + o += (v - 1); + } } return buf_destroy(buf); -- 2.25.1