implement POSIX regexp support
[oweals/jsonpath.git] / lexer.c
1 /*
2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16
17 #include <stdbool.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <ctype.h>
21 #include <regex.h>
22
23 #include "ast.h"
24 #include "lexer.h"
25 #include "parser.h"
26
27
28 struct token {
29         int type;
30         const char *pat;
31         int plen;
32         int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
33 };
34
35 #define dec(o) \
36         ((o) - '0')
37
38 #define hex(x) \
39         (((x) >= 'a') ? (10 + (x) - 'a') : \
40                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
41
42 /*
43  * Stores the given codepoint as a utf8 multibyte sequence into the given
44  * output buffer and substracts the required amount of bytes from  the given
45  * length pointer.
46  *
47  * Returns false if the multibyte sequence would not fit into the buffer,
48  * otherwise true.
49  */
50
51 static bool
52 utf8enc(char **out, int *rem, int code)
53 {
54         if (code > 0 && code <= 0x7F)
55         {
56                 if (*rem < 1)
57                         return false;
58
59                 *(*out)++ = code; (*rem)--;
60                 return true;
61         }
62         else if (code > 0 && code <= 0x7FF)
63         {
64                 if (*rem < 2)
65                         return false;
66
67                 *(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
68                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
69                 return true;
70         }
71         else if (code > 0 && code <= 0xFFFF)
72         {
73                 if (*rem < 3)
74                         return false;
75
76                 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
77                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
78                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
79                 return true;
80         }
81         else if (code > 0 && code <= 0x10FFFF)
82         {
83                 if (*rem < 4)
84                         return false;
85
86                 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
87                 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
88                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
89                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
90                 return true;
91         }
92
93         return true;
94 }
95
96
97 /*
98  * Parses a string literal from the given buffer.
99  *
100  * Returns a negative value on error, otherwise the amount of consumed
101  * characters from the given buffer.
102  *
103  * Error values:
104  *  -1  Unterminated string
105  *  -2  Invalid escape sequence
106  *  -3  String literal too long
107  */
108
109 static int
110 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
111 {
112         char q = *(buf++);
113         char str[128] = { 0 };
114         char *out = str;
115         const char *in = buf;
116         bool esc = false;
117         int rem = sizeof(str) - 1;
118         int code;
119
120         while (*in)
121         {
122                 /* continuation of escape sequence */
123                 if (esc)
124                 {
125                         /* \uFFFF */
126                         if (in[0] == 'u')
127                         {
128                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
129                                     isxdigit(in[3]) && isxdigit(in[4]))
130                                 {
131                                         if (!utf8enc(&out, &rem,
132                                                      hex(in[1]) * 16 * 16 * 16 +
133                                                      hex(in[2]) * 16 * 16 +
134                                                      hex(in[3]) * 16 +
135                                                      hex(in[4])))
136                                         {
137                                                 s->error_pos = s->off + (in - buf);
138                                                 return -3;
139                                         }
140
141                                         in += 5;
142                                 }
143                                 else
144                                 {
145                                         s->error_pos = s->off + (in - buf);
146                                         return -2;
147                                 }
148                         }
149
150                         /* \xFF */
151                         else if (in[0] == 'x')
152                         {
153                                 if (isxdigit(in[1]) && isxdigit(in[2]))
154                                 {
155                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
156                                         {
157                                                 s->error_pos = s->off + (in - buf);
158                                                 return -3;
159                                         }
160
161                                         in += 3;
162                                 }
163                                 else
164                                 {
165                                         s->error_pos = s->off + (in - buf);
166                                         return -2;
167                                 }
168                         }
169
170                         /* \377, \77 or \7 */
171                         else if (in[0] >= '0' && in[0] <= '7')
172                         {
173                                 /* \377 */
174                                 if (in[1] >= '0' && in[1] <= '7' &&
175                                     in[2] >= '0' && in[2] <= '7')
176                                 {
177                                         code = dec(in[0]) * 8 * 8 +
178                                                dec(in[1]) * 8 +
179                                                dec(in[2]);
180
181                                         if (code > 255)
182                                         {
183                                                 s->error_pos = s->off + (in - buf);
184                                                 return -2;
185                                         }
186
187                                         if (!utf8enc(&out, &rem, code))
188                                         {
189                                                 s->error_pos = s->off + (in - buf);
190                                                 return -3;
191                                         }
192
193                                         in += 3;
194                                 }
195
196                                 /* \77 */
197                                 else if (in[1] >= '0' && in[1] <= '7')
198                                 {
199                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
200                                         {
201                                                 s->error_pos = s->off + (in - buf);
202                                                 return -3;
203                                         }
204
205                                         in += 2;
206                                 }
207
208                                 /* \7 */
209                                 else
210                                 {
211                                         if (!utf8enc(&out, &rem, dec(in[0])))
212                                         {
213                                                 s->error_pos = s->off + (in - buf);
214                                                 return -3;
215                                         }
216
217                                         in += 1;
218                                 }
219                         }
220
221                         /* single character escape */
222                         else
223                         {
224                                 if (rem-- < 1)
225                                 {
226                                         s->error_pos = s->off + (in - buf);
227                                         return -3;
228                                 }
229
230                                 switch (in[0])
231                                 {
232                                 case 'a': *out = '\a'; break;
233                                 case 'b': *out = '\b'; break;
234                                 case 'e': *out = '\e'; break;
235                                 case 'f': *out = '\f'; break;
236                                 case 'n': *out = '\n'; break;
237                                 case 'r': *out = '\r'; break;
238                                 case 't': *out = '\t'; break;
239                                 case 'v': *out = '\v'; break;
240                                 default:
241                                         /* in regexp mode, retain backslash */
242                                         if (q == '/')
243                                         {
244                                                 if (rem-- < 1)
245                                                 {
246                                                         s->error_pos = s->off + (in - buf);
247                                                         return -3;
248                                                 }
249
250                                                 *out++ = '\\';
251                                         }
252
253                                         *out = *in;
254                                         break;
255                                 }
256
257                                 in++;
258                                 out++;
259                         }
260
261                         esc = false;
262                 }
263
264                 /* begin of escape sequence */
265                 else if (*in == '\\')
266                 {
267                         in++;
268                         esc = true;
269                 }
270
271                 /* terminating quote */
272                 else if (*in == q)
273                 {
274                         op->str = strdup(str);
275                         return (in - buf) + 2;
276                 }
277
278                 /* ordinary char */
279                 else
280                 {
281                         if (rem-- < 1)
282                         {
283                                 s->error_pos = s->off + (in - buf);
284                                 return -3;
285                         }
286
287                         *out++ = *in++;
288                 }
289         }
290
291         return -1;
292 }
293
294
295 /*
296  * Parses a regexp literal from the given buffer.
297  *
298  * Returns a negative value on error, otherwise the amount of consumed
299  * characters from the given buffer.
300  *
301  * Error values:
302  *  -1  Unterminated regexp
303  *  -2  Invalid escape sequence
304  *  -3  Regexp literal too long
305  */
306
307 static int
308 parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
309 {
310         int len = parse_string(buf, op, s);
311         const char *p;
312
313         if (len >= 2)
314         {
315                 op->num = REG_NOSUB | REG_NEWLINE;
316
317                 for (p = buf + len; p; p++)
318                 {
319                         switch (*p)
320                         {
321                         case 'e':
322                                 op->num |= REG_EXTENDED;
323                                 len++;
324                                 break;
325
326                         case 'i':
327                                 op->num |= REG_ICASE;
328                                 len++;
329                                 break;
330
331                         case 's':
332                                 op->num &= ~REG_NEWLINE;
333                                 len++;
334                                 break;
335
336                         default:
337                                 return len;
338                         }
339                 }
340
341         }
342
343         return len;
344 }
345
346
347 /*
348  * Parses a label from the given buffer.
349  *
350  * Returns a negative value on error, otherwise the amount of consumed
351  * characters from the given buffer.
352  *
353  * Error values:
354  *  -3  Label too long
355  */
356
357 static int
358 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
359 {
360         char str[128] = { 0 };
361         char *out = str;
362         const char *in = buf;
363         int rem = sizeof(str) - 1;
364
365         while (*in == '_' || isalnum(*in))
366         {
367                 if (rem-- < 1)
368                 {
369                         s->error_pos = s->off + (in - buf);
370                         return -3;
371                 }
372
373                 *out++ = *in++;
374         }
375
376         if (!strcmp(str, "true") || !strcmp(str, "false"))
377         {
378                 op->num = (str[0] == 't');
379                 op->type = T_BOOL;
380         }
381         else
382         {
383                 op->str = strdup(str);
384         }
385
386         return (in - buf);
387 }
388
389
390 /*
391  * Parses a number literal from the given buffer.
392  *
393  * Returns a negative value on error, otherwise the amount of consumed
394  * characters from the given buffer.
395  *
396  * Error values:
397  *  -2  Invalid number character
398  */
399
400 static int
401 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
402 {
403         char *e;
404         int n = strtol(buf, &e, 10);
405
406         if (e == buf)
407         {
408                 s->error_pos = s->off;
409                 return -2;
410         }
411
412         op->num = n;
413
414         return (e - buf);
415 }
416
417 static const struct token tokens[] = {
418         { 0,                    " ",     1 },
419         { 0,                    "\t",    1 },
420         { 0,                    "\n",    1 },
421         { T_LE,                 "<=",    2 },
422         { T_GE,                 ">=",    2 },
423         { T_NE,                 "!=",    2 },
424         { T_AND,                "&&",    2 },
425         { T_OR,                 "||",    2 },
426         { T_DOT,                ".",     1 },
427         { T_BROPEN,             "[",     1 },
428         { T_BRCLOSE,    "]",     1 },
429         { T_POPEN,              "(",     1 },
430         { T_PCLOSE,             ")",     1 },
431         { T_UNION,              ",",     1 },
432         { T_ROOT,               "$",     1 },
433         { T_THIS,               "@",     1 },
434         { T_LT,                 "<",     1 },
435         { T_GT,                 ">",     1 },
436         { T_EQ,                 "=",     1 },
437         { T_MATCH,              "~",     1 },
438         { T_NOT,                "!",     1 },
439         { T_WILDCARD,   "*",     1 },
440         { T_REGEXP,             "/",     1, parse_regexp },
441         { T_STRING,             "'",     1, parse_string },
442         { T_STRING,             "\"",    1, parse_string },
443         { T_LABEL,              "_",     1, parse_label  },
444         { T_LABEL,              "az",    0, parse_label  },
445         { T_LABEL,              "AZ",    0, parse_label  },
446         { T_NUMBER,             "-",     1, parse_number },
447         { T_NUMBER,             "09",    0, parse_number },
448 };
449
450 const char *tokennames[25] = {
451         [0]                             = "End of file",
452         [T_AND]                 = "'&&'",
453         [T_OR]                  = "'||'",
454         [T_UNION]               = "','",
455         [T_EQ]                  = "'='",
456         [T_NE]                  = "'!='",
457         [T_GT]                  = "'>'",
458         [T_GE]                  = "'>='",
459         [T_LT]                  = "'<'",
460         [T_LE]                  = "'<='",
461         [T_MATCH]       = "'~'",
462         [T_NOT]                 = "'!'",
463         [T_LABEL]               = "Label",
464         [T_ROOT]                = "'$'",
465         [T_THIS]                = "'@'",
466         [T_DOT]                 = "'.'",
467         [T_WILDCARD]    = "'*'",
468         [T_REGEXP]      = "/.../",
469         [T_BROPEN]              = "'['",
470         [T_BRCLOSE]             = "']'",
471         [T_BOOL]                = "Bool",
472         [T_NUMBER]              = "Number",
473         [T_STRING]              = "String",
474         [T_POPEN]               = "'('",
475         [T_PCLOSE]              = "')'",
476 };
477
478
479 static int
480 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
481 {
482         int i;
483         const struct token *tok;
484
485         for (i = 0, tok = &tokens[0];
486              i < sizeof(tokens) / sizeof(tokens[0]);
487                  i++, tok = &tokens[i])
488         {
489                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
490                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
491                 {
492                         op->type = tok->type;
493
494                         if (tok->parse)
495                                 return tok->parse(ptr, op, s);
496
497                         return tok->plen;
498                 }
499         }
500
501         s->error_pos = s->off;
502         return -4;
503 }
504
505 struct jp_opcode *
506 jp_get_token(struct jp_state *s, const char *input, int *mlen)
507 {
508         struct jp_opcode op = { 0 };
509
510         *mlen = match_token(input, &op, s);
511
512         if (*mlen < 0)
513         {
514                 s->error_code = *mlen;
515                 return NULL;
516         }
517         else if (op.type == 0)
518         {
519                 return NULL;
520         }
521
522         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
523 }