ca5880e15fedcb6ad46731562800facb67db2aa7
[oweals/jsonpath.git] / lexer.c
1 /*
2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
21
22 #include "ast.h"
23 #include "lexer.h"
24 #include "parser.h"
25
26
/*
 * One entry of the lexer's token table.
 *
 * The member order is significant: the table is filled with positional
 * initializers.
 */
struct token {
	/* value stored into the opcode's type field when the entry matches */
	int type;

	/*
	 * With plen > 0: literal text compared against the input.
	 * With plen == 0: two characters giving the inclusive lower and upper
	 * bound of a character range the next input character must fall into.
	 */
	const char *pat;

	/* number of bytes of pat to compare, or 0 to select range matching */
	int plen;

	/*
	 * Optional callback invoked on a match; its return value becomes the
	 * match result. When NULL, plen is used as the match result instead.
	 */
	int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
};
33
/*
 * Numeric value of an ASCII digit character ('0' maps to 0).
 */
#define dec(o) \
	((o) - '0')

/*
 * Numeric value of an ASCII hexadecimal digit character. The argument is
 * evaluated more than once and is not validated; callers are expected to
 * have checked it beforehand.
 */
#define hex(x) \
	(((x) < 'A') ? dec(x) : \
		(((x) < 'a') ? (10 + (x) - 'A') : (10 + (x) - 'a')))
40
/*
 * Encodes the given code point as a UTF-8 multibyte sequence.
 *
 * The bytes are written at *out; *out is then advanced past them and *rem,
 * the number of bytes still free in the output buffer, is reduced by the
 * same amount.
 *
 * Code points that are zero, negative or above 0x10FFFF are ignored: nothing
 * is written and true is returned.
 *
 * Returns false, leaving *out and *rem untouched, if the sequence would not
 * fit into the remaining space, otherwise true.
 */

static bool
utf8enc(char **out, int *rem, int code)
{
	/* indexed by sequence length: marker bits and payload mask of byte 1 */
	static const int lead_bits[] = { 0, 0x00, 0xC0, 0xE0, 0xF0 };
	static const int lead_mask[] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };
	char *dst;
	int len, shift;

	if (code <= 0 || code > 0x10FFFF)
		return true;

	if (code <= 0x7F)
		len = 1;
	else if (code <= 0x7FF)
		len = 2;
	else if (code <= 0xFFFF)
		len = 3;
	else
		len = 4;

	if (*rem < len)
		return false;

	dst = *out;
	shift = 6 * (len - 1);

	/* leading byte carries the topmost payload bits */
	*dst++ = ((code >> shift) & lead_mask[len]) | lead_bits[len];

	/* each continuation byte carries the next six bits */
	while (shift > 0)
	{
		shift -= 6;
		*dst++ = ((code >> shift) & 0x3F) | 0x80;
	}

	*out = dst;
	*rem -= len;

	return true;
}
94
95
96 /*
97  * Parses a string literal from the given buffer.
98  *
99  * Returns a negative value on error, otherwise the amount of consumed
100  * characters from the given buffer.
101  *
102  * Error values:
103  *  -1  Unterminated string
104  *  -2  Invalid escape sequence
105  *  -3  String literal too long
106  */
107
108 static int
109 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
110 {
111         char q = *(buf++);
112         char str[128] = { 0 };
113         char *out = str;
114         const char *in = buf;
115         bool esc = false;
116         int rem = sizeof(str) - 1;
117         int code;
118
119         while (*in)
120         {
121                 /* continuation of escape sequence */
122                 if (esc)
123                 {
124                         /* \uFFFF */
125                         if (in[0] == 'u')
126                         {
127                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
128                                     isxdigit(in[3]) && isxdigit(in[4]))
129                                 {
130                                         if (!utf8enc(&out, &rem,
131                                                      hex(in[1]) * 16 * 16 * 16 +
132                                                      hex(in[2]) * 16 * 16 +
133                                                      hex(in[3]) * 16 +
134                                                      hex(in[4])))
135                                         {
136                                                 s->error_pos = s->off + (in - buf);
137                                                 return -3;
138                                         }
139
140                                         in += 5;
141                                 }
142                                 else
143                                 {
144                                         s->error_pos = s->off + (in - buf);
145                                         return -2;
146                                 }
147                         }
148
149                         /* \xFF */
150                         else if (in[0] == 'x')
151                         {
152                                 if (isxdigit(in[1]) && isxdigit(in[2]))
153                                 {
154                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
155                                         {
156                                                 s->error_pos = s->off + (in - buf);
157                                                 return -3;
158                                         }
159
160                                         in += 3;
161                                 }
162                                 else
163                                 {
164                                         s->error_pos = s->off + (in - buf);
165                                         return -2;
166                                 }
167                         }
168
169                         /* \377, \77 or \7 */
170                         else if (in[0] >= '0' && in[0] <= '7')
171                         {
172                                 /* \377 */
173                                 if (in[1] >= '0' && in[1] <= '7' &&
174                                     in[2] >= '0' && in[2] <= '7')
175                                 {
176                                         code = dec(in[0]) * 8 * 8 +
177                                                dec(in[1]) * 8 +
178                                                dec(in[2]);
179
180                                         if (code > 255)
181                                         {
182                                                 s->error_pos = s->off + (in - buf);
183                                                 return -2;
184                                         }
185
186                                         if (!utf8enc(&out, &rem, code))
187                                         {
188                                                 s->error_pos = s->off + (in - buf);
189                                                 return -3;
190                                         }
191
192                                         in += 3;
193                                 }
194
195                                 /* \77 */
196                                 else if (in[1] >= '0' && in[1] <= '7')
197                                 {
198                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
199                                         {
200                                                 s->error_pos = s->off + (in - buf);
201                                                 return -3;
202                                         }
203
204                                         in += 2;
205                                 }
206
207                                 /* \7 */
208                                 else
209                                 {
210                                         if (!utf8enc(&out, &rem, dec(in[0])))
211                                         {
212                                                 s->error_pos = s->off + (in - buf);
213                                                 return -3;
214                                         }
215
216                                         in += 1;
217                                 }
218                         }
219
220                         /* single character escape */
221                         else
222                         {
223                                 if (rem-- < 1)
224                                 {
225                                         s->error_pos = s->off + (in - buf);
226                                         return -3;
227                                 }
228
229                                 switch (in[0])
230                                 {
231                                 case 'a': *out = '\a'; break;
232                                 case 'b': *out = '\b'; break;
233                                 case 'e': *out = '\e'; break;
234                                 case 'f': *out = '\f'; break;
235                                 case 'n': *out = '\n'; break;
236                                 case 'r': *out = '\r'; break;
237                                 case 't': *out = '\t'; break;
238                                 case 'v': *out = '\v'; break;
239                                 default:  *out = *in; break;
240                                 }
241
242                                 in++;
243                                 out++;
244                         }
245
246                         esc = false;
247                 }
248
249                 /* begin of escape sequence */
250                 else if (*in == '\\')
251                 {
252                         in++;
253                         esc = true;
254                 }
255
256                 /* terminating quote */
257                 else if (*in == q)
258                 {
259                         op->str = strdup(str);
260                         return (in - buf) + 2;
261                 }
262
263                 /* ordinary char */
264                 else
265                 {
266                         if (rem-- < 1)
267                         {
268                                 s->error_pos = s->off + (in - buf);
269                                 return -3;
270                         }
271
272                         *out++ = *in++;
273                 }
274         }
275
276         return -1;
277 }
278
279
280 /*
281  * Parses a label from the given buffer.
282  *
283  * Returns a negative value on error, otherwise the amount of consumed
284  * characters from the given buffer.
285  *
286  * Error values:
287  *  -3  Label too long
288  */
289
290 static int
291 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
292 {
293         char str[128] = { 0 };
294         char *out = str;
295         const char *in = buf;
296         int rem = sizeof(str) - 1;
297
298         while (*in == '_' || isalnum(*in))
299         {
300                 if (rem-- < 1)
301                 {
302                         s->error_pos = s->off + (in - buf);
303                         return -3;
304                 }
305
306                 *out++ = *in++;
307         }
308
309         if (!strcmp(str, "true") || !strcmp(str, "false"))
310         {
311                 op->num = (str[0] == 't');
312                 op->type = T_BOOL;
313         }
314         else
315         {
316                 op->str = strdup(str);
317         }
318
319         return (in - buf);
320 }
321
322
323 /*
324  * Parses a number literal from the given buffer.
325  *
326  * Returns a negative value on error, otherwise the amount of consumed
327  * characters from the given buffer.
328  *
329  * Error values:
330  *  -2  Invalid number character
331  */
332
333 static int
334 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
335 {
336         char *e;
337         int n = strtol(buf, &e, 10);
338
339         if (e == buf)
340         {
341                 s->error_pos = s->off;
342                 return -2;
343         }
344
345         op->num = n;
346
347         return (e - buf);
348 }
349
350 static const struct token tokens[] = {
351         { 0,                    " ",     1 },
352         { 0,                    "\t",    1 },
353         { 0,                    "\n",    1 },
354         { T_LE,                 "<=",    2 },
355         { T_GE,                 ">=",    2 },
356         { T_NE,                 "!=",    2 },
357         { T_AND,                "&&",    2 },
358         { T_OR,                 "||",    2 },
359         { T_DOT,                ".",     1 },
360         { T_BROPEN,             "[",     1 },
361         { T_BRCLOSE,    "]",     1 },
362         { T_POPEN,              "(",     1 },
363         { T_PCLOSE,             ")",     1 },
364         { T_UNION,              ",",     1 },
365         { T_ROOT,               "$",     1 },
366         { T_THIS,               "@",     1 },
367         { T_LT,                 "<",     1 },
368         { T_GT,                 ">",     1 },
369         { T_EQ,                 "=",     1 },
370         { T_NOT,                "!",     1 },
371         { T_WILDCARD,   "*",     1 },
372         { T_STRING,             "'",     1, parse_string },
373         { T_STRING,             "\"",    1, parse_string },
374         { T_LABEL,              "_",     1, parse_label  },
375         { T_LABEL,              "az",    0, parse_label  },
376         { T_LABEL,              "AZ",    0, parse_label  },
377         { T_NUMBER,             "-",     1, parse_number },
378         { T_NUMBER,             "09",    0, parse_number },
379 };
380
381 const char *tokennames[23] = {
382         [0]                             = "End of file",
383         [T_AND]                 = "'&&'",
384         [T_OR]                  = "'||'",
385         [T_UNION]               = "','",
386         [T_EQ]                  = "'='",
387         [T_NE]                  = "'!='",
388         [T_GT]                  = "'>'",
389         [T_GE]                  = "'>='",
390         [T_LT]                  = "'<'",
391         [T_LE]                  = "'<='",
392         [T_NOT]                 = "'!'",
393         [T_LABEL]               = "Label",
394         [T_ROOT]                = "'$'",
395         [T_THIS]                = "'@'",
396         [T_DOT]                 = "'.'",
397         [T_WILDCARD]    = "'*'",
398         [T_BROPEN]              = "'['",
399         [T_BRCLOSE]             = "']'",
400         [T_BOOL]                = "Bool",
401         [T_NUMBER]              = "Number",
402         [T_STRING]              = "String",
403         [T_POPEN]               = "'('",
404         [T_PCLOSE]              = "')'",
405 };
406
407
408 static int
409 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
410 {
411         int i;
412         const struct token *tok;
413
414         for (i = 0, tok = &tokens[0];
415              i < sizeof(tokens) / sizeof(tokens[0]);
416                  i++, tok = &tokens[i])
417         {
418                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
419                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
420                 {
421                         op->type = tok->type;
422
423                         if (tok->parse)
424                                 return tok->parse(ptr, op, s);
425
426                         return tok->plen;
427                 }
428         }
429
430         s->error_pos = s->off;
431         return -4;
432 }
433
434 struct jp_opcode *
435 jp_get_token(struct jp_state *s, const char *input, int *mlen)
436 {
437         struct jp_opcode op = { 0 };
438
439         *mlen = match_token(input, &op, s);
440
441         if (*mlen < 0)
442         {
443                 s->error_code = *mlen;
444                 return NULL;
445         }
446         else if (op.type == 0)
447         {
448                 return NULL;
449         }
450
451         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
452 }