3703d56d7bc4500ec07213a2d140442c21e5ff52
[oweals/jsonpath.git] / lexer.c
1 /*
2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jow@openwrt.org>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
21
22 #include "ast.h"
23 #include "lexer.h"
24 #include "parser.h"
25
26
/*
 * Describes one entry of the lexer's token table.
 *
 * An entry matches either a literal prefix (plen > 0, the first plen bytes
 * of pat are compared) or a single character within an inclusive range
 * (plen == 0, pat[0] is the lower and pat[1] the upper bound).
 */
struct token {
	/* token type stored into the opcode when this entry matches */
	int type;

	/* literal prefix or two character range, depending on plen */
	const char *pat;

	/* length of the literal prefix, 0 selects range matching */
	int plen;

	/*
	 * Optional callback consuming the token text; when set, its return
	 * value is used as the match result instead of plen.
	 */
	int (*parse)(const char *buf, struct jp_opcode *op);
};
33
/* Numeric value of the decimal (or octal) digit character o. */
#define dec(o) ((o) - '0')

/*
 * Numeric value of the hexadecimal digit character x. The argument is
 * evaluated more than once and is expected to be a valid hex digit already.
 */
#define hex(x) \
	(((x) >= 'a') ? ((x) - 'a' + 10) : \
	 ((x) >= 'A') ? ((x) - 'A' + 10) : \
	                dec(x))
40
/*
 * Stores the given codepoint as a utf8 multibyte sequence into the given
 * output buffer, advances the output pointer past the written bytes and
 * subtracts the required amount of bytes from the given length pointer.
 *
 * Codepoints outside of the range 1..0x10FFFF are skipped: nothing is
 * written and true is returned.
 *
 * Returns false if the multibyte sequence would not fit into the buffer,
 * otherwise true. On failure neither *out nor *rem is modified.
 */

static bool
utf8enc(char **out, int *rem, int code)
{
	char *p = *out;
	int len;

	if (code <= 0 || code > 0x10FFFF)
		return true;

	if (code <= 0x7F)
		len = 1;
	else if (code <= 0x7FF)
		len = 2;
	else if (code <= 0xFFFF)
		len = 3;
	else
		len = 4;

	if (*rem < len)
		return false;

	switch (len)
	{
	case 1:
		*p++ = code;
		break;

	case 2:
		*p++ = ((code >>  6) & 0x1F) | 0xC0;
		*p++ = ( code        & 0x3F) | 0x80;
		break;

	case 3:
		*p++ = ((code >> 12) & 0x0F) | 0xE0;
		*p++ = ((code >>  6) & 0x3F) | 0x80;
		*p++ = ( code        & 0x3F) | 0x80;
		break;

	default:
		*p++ = ((code >> 18) & 0x07) | 0xF0;
		*p++ = ((code >> 12) & 0x3F) | 0x80;
		*p++ = ((code >>  6) & 0x3F) | 0x80;
		*p++ = ( code        & 0x3F) | 0x80;
		break;
	}

	/*
	 * Publish the advanced write position to the caller; the one byte
	 * case must do this too, otherwise the next byte overwrites it.
	 */
	*out = p;
	*rem -= len;

	return true;
}
94
95
96 /*
97  * Parses a string literal from the given buffer.
98  *
99  * Returns a negative value on error, otherwise the amount of consumed
100  * characters from the given buffer.
101  *
102  * Error values:
103  *  -1  Unterminated string
104  *  -2  Invalid escape sequence
105  *  -3  String literal too long
106  */
107
108 static int
109 parse_string(const char *buf, struct jp_opcode *op)
110 {
111         char q = *(buf++);
112         char str[128] = { 0 };
113         char *out = str;
114         const char *in = buf;
115         bool esc = false;
116         int rem = sizeof(str) - 1;
117         int code;
118
119         while (*in)
120         {
121                 /* continuation of escape sequence */
122                 if (esc)
123                 {
124                         /* \uFFFF */
125                         if (in[0] == 'u')
126                         {
127                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
128                                     isxdigit(in[3]) && isxdigit(in[4]))
129                                 {
130                                         if (!utf8enc(&out, &rem,
131                                                      hex(in[1]) * 16 * 16 * 16 +
132                                                      hex(in[2]) * 16 * 16 +
133                                                      hex(in[3]) * 16 +
134                                                      hex(in[4])))
135                                                 return -3;
136
137                                         in += 5;
138                                 }
139                                 else
140                                 {
141                                         return -2;
142                                 }
143                         }
144
145                         /* \xFF */
146                         else if (in[0] == 'x')
147                         {
148                                 if (isxdigit(in[1]) && isxdigit(in[2]))
149                                 {
150                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
151                                                 return -3;
152
153                                         in += 3;
154                                 }
155                                 else
156                                 {
157                                         return -2;
158                                 }
159                         }
160
161                         /* \377, \77 or \7 */
162                         else if (in[0] >= '0' && in[0] <= '7')
163                         {
164                                 /* \377 */
165                                 if (in[1] >= '0' && in[1] <= '7' &&
166                                     in[2] >= '0' && in[2] <= '7')
167                                 {
168                                         code = dec(in[0]) * 8 * 8 +
169                                                dec(in[1]) * 8 +
170                                                dec(in[2]);
171
172                                         if (code > 255)
173                                                 return -2;
174
175                                         if (!utf8enc(&out, &rem, code))
176                                                 return -3;
177
178                                         in += 3;
179                                 }
180
181                                 /* \77 */
182                                 else if (in[1] >= '0' && in[1] <= '7')
183                                 {
184                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
185                                                 return -3;
186
187                                         in += 2;
188                                 }
189
190                                 /* \7 */
191                                 else
192                                 {
193                                         if (!utf8enc(&out, &rem, dec(in[0])))
194                                                 return -3;
195
196                                         in += 1;
197                                 }
198                         }
199
200                         /* single character escape */
201                         else
202                         {
203                                 if (rem-- < 1)
204                                         return -3;
205
206                                 switch (in[0])
207                                 {
208                                 case 'a': *out = '\a'; break;
209                                 case 'b': *out = '\b'; break;
210                                 case 'e': *out = '\e'; break;
211                                 case 'f': *out = '\f'; break;
212                                 case 'n': *out = '\n'; break;
213                                 case 'r': *out = '\r'; break;
214                                 case 't': *out = '\t'; break;
215                                 case 'v': *out = '\v'; break;
216                                 default:  *out = *in; break;
217                                 }
218
219                                 in++;
220                                 out++;
221                         }
222
223                         esc = false;
224                 }
225
226                 /* begin of escape sequence */
227                 else if (*in == '\\')
228                 {
229                         in++;
230                         esc = true;
231                 }
232
233                 /* terminating quote */
234                 else if (*in == q)
235                 {
236                         op->str = strdup(str);
237                         return (in - buf) + 2;
238                 }
239
240                 /* ordinary char */
241                 else
242                 {
243                         if (rem-- < 1)
244                                 return -3;
245
246                         *out++ = *in++;
247                 }
248         }
249
250         return -1;
251 }
252
253
254 /*
255  * Parses a label from the given buffer.
256  *
257  * Returns a negative value on error, otherwise the amount of consumed
258  * characters from the given buffer.
259  *
260  * Error values:
261  *  -3  Label too long
262  */
263
264 static int
265 parse_label(const char *buf, struct jp_opcode *op)
266 {
267         char str[128] = { 0 };
268         char *out = str;
269         const char *in = buf;
270         int rem = sizeof(str) - 1;
271
272         while (*in == '_' || isalnum(*in))
273         {
274                 if (rem-- < 1)
275                         return -3;
276
277                 *out++ = *in++;
278         }
279
280         if (!strcmp(str, "true") || !strcmp(str, "false"))
281         {
282                 op->num = (str[0] == 't');
283                 op->type = T_BOOL;
284         }
285         else
286         {
287                 op->str = strdup(str);
288         }
289
290         return (in - buf);
291 }
292
293
294 /*
295  * Parses a number literal from the given buffer.
296  *
297  * Returns a negative value on error, otherwise the amount of consumed
298  * characters from the given buffer.
299  *
300  * Error values:
301  *  -2  Invalid number character
302  */
303
304 static int
305 parse_number(const char *buf, struct jp_opcode *op)
306 {
307         char *e;
308         int n = strtol(buf, &e, 10);
309
310         if (e == buf)
311                 return -2;
312
313         op->num = n;
314
315         return (e - buf);
316 }
317
318 static const struct token tokens[] = {
319         { 0,                    " ",     1 },
320         { 0,                    "\t",    1 },
321         { 0,                    "\n",    1 },
322         { T_LE,                 "<=",    2 },
323         { T_GE,                 ">=",    2 },
324         { T_NE,                 "!=",    2 },
325         { T_AND,                "&&",    2 },
326         { T_OR,                 "||",    2 },
327         { T_DOT,                ".",     1 },
328         { T_BROPEN,             "[",     1 },
329         { T_BRCLOSE,    "]",     1 },
330         { T_POPEN,              "(",     1 },
331         { T_PCLOSE,             ")",     1 },
332         { T_UNION,              ",",     1 },
333         { T_ROOT,               "$",     1 },
334         { T_THIS,               "@",     1 },
335         { T_LT,                 "<",     1 },
336         { T_GT,                 ">",     1 },
337         { T_EQ,                 "=",     1 },
338         { T_NOT,                "!",     1 },
339         { T_WILDCARD,   "*",     1 },
340         { T_STRING,             "'",     1, parse_string },
341         { T_STRING,             "\"",    1, parse_string },
342         { T_LABEL,              "_",     1, parse_label  },
343         { T_LABEL,              "az",    0, parse_label  },
344         { T_LABEL,              "AZ",    0, parse_label  },
345         { T_NUMBER,             "-",     1, parse_number },
346         { T_NUMBER,             "09",    0, parse_number },
347 };
348
349 const char *tokennames[23] = {
350         [0]                             = "End of file",
351         [T_AND]                 = "'&&'",
352         [T_OR]                  = "'||'",
353         [T_UNION]               = "','",
354         [T_EQ]                  = "'='",
355         [T_NE]                  = "'!='",
356         [T_GT]                  = "'>'",
357         [T_GE]                  = "'>='",
358         [T_LT]                  = "'<'",
359         [T_LE]                  = "'<='",
360         [T_NOT]                 = "'!'",
361         [T_LABEL]               = "Label",
362         [T_ROOT]                = "'$'",
363         [T_THIS]                = "'@'",
364         [T_DOT]                 = "'.'",
365         [T_WILDCARD]    = "'*'",
366         [T_BROPEN]              = "'['",
367         [T_BRCLOSE]             = "']'",
368         [T_BOOL]                = "Bool",
369         [T_NUMBER]              = "Number",
370         [T_STRING]              = "String",
371         [T_POPEN]               = "'('",
372         [T_PCLOSE]              = "')'",
373 };
374
375
376 static int
377 match_token(const char *ptr, struct jp_opcode *op)
378 {
379         int i;
380         const struct token *tok;
381
382         for (i = 0, tok = &tokens[0];
383              i < sizeof(tokens) / sizeof(tokens[0]);
384                  i++, tok = &tokens[i])
385         {
386                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
387                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
388                 {
389                         op->type = tok->type;
390
391                         if (tok->parse)
392                                 return tok->parse(ptr, op);
393
394                         return tok->plen;
395                 }
396         }
397
398         return -1;
399 }
400
401 struct jp_opcode *
402 jp_get_token(struct jp_state *s, const char *input, int *mlen)
403 {
404         struct jp_opcode op = { 0 };
405
406         *mlen = match_token(input, &op);
407
408         if (*mlen < 0 || op.type == 0)
409                 return NULL;
410
411         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
412 }