From c7e938d6582a436dddc938539e72dd1320625c54 Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Sun, 4 Feb 2018 18:36:07 +0100 Subject: [PATCH] implement POSIX regexp support Introduce a new operator `~` and new `/.../eis` regular expression syntax. This allows filtering by regular expression, e.g. jsonfilter -s '[ "foo", "bar", "baz" ]' -e '$[@ ~ /^b/]' ... would yield the values `bar` and `baz`. Possible regular expression modifiers are: - `e` ... enable extended POSIX regular expressions - `i` ... perform case-insensitive matches - `s` ... let ranges and `.` match the newline character A regular expression literal may occur on the left or the right side of the `~` operator, but not on both. In case neither side of the `~` operator is a regular expression, the right side will be treated as a regular expression pattern. Non-string values are converted to their string representation before performing matching. Signed-off-by: Jo-Philipp Wich --- lexer.c | 75 ++++++++++++++++++++++++++++++++++++++++-- lexer.h | 2 +- matcher.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ matcher.h | 2 ++ parser.y | 4 ++- 5 files changed, 176 insertions(+), 4 deletions(-) diff --git a/lexer.c b/lexer.c index ca5880e..c016d41 100644 --- a/lexer.c +++ b/lexer.c @@ -18,6 +18,7 @@ #include #include #include +#include <regex.h> #include "ast.h" #include "lexer.h" @@ -236,7 +237,21 @@ parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s) case 'r': *out = '\r'; break; case 't': *out = '\t'; break; case 'v': *out = '\v'; break; - default: *out = *in; break; + default: + /* in regexp mode, retain backslash */ + if (q == '/') + { + if (rem-- < 1) + { + s->error_pos = s->off + (in - buf); + return -3; + } + + *out++ = '\\'; + } + + *out = *in; + break; } in++; @@ -277,6 +292,58 @@ parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s) } +/* + * Parses a regexp literal from the given buffer.
+ * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer, including any trailing e/i/s modifiers. + * + * Error values: + * -1 Unterminated regexp + * -2 Invalid escape sequence + * -3 Regexp literal too long + */ + +static int +parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s) +{ + int len = parse_string(buf, op, s); + const char *p; + + if (len >= 2) + { + op->num = REG_NOSUB | REG_NEWLINE; + + for (p = buf + len; *p; p++) + { + switch (*p) + { + case 'e': + op->num |= REG_EXTENDED; + len++; + break; + + case 'i': + op->num |= REG_ICASE; + len++; + break; + + case 's': + op->num &= ~REG_NEWLINE; + len++; + break; + + default: + return len; + } + } + + } + + return len; +} + + /* * Parses a label from the given buffer. * @@ -367,8 +434,10 @@ static const struct token tokens[] = { { T_LT, "<", 1 }, { T_GT, ">", 1 }, { T_EQ, "=", 1 }, + { T_MATCH, "~", 1 }, { T_NOT, "!", 1 }, { T_WILDCARD, "*", 1 }, + { T_REGEXP, "/", 1, parse_regexp }, { T_STRING, "'", 1, parse_string }, { T_STRING, "\"", 1, parse_string }, { T_LABEL, "_", 1, parse_label }, @@ -378,7 +447,7 @@ static const struct token tokens[] = { { T_NUMBER, "09", 0, parse_number }, }; -const char *tokennames[23] = { +const char *tokennames[25] = { [0] = "End of file", [T_AND] = "'&&'", [T_OR] = "'||'", @@ -389,12 +458,14 @@ const char *tokennames[23] = { [T_GE] = "'>='", [T_LT] = "'<'", [T_LE] = "'<='", + [T_MATCH] = "'~'", [T_NOT] = "'!'", [T_LABEL] = "Label", [T_ROOT] = "'$'", [T_THIS] = "'@'", [T_DOT] = "'.'", [T_WILDCARD] = "'*'", + [T_REGEXP] = "/.../", [T_BROPEN] = "'['", [T_BRCLOSE] = "']'", [T_BOOL] = "Bool", diff --git a/lexer.h b/lexer.h index 0906f76..a47c154 100644 --- a/lexer.h +++ b/lexer.h @@ -19,7 +19,7 @@ #include "ast.h" -extern const char *tokennames[23]; +extern const char *tokennames[25]; struct jp_opcode * jp_get_token(struct jp_state *s, const char *input, int *mlen); diff --git a/matcher.c b/matcher.c index 85bd1c5..d2a8767 100644 --- a/matcher.c +++
b/matcher.c @@ -17,6 +17,7 @@ #include "parser.h" #include "matcher.h" + static struct json_object * jp_match_next(struct jp_opcode *ptr, struct json_object *root, struct json_object *cur, @@ -130,6 +131,99 @@ jp_cmp(struct jp_opcode *op, struct json_object *root, struct json_object *cur) } } +static bool +jp_regmatch(struct jp_opcode *op, struct json_object *root, struct json_object *cur) +{ + struct jp_opcode left, right; + char lbuf[22], rbuf[22], *lval, *rval; + int err, rflags = REG_NOSUB | REG_NEWLINE; + regex_t preg; + + + if (!jp_resolve(root, cur, op->down, &left) || + !jp_resolve(root, cur, op->down->sibling, &right)) + return false; + + if (left.type == T_REGEXP) + { + switch (right.type) + { + case T_BOOL: + lval = right.num ? "true" : "false"; + break; + + case T_NUMBER: + snprintf(lbuf, sizeof(lbuf), "%d", right.num); + lval = lbuf; + break; + + case T_STRING: + lval = right.str; + break; + + default: + return false; + } + + rval = left.str; + rflags = left.num; + } + else + { + switch (left.type) + { + case T_BOOL: + lval = left.num ? "true" : "false"; + break; + + case T_NUMBER: + snprintf(lbuf, sizeof(lbuf), "%d", left.num); + lval = lbuf; + break; + + case T_STRING: + lval = left.str; + break; + + default: + return false; + } + + switch (right.type) + { + case T_BOOL: + rval = right.num ? "true" : "false"; + break; + + case T_NUMBER: + snprintf(rbuf, sizeof(rbuf), "%d", right.num); + rval = rbuf; + break; + + case T_STRING: + rval = right.str; + break; + + case T_REGEXP: + rval = right.str; + rflags = right.num; + break; + + default: + return false; + } + } + + if (regcomp(&preg, rval, rflags)) + return false; + + err = regexec(&preg, lval, 0, NULL, 0); + + regfree(&preg); + + return err ? 
false : true; +} + static bool jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur, int idx, const char *key, jp_match_cb_t cb, void *priv) @@ -149,6 +243,9 @@ jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur, case T_GE: return jp_cmp(op, root, cur); + case T_MATCH: + return jp_regmatch(op, root, cur); + case T_ROOT: return !!jp_match(op, root, NULL, NULL); diff --git a/matcher.h b/matcher.h index 468ddf2..aac21b9 100644 --- a/matcher.h +++ b/matcher.h @@ -19,6 +19,8 @@ #include #include +#include +#include #ifdef JSONC #include diff --git a/parser.y b/parser.y index 29b43ba..4d3581e 100644 --- a/parser.y +++ b/parser.y @@ -20,7 +20,7 @@ %left T_AND. %left T_OR. %left T_UNION. -%nonassoc T_EQ T_NE T_GT T_GE T_LT T_LE. +%nonassoc T_EQ T_NE T_GT T_GE T_LT T_LE T_MATCH. %right T_NOT. %include { @@ -87,11 +87,13 @@ cmp_exp(A) ::= unary_exp(B) T_GT unary_exp(C). { A = alloc_op(T_GT, 0, NULL, B, cmp_exp(A) ::= unary_exp(B) T_GE unary_exp(C). { A = alloc_op(T_GE, 0, NULL, B, C); } cmp_exp(A) ::= unary_exp(B) T_EQ unary_exp(C). { A = alloc_op(T_EQ, 0, NULL, B, C); } cmp_exp(A) ::= unary_exp(B) T_NE unary_exp(C). { A = alloc_op(T_NE, 0, NULL, B, C); } +cmp_exp(A) ::= unary_exp(B) T_MATCH unary_exp(C). { A = alloc_op(T_MATCH, 0, NULL, B, C); } cmp_exp(A) ::= unary_exp(B). { A = B; } unary_exp(A) ::= T_BOOL(B). { A = B; } unary_exp(A) ::= T_NUMBER(B). { A = B; } unary_exp(A) ::= T_STRING(B). { A = B; } +unary_exp(A) ::= T_REGEXP(B). { A = B; } unary_exp(A) ::= T_WILDCARD(B). { A = B; } unary_exp(A) ::= T_POPEN or_exps(B) T_PCLOSE. { A = B; } unary_exp(A) ::= T_NOT unary_exp(B). { A = alloc_op(T_NOT, 0, NULL, B); } -- 2.25.1