X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=src%2Fregex%2Fregex_test_lib.c;h=d25799ea42819f35f12d0a14221a6f3e31be31fc;hb=ec50a665dc884f7997419d0351ae8ade9c1aeabe;hp=0fc54e83fcadd6c27a7b611d35658f42fba43c97;hpb=0486f072af2d21307db2e34b8a5ecc22927865d1;p=oweals%2Fgnunet.git diff --git a/src/regex/regex_test_lib.c b/src/regex/regex_test_lib.c index 0fc54e83f..d25799ea4 100644 --- a/src/regex/regex_test_lib.c +++ b/src/regex/regex_test_lib.c @@ -1,44 +1,185 @@ /* * This file is part of GNUnet - * (C) 2012 Christian Grothoff (and other contributing authors) - * - * GNUnet is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published - * by the Free Software Foundation; either version 3, or (at your - * option) any later version. - * + * Copyright (C) 2012-2017 GNUnet e.V. + * + * GNUnet is free software: you can redistribute it and/or modify it + * under the terms of the GNU Affero General Public License as published + * by the Free Software Foundation, either version 3 of the License, + * or (at your option) any later version. + * * GNUnet is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * Affero General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with GNUnet; see the file COPYING. If not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . */ /** * @file src/regex/regex_test_lib.c * @brief library to read regexes representing IP networks from a file. * and simplyfinying the into one big regex, in order to run - * tests (regex performance, mesh profiler). + * tests (regex performance, cadet profiler). * @author Bartlomiej Polot */ #include "platform.h" #include "gnunet_util_lib.h" -struct RegexCombineCtx { - struct RegexCombineCtx *next; - struct RegexCombineCtx *prev; - struct RegexCombineCtx *head; - struct RegexCombineCtx *tail; +/** + * Struct to hold the tree formed by prefix-combining the regexes. + */ +struct RegexCombineCtx +{ + /** + * Child nodes with same prefix and token. + */ + struct RegexCombineCtx **children; + + /** + * Alphabet size (how many @a children there are) + */ + unsigned int size; + /** + * Token. + */ char *s; }; +/** + * Char 2 int + * + * Convert a character into its int value depending on the base used + * + * @param c Char + * @param size base (2, 8 or 16(hex)) + * + * @return Int in range [0, (base-1)] + */ +static int +c2i (char c, int size) +{ + switch (size) + { + case 2: + case 8: + return c - '0'; + break; + case 16: + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + else + { + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, + "Cannot convert char %c in base %u\n", + c, size); + GNUNET_assert (0); + } + break; + default: + GNUNET_assert (0); + } +} + + +/** + * Printf spaces to indent the regex tree + * + * @param n Indentation level + */ +static void +space (int n) +{ + for (int i = 0; i < n; i++) + fprintf (stderr, "| "); +} + + +/** + * Printf the combined regex ctx. + * + * @param ctx The ctx to printf + * @param level Indentation level to start with + */ +static void +debugctx (struct RegexCombineCtx *ctx, int level) +{ +#if DEBUG_REGEX + if (NULL != ctx->s) + { + space (level - 1); + fprintf (stderr, "%u:'%s'\n", c2i(ctx->s[0], ctx->size), ctx->s); + } + else + fprintf (stderr, "ROOT (base %u)\n", ctx->size); + for (unsigned int i = 0; i < ctx->size; i++) + { + if (NULL != ctx->children[i]) + { + space (level); + debugctx (ctx->children[i], level + 1); + } + } + fflush(stderr); +#endif +} + + +/** + * Add a single regex to a context, combining with exisiting regex by-prefix. + * + * @param ctx Context with 0 or more regexes. + * @param regex Regex to add. + */ +static void +regex_add (struct RegexCombineCtx *ctx, + const char *regex); + + +/** + * Create and initialize a new RegexCombineCtx. + * + * @param alphabet_size Size of the alphabet (and the Trie array) + */ +static struct RegexCombineCtx * +new_regex_ctx (unsigned int alphabet_size) +{ + struct RegexCombineCtx *ctx; + size_t array_size; + + array_size = sizeof(struct RegexCombineCtx *) * alphabet_size; + ctx = GNUNET_new (struct RegexCombineCtx); + ctx->children = GNUNET_malloc (array_size); + ctx->size = alphabet_size; + + return ctx; +} + + +static void +move_children (struct RegexCombineCtx *dst, + const struct RegexCombineCtx *src) +{ + size_t array_size; + + array_size = sizeof(struct RegexCombineCtx *) * src->size; + GNUNET_memcpy (dst->children, + src->children, + array_size); + for (unsigned int i = 0; i < src->size; i++) + { + src->children[i] = NULL; + } +} + + /** * Extract a string from all prefix-combined regexes. * @@ -50,43 +191,235 @@ static char * regex_combine (struct RegexCombineCtx *ctx) { struct RegexCombineCtx *p; + unsigned int i; size_t len; char *regex; char *tmp; char *s; + int opt; - if (NULL != ctx->s) - GNUNET_asprintf (®ex, "%s(", ctx->s); - else - regex = GNUNET_strdup ("("); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix: %s\n", regex); - - for (p = ctx->head; NULL != p; p = p->next) + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "new combine %s\n", ctx->s); + regex = GNUNET_strdup (""); + opt = GNUNET_NO; + for (i = 0; i < ctx->size; i++) { + p = ctx->children[i]; + if (NULL == p) + continue; + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, + "adding '%s' to innner %s\n", + p->s, ctx->s); s = regex_combine (p); - GNUNET_asprintf (&tmp, "%s%s|", regex, s); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s); + if (strlen(s) == 0) + { + opt = GNUNET_YES; + } + else + { + GNUNET_asprintf (&tmp, "%s%s|", regex, s); + GNUNET_free_non_null (regex); + regex = tmp; + } GNUNET_free_non_null (s); - GNUNET_free_non_null (regex); - regex = tmp; + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " so far '%s' for inner %s\n", regex, ctx->s); } + + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "opt: %d, innner: '%s'\n", opt, regex); len = strlen (regex); - if (1 == len) + if (0 == len) { + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "empty, returning ''\n"); GNUNET_free (regex); - return GNUNET_strdup (""); + return NULL == ctx->s ? NULL : GNUNET_strdup (ctx->s); } - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "pre-partial: %s\n", regex); if ('|' == regex[len - 1]) - regex[len - 1] = ')'; - if ('(' == regex[len - 1]) regex[len - 1] = '\0'; + if (NULL != ctx->s) + { + if (opt) + GNUNET_asprintf (&s, "%s(%s)?", ctx->s, regex); + else + GNUNET_asprintf (&s, "%s(%s)", ctx->s, regex); + GNUNET_free (regex); + regex = s; + } + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "partial: %s\n", regex); return regex; } +/** + * Get the number of matching characters on the prefix of both strings. + * + * @param s1 String 1. + * @param s2 String 2. + * + * @return Number of characters of matching prefix. + */ +static unsigned int +get_prefix_length (const char *s1, const char *s2) +{ + unsigned int l1; + unsigned int l2; + unsigned int limit; + unsigned int i; + + l1 = strlen (s1); + l2 = strlen (s2); + limit = l1 > l2 ? l2 : l1; + + for (i = 0; i < limit; i++) + { + if (s1[i] != s2[i]) + return i; + } + return limit; +} + + +/** + * Return the child context with the longest prefix match with the regex. + * Usually only one child will match, search all just in case. + * + * @param ctx Context whose children to search. + * @param regex String to match. + * + * @return Child with the longest prefix, NULL if no child matches. + */ +static struct RegexCombineCtx * +get_longest_prefix (struct RegexCombineCtx *ctx, const char *regex) +{ + struct RegexCombineCtx *p; + struct RegexCombineCtx *best; + unsigned int i; + unsigned int l; + unsigned int best_l; + + best_l = 0; + best = NULL; + + for (i = 0; i < ctx->size; i++) + { + p = ctx->children[i]; + if (NULL == p) + continue; + + l = get_prefix_length (p->s, regex); + if (l > best_l) + { + GNUNET_break (0 == best_l); + best = p; + best_l = l; + } + } + return best; +} + +static void +regex_add_multiple (struct RegexCombineCtx *ctx, + const char *regex, + struct RegexCombineCtx **children) +{ + char tmp[2]; + long unsigned int i; + size_t l; + struct RegexCombineCtx *newctx; + unsigned int count; + + if ('(' != regex[0]) + { + GNUNET_assert (0); + } + + /* Does the regex cover *all* possible children? Then don't add any, + * as it will be covered by the post-regex "(a-z)*" + */ + l = strlen (regex); + count = 0; + for (i = 1UL; i < l; i++) + { + if (regex[i] != '|' && regex[i] != ')') + { + count++; + } + } + if (count == ctx->size) + { + return; + } + + /* Add every component as a child node */ + tmp[1] = '\0'; + for (i = 1UL; i < l; i++) + { + if (regex[i] != '|' && regex[i] != ')') + { + tmp[0] = regex[i]; + newctx = new_regex_ctx(ctx->size); + newctx->s = GNUNET_strdup (tmp); + if (children != NULL) + GNUNET_memcpy (newctx->children, + children, + sizeof (*children) * ctx->size); + ctx->children[c2i(tmp[0], ctx->size)] = newctx; + } + } +} + +/** + * Add a single regex to a context, splitting the exisiting state. + * + * We only had a partial match, split existing state, truncate the current node + * so it only contains the prefix, add suffix(es) as children. + * + * @param ctx Context to split. + * @param len Lenght of ctx->s + * @param prefix_l Lenght of common prefix of the new regex and @a ctx->s + */ +static void +regex_split (struct RegexCombineCtx *ctx, + unsigned int len, + unsigned int prefix_l) +{ + struct RegexCombineCtx *newctx; + unsigned int idx; + char *suffix; + + suffix = GNUNET_malloc (len - prefix_l + 1); + strncpy (suffix, &ctx->s[prefix_l], len - prefix_l + 1); + + /* Suffix saved, truncate current node so it only contains the prefix, + * copy any children nodes to put as grandchildren and initialize new empty + * children array. + */ + ctx->s[prefix_l] = '\0'; + + /* If the suffix is an OR expression, add multiple children */ + if ('(' == suffix[0]) + { + struct RegexCombineCtx **tmp; + + tmp = ctx->children; + ctx->children = GNUNET_malloc (sizeof(*tmp) * ctx->size); + regex_add_multiple (ctx, suffix, tmp); + GNUNET_free (suffix); + GNUNET_free (tmp); + return; + } + + /* The suffix is a normal string, add as one node */ + newctx = new_regex_ctx (ctx->size); + newctx->s = suffix; + move_children (newctx, ctx); + idx = c2i(suffix[0], ctx->size); + ctx->children[idx] = newctx; +} + + /** * Add a single regex to a context, combining with exisiting regex by-prefix. * @@ -97,42 +430,63 @@ static void regex_add (struct RegexCombineCtx *ctx, const char *regex) { struct RegexCombineCtx *p; - const char *rest; + struct RegexCombineCtx *newctx; + long unsigned int l; + unsigned int prefix_l; + const char *rest_r; + const char *rest_s; + size_t len; + int idx; - rest = ®ex[1]; - for (p = ctx->head; NULL != p; p = p->next) + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, + "regex_add '%s' into '%s'\n", + regex, ctx->s); + l = strlen (regex); + if (0UL == l) + return; + + /* If the regex is in the form of (a|b|c), add every character separately */ + if ('(' == regex[0]) + { + regex_add_multiple (ctx, regex, NULL); + return; + } + + p = get_longest_prefix (ctx, regex); + if (NULL != p) { - if (p->s[0] == regex[0]) + /* There is some prefix match, reduce regex and try again */ + prefix_l = get_prefix_length (p->s, regex); + rest_s = &p->s[prefix_l]; + rest_r = ®ex[prefix_l]; + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s); + len = strlen (p->s); + if (prefix_l < len) { - if (1 == strlen(p->s)) - { - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "common char %s\n", p->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "adding %s\n", rest); - regex_add (p, rest); - } - else - { - struct RegexCombineCtx *new; - new = GNUNET_malloc (sizeof (struct RegexCombineCtx)); - new->s = GNUNET_strdup (&p->s[1]); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " p has now %s\n", p->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " p will have %.1s\n", p->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " regex is %s\n", regex); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new has now %s\n", new->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " rest is now %s\n", rest); - p->s[1] = '\0'; /* dont realloc */ - GNUNET_CONTAINER_DLL_insert (p->head, p->tail, new); - regex_add (p, rest); - } - return; + regex_split (p, len, prefix_l); } + regex_add (p, rest_r); + return; + } + + /* There is no prefix match, add new */ + idx = c2i(regex[0], ctx->size); + if (NULL == ctx->children[idx] && NULL != ctx->s) + { + /* this was the end before, add empty string */ + newctx = new_regex_ctx (ctx->size); + newctx->s = GNUNET_strdup (""); + ctx->children[idx] = newctx; } - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n"); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n"); GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex); GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s); - p = GNUNET_malloc (sizeof (struct RegexCombineCtx)); - p->s = GNUNET_strdup (regex); - GNUNET_CONTAINER_DLL_insert (ctx->head, ctx->tail, p); + newctx = new_regex_ctx(ctx->size); + newctx->s = GNUNET_strdup (regex); + ctx->children[idx] = newctx; } @@ -144,43 +498,53 @@ regex_add (struct RegexCombineCtx *ctx, const char *regex) static void regex_ctx_destroy (struct RegexCombineCtx *ctx) { - struct RegexCombineCtx *p; - struct RegexCombineCtx *next; + unsigned int i; - for (p = ctx->head; NULL != p; p = next) + if (NULL == ctx) + return; + + for (i = 0; i < ctx->size; i++) { - next = p->next; - regex_ctx_destroy (p); + regex_ctx_destroy (ctx->children[i]); } - GNUNET_free (ctx->s); + GNUNET_free_non_null (ctx->s); /* 's' on root node is null */ + GNUNET_free (ctx->children); GNUNET_free (ctx); } /** - * Return a prefix-combine regex that matches the same strings as + * Combine an array of regexes into a single prefix-shared regex. + * Returns a prefix-combine regex that matches the same strings as * any of the original regexes. * * WARNING: only useful for reading specific regexes for specific applications, * namely the gnunet-regex-profiler / gnunet-regex-daemon. * This function DOES NOT support arbitrary regex combining. + * + * @param regexes A NULL-terminated array of regexes. + * @param alphabet_size Size of the alphabet the regex uses. + * + * @return A string with a single regex that matches any of the original regexes */ char * -GNUNET_REGEX_combine (char * const regexes[]) +REGEX_TEST_combine (char * const regexes[], unsigned int alphabet_size) { unsigned int i; char *combined; const char *current; struct RegexCombineCtx *ctx; - ctx = GNUNET_malloc (sizeof (struct RegexCombineCtx)); + ctx = new_regex_ctx (alphabet_size); for (i = 0; regexes[i]; i++) { current = regexes[i]; GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current); regex_add (ctx, current); + debugctx (ctx, 0); } GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n"); + debugctx (ctx, 0); combined = regex_combine (ctx); @@ -192,15 +556,15 @@ GNUNET_REGEX_combine (char * const regexes[]) /** * Read a set of regexes from a file, one per line and return them in an array - * suitable for GNUNET_REGEX_combine. - * The array must be free'd using GNUNET_REGEX_free_from_file. + * suitable for REGEX_TEST_combine. + * The array must be free'd using REGEX_TEST_free_from_file. * * @param filename Name of the file containing the regexes. * * @return A newly allocated, NULL terminated array of regexes. */ char ** -GNUNET_REGEX_read_from_file (const char *filename) +REGEX_TEST_read_from_file (const char *filename) { struct GNUNET_DISK_FileHandle *f; unsigned int nr; @@ -224,6 +588,7 @@ GNUNET_REGEX_read_from_file (const char *filename) { GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Can't get size of file %s\n", filename); + GNUNET_DISK_file_close (f); return NULL; } GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, @@ -248,22 +613,15 @@ GNUNET_REGEX_read_from_file (const char *filename) offset += len + 1; if (len < 1) continue; - if (len < 6 || strncmp (®ex[len - 6], "(0|1)*", 6) != 0) - { - GNUNET_log (GNUNET_ERROR_TYPE_WARNING, - "%s (line %u) does not end in \"(0|1)*\"\n", - buffer, nr); - } - else - { - len -= 6; - buffer[len] = '\0'; - } + regex[len] = '\0'; regex = GNUNET_realloc (regex, len + 1); GNUNET_array_grow (regexes, nr, nr + 1); + GNUNET_assert (NULL == regexes[nr - 2]); regexes[nr - 2] = regex; regexes[nr - 1] = NULL; + regex = NULL; } while (offset < size); + GNUNET_free_non_null (regex); GNUNET_free (buffer); return regexes; @@ -276,7 +634,7 @@ GNUNET_REGEX_read_from_file (const char *filename) * @param regexes NULL-terminated array of regexes. */ void -GNUNET_REGEX_free_from_file (char **regexes) +REGEX_TEST_free_from_file (char **regexes) { unsigned int i; @@ -285,4 +643,4 @@ GNUNET_REGEX_free_from_file (char **regexes) GNUNET_free (regexes); } -/* end of regex_test_lib.c */ \ No newline at end of file +/* end of regex_test_lib.c */