2 * This file is part of GNUnet
3 * Copyright (C) 2012-2017 GNUnet e.V.
5 * GNUnet is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published
7 * by the Free Software Foundation; either version 3, or (at your
8 * option) any later version.
10 * GNUnet is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNUnet; see the file COPYING. If not, write to the
17 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
21 * @file src/regex/regex_test_lib.c
22 * @brief library to read regexes representing IP networks from a file
23 * and simplify them into one big regex, in order to run
24 * tests (regex performance, cadet profiler).
25 * @author Bartlomiej Polot
29 #include "gnunet_util_lib.h"
33 * Struct to hold the tree formed by prefix-combining the regexes.
35 struct RegexCombineCtx
38 * Child nodes with same prefix and token.
40 struct RegexCombineCtx **children;
43 * Alphabet size (how many @a children there are)
57 * Convert a character into its int value depending on the base used
60 * @param size base (2, 8 or 16(hex))
62 * @return Int in range [0, (base-1)]
65 c2i (char c, int size)
74 if (c >= '0' && c <= '9')
76 else if (c >= 'A' && c <= 'F')
78 else if (c >= 'a' && c <= 'f')
82 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
83 "Cannot convert char %c in base %u\n",
95 * Print indentation markers ("| ") to indent the regex tree
97 * @param n Indentation level
103 for (i = 0; i < n; i++)
104 fprintf (stderr, "| ");
109 * Print the combined regex ctx.
111 * @param ctx The ctx to print
112 * @param level Indentation level to start with
115 debugctx (struct RegexCombineCtx *ctx, int level)
122 fprintf (stderr, "%u:'%s'\n", c2i(ctx->s[0], ctx->size), ctx->s);
125 fprintf (stderr, "ROOT (base %u)\n", ctx->size);
126 for (i = 0; i < ctx->size; i++)
128 if (NULL != ctx->children[i])
131 debugctx (ctx->children[i], level + 1);
139 * Add a single regex to a context, combining with existing regex by-prefix.
141 * @param ctx Context with 0 or more regexes.
142 * @param regex Regex to add.
145 regex_add (struct RegexCombineCtx *ctx, const char *regex);
149 * Create and initialize a new RegexCombineCtx.
151 * @param alphabet_size Size of the alphabet (and the Trie array)
153 static struct RegexCombineCtx *
154 new_regex_ctx (unsigned int alphabet_size)
156 struct RegexCombineCtx *ctx;
159 array_size = sizeof(struct RegexCombineCtx *) * alphabet_size;
160 ctx = GNUNET_new (struct RegexCombineCtx);
161 ctx->children = GNUNET_malloc (array_size);
162 ctx->size = alphabet_size;
168 move_children (struct RegexCombineCtx *dst, const struct RegexCombineCtx *src)
172 array_size = sizeof(struct RegexCombineCtx *) * src->size;
173 memcpy (dst->children, src->children, array_size);
174 for (int i = 0; i < src->size; i++)
176 src->children[i] = NULL;
182 * Extract a string from all prefix-combined regexes.
184 * @param ctx Context with 0 or more regexes.
186 * @return Regex that matches any of the added regexes.
189 regex_combine (struct RegexCombineCtx *ctx)
191 struct RegexCombineCtx *p;
199 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "new combine %s\n", ctx->s);
200 regex = GNUNET_strdup ("");
202 for (i = 0; i < ctx->size; i++)
204 p = ctx->children[i];
207 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
208 "adding '%s' to innner %s\n",
210 s = regex_combine (p);
211 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s);
218 GNUNET_asprintf (&tmp, "%s%s|", regex, s);
219 GNUNET_free_non_null (regex);
222 GNUNET_free_non_null (s);
223 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " so far '%s' for inner %s\n", regex, ctx->s);
226 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "opt: %d, innner: '%s'\n", opt, regex);
227 len = strlen (regex);
230 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "empty, returning ''\n");
232 return NULL == ctx->s ? NULL : GNUNET_strdup (ctx->s);
235 if ('|' == regex[len - 1])
236 regex[len - 1] = '\0';
241 GNUNET_asprintf (&s, "%s(%s)?", ctx->s, regex);
243 GNUNET_asprintf (&s, "%s(%s)", ctx->s, regex);
248 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "partial: %s\n", regex);
254 * Get the number of matching characters on the prefix of both strings.
256 * @param s1 String 1.
257 * @param s2 String 2.
259 * @return Number of characters of matching prefix.
262 get_prefix_length (const char *s1, const char *s2)
271 limit = l1 > l2 ? l2 : l1;
273 for (i = 0; i < limit; i++)
283 * Return the child context with the longest prefix match with the regex.
284 * Usually only one child will match, search all just in case.
286 * @param ctx Context whose children to search.
287 * @param regex String to match.
289 * @return Child with the longest prefix, NULL if no child matches.
291 static struct RegexCombineCtx *
292 get_longest_prefix (struct RegexCombineCtx *ctx, const char *regex)
294 struct RegexCombineCtx *p;
295 struct RegexCombineCtx *best;
303 for (i = 0; i < ctx->size; i++)
305 p = ctx->children[i];
309 l = get_prefix_length (p->s, regex);
312 GNUNET_break (0 == best_l);
321 regex_add_multiple (struct RegexCombineCtx *ctx,
323 struct RegexCombineCtx **children)
328 struct RegexCombineCtx *newctx;
336 /* Does the regex cover *all* possible children? Then don't add any,
337 * as it will be covered by the post-regex "(a-z)*"
341 for (i = 1UL; i < l; i++)
343 if (regex[i] != '|' && regex[i] != ')')
348 if (count == ctx->size)
353 /* Add every component as a child node */
355 for (i = 1UL; i < l; i++)
357 if (regex[i] != '|' && regex[i] != ')')
360 newctx = new_regex_ctx(ctx->size);
361 newctx->s = GNUNET_strdup (tmp);
362 if (children != NULL)
363 memcpy (newctx->children, children, sizeof (*children) * ctx->size);
364 ctx->children[c2i(tmp[0], ctx->size)] = newctx;
370 * Add a single regex to a context, splitting the existing state.
372 * We only had a partial match, split existing state, truncate the current node
373 * so it only contains the prefix, add suffix(es) as children.
375 * @param ctx Context to split.
376 * @param len Length of ctx->s
377 * @param prefix_l Length of common prefix of the new regex and @a ctx->s
380 regex_split (struct RegexCombineCtx *ctx,
382 unsigned int prefix_l)
384 struct RegexCombineCtx *newctx;
388 suffix = GNUNET_malloc (len - prefix_l + 1);
389 strncpy (suffix, &ctx->s[prefix_l], len - prefix_l + 1);
391 /* Suffix saved, truncate current node so it only contains the prefix,
392 * copy any children nodes to put as grandchildren and initialize new empty
395 ctx->s[prefix_l] = '\0';
397 /* If the suffix is an OR expression, add multiple children */
398 if ('(' == suffix[0])
400 struct RegexCombineCtx **tmp;
403 ctx->children = GNUNET_malloc (sizeof(*tmp) * ctx->size);
404 regex_add_multiple (ctx, suffix, tmp);
409 /* The suffix is a normal string, add as one node */
410 newctx = new_regex_ctx (ctx->size);
412 move_children (newctx, ctx);
413 idx = c2i(suffix[0], ctx->size);
414 ctx->children[idx] = newctx;
419 * Add a single regex to a context, combining with existing regex by-prefix.
421 * @param ctx Context with 0 or more regexes.
422 * @param regex Regex to add.
425 regex_add (struct RegexCombineCtx *ctx, const char *regex)
427 struct RegexCombineCtx *p;
428 struct RegexCombineCtx *newctx;
430 unsigned int prefix_l;
436 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
437 "regex_add '%s' into '%s'\n",
443 /* If the regex is in the form of (a|b|c), add every character separately */
446 regex_add_multiple (ctx, regex, NULL);
450 p = get_longest_prefix (ctx, regex);
453 /* There is some prefix match, reduce regex and try again */
454 prefix_l = get_prefix_length (p->s, regex);
455 rest_s = &p->s[prefix_l];
456 rest_r = &regex[prefix_l];
457 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l);
458 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s);
459 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r);
460 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s);
464 regex_split (p, len, prefix_l);
466 regex_add (p, rest_r);
470 /* There is no prefix match, add new */
471 idx = c2i(regex[0], ctx->size);
472 if (NULL == ctx->children[idx] && NULL != ctx->s)
474 /* this was the end before, add empty string */
475 newctx = new_regex_ctx (ctx->size);
476 newctx->s = GNUNET_strdup ("");
477 ctx->children[idx] = newctx;
479 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n");
480 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex);
481 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s);
482 newctx = new_regex_ctx(ctx->size);
483 newctx->s = GNUNET_strdup (regex);
484 ctx->children[idx] = newctx;
489 * Free all resources used by the context node and all its children.
491 * @param ctx Context to free.
494 regex_ctx_destroy (struct RegexCombineCtx *ctx)
501 for (i = 0; i < ctx->size; i++)
503 regex_ctx_destroy (ctx->children[i]);
505 GNUNET_free_non_null (ctx->s); /* 's' on root node is null */
506 GNUNET_free (ctx->children);
512 * Combine an array of regexes into a single prefix-shared regex.
513 * Returns a prefix-combined regex that matches the same strings as
514 * any of the original regexes.
516 * WARNING: only useful for reading specific regexes for specific applications,
517 * namely the gnunet-regex-profiler / gnunet-regex-daemon.
518 * This function DOES NOT support arbitrary regex combining.
520 * @param regexes A NULL-terminated array of regexes.
521 * @param alphabet_size Size of the alphabet the regex uses.
523 * @return A string with a single regex that matches any of the original regexes
526 REGEX_TEST_combine (char * const regexes[], unsigned int alphabet_size)
531 struct RegexCombineCtx *ctx;
533 ctx = new_regex_ctx (alphabet_size);
534 for (i = 0; regexes[i]; i++)
536 current = regexes[i];
537 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current);
538 regex_add (ctx, current);
541 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n");
544 combined = regex_combine (ctx);
546 regex_ctx_destroy (ctx);
553 * Read a set of regexes from a file, one per line and return them in an array
554 * suitable for REGEX_TEST_combine.
555 * The array must be free'd using REGEX_TEST_free_from_file.
557 * @param filename Name of the file containing the regexes.
559 * @return A newly allocated, NULL terminated array of regexes.
562 REGEX_TEST_read_from_file (const char *filename)
564 struct GNUNET_DISK_FileHandle *f;
573 f = GNUNET_DISK_file_open (filename,
574 GNUNET_DISK_OPEN_READ,
575 GNUNET_DISK_PERM_NONE);
578 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
579 "Can't open file %s for reading\n", filename);
582 if (GNUNET_OK != GNUNET_DISK_file_handle_size (f, &size))
584 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
585 "Can't get size of file %s\n", filename);
586 GNUNET_DISK_file_close (f);
589 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
590 "using file %s, size %llu\n",
591 filename, (unsigned long long) size);
593 buffer = GNUNET_malloc (size + 1);
594 GNUNET_DISK_file_read (f, buffer, size);
595 GNUNET_DISK_file_close (f);
596 regexes = GNUNET_malloc (sizeof (char *));
603 regex = GNUNET_malloc (size + 1);
604 len = (size_t) sscanf (&buffer[offset], "%s", regex);
607 len = strlen (regex);
612 regex = GNUNET_realloc (regex, len + 1);
613 GNUNET_array_grow (regexes, nr, nr + 1);
614 GNUNET_assert (NULL == regexes[nr - 2]);
615 regexes[nr - 2] = regex;
616 regexes[nr - 1] = NULL;
618 } while (offset < size);
619 GNUNET_free_non_null (regex);
620 GNUNET_free (buffer);
627 * Free all memory reserved for a set of regexes created by read_from_file.
629 * @param regexes NULL-terminated array of regexes.
632 REGEX_TEST_free_from_file (char **regexes)
636 for (i = 0; regexes[i]; i++)
637 GNUNET_free (regexes[i]);
638 GNUNET_free (regexes);
641 /* end of regex_test_lib.c */