2 * This file is part of GNUnet
3 * Copyright (C) 2012-2017 GNUnet e.V.
5 * GNUnet is free software: you can redistribute it and/or modify it
6 * under the terms of the GNU Affero General Public License as published
7 * by the Free Software Foundation, either version 3 of the License,
8 * or (at your option) any later version.
10 * GNUnet is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Affero General Public License for more details.
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 SPDX-License-Identifier: AGPL3.0-or-later
21 * @file src/regex/regex_test_lib.c
22 * @brief library to read regexes representing IP networks from a file.
23 * and simplyfinying the into one big regex, in order to run
24 * tests (regex performance, cadet profiler).
25 * @author Bartlomiej Polot
29 #include "gnunet_util_lib.h"
33 * Struct to hold the tree formed by prefix-combining the regexes.
35 struct RegexCombineCtx
38 * Child nodes with same prefix and token.
40 struct RegexCombineCtx **children;
43 * Alphabet size (how many @a children there are)
57 * Convert a character into its int value depending on the base used
60 * @param size base (2, 8 or 16(hex))
62 * @return Int in range [0, (base-1)]
65 c2i (char c, int size)
74 if (c >= '0' && c <= '9')
76 else if (c >= 'A' && c <= 'F')
78 else if (c >= 'a' && c <= 'f')
82 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
83 "Cannot convert char %c in base %u\n",
95 * Printf spaces to indent the regex tree
97 * @param n Indentation level
102 for (int i = 0; i < n; i++)
103 fprintf (stderr, "| ");
108 * Printf the combined regex ctx.
110 * @param ctx The ctx to printf
111 * @param level Indentation level to start with
114 debugctx (struct RegexCombineCtx *ctx, int level)
120 fprintf (stderr, "%u:'%s'\n", c2i(ctx->s[0], ctx->size), ctx->s);
123 fprintf (stderr, "ROOT (base %u)\n", ctx->size);
124 for (unsigned int i = 0; i < ctx->size; i++)
126 if (NULL != ctx->children[i])
129 debugctx (ctx->children[i], level + 1);
138 * Add a single regex to a context, combining with exisiting regex by-prefix.
140 * @param ctx Context with 0 or more regexes.
141 * @param regex Regex to add.
144 regex_add (struct RegexCombineCtx *ctx,
149 * Create and initialize a new RegexCombineCtx.
151 * @param alphabet_size Size of the alphabet (and the Trie array)
153 static struct RegexCombineCtx *
154 new_regex_ctx (unsigned int alphabet_size)
156 struct RegexCombineCtx *ctx;
159 array_size = sizeof(struct RegexCombineCtx *) * alphabet_size;
160 ctx = GNUNET_new (struct RegexCombineCtx);
161 ctx->children = GNUNET_malloc (array_size);
162 ctx->size = alphabet_size;
169 move_children (struct RegexCombineCtx *dst,
170 const struct RegexCombineCtx *src)
174 array_size = sizeof(struct RegexCombineCtx *) * src->size;
175 GNUNET_memcpy (dst->children,
178 for (unsigned int i = 0; i < src->size; i++)
180 src->children[i] = NULL;
186 * Extract a string from all prefix-combined regexes.
188 * @param ctx Context with 0 or more regexes.
190 * @return Regex that matches any of the added regexes.
193 regex_combine (struct RegexCombineCtx *ctx)
195 struct RegexCombineCtx *p;
203 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "new combine %s\n", ctx->s);
204 regex = GNUNET_strdup ("");
206 for (i = 0; i < ctx->size; i++)
208 p = ctx->children[i];
211 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
212 "adding '%s' to innner %s\n",
214 s = regex_combine (p);
215 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s);
222 GNUNET_asprintf (&tmp, "%s%s|", regex, s);
223 GNUNET_free_non_null (regex);
226 GNUNET_free_non_null (s);
227 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " so far '%s' for inner %s\n", regex, ctx->s);
230 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "opt: %d, innner: '%s'\n", opt, regex);
231 len = strlen (regex);
234 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "empty, returning ''\n");
236 return NULL == ctx->s ? NULL : GNUNET_strdup (ctx->s);
239 if ('|' == regex[len - 1])
240 regex[len - 1] = '\0';
245 GNUNET_asprintf (&s, "%s(%s)?", ctx->s, regex);
247 GNUNET_asprintf (&s, "%s(%s)", ctx->s, regex);
252 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "partial: %s\n", regex);
258 * Get the number of matching characters on the prefix of both strings.
260 * @param s1 String 1.
261 * @param s2 String 2.
263 * @return Number of characters of matching prefix.
266 get_prefix_length (const char *s1, const char *s2)
275 limit = l1 > l2 ? l2 : l1;
277 for (i = 0; i < limit; i++)
287 * Return the child context with the longest prefix match with the regex.
288 * Usually only one child will match, search all just in case.
290 * @param ctx Context whose children to search.
291 * @param regex String to match.
293 * @return Child with the longest prefix, NULL if no child matches.
295 static struct RegexCombineCtx *
296 get_longest_prefix (struct RegexCombineCtx *ctx, const char *regex)
298 struct RegexCombineCtx *p;
299 struct RegexCombineCtx *best;
307 for (i = 0; i < ctx->size; i++)
309 p = ctx->children[i];
313 l = get_prefix_length (p->s, regex);
316 GNUNET_break (0 == best_l);
325 regex_add_multiple (struct RegexCombineCtx *ctx,
327 struct RegexCombineCtx **children)
332 struct RegexCombineCtx *newctx;
340 /* Does the regex cover *all* possible children? Then don't add any,
341 * as it will be covered by the post-regex "(a-z)*"
345 for (i = 1UL; i < l; i++)
347 if (regex[i] != '|' && regex[i] != ')')
352 if (count == ctx->size)
357 /* Add every component as a child node */
359 for (i = 1UL; i < l; i++)
361 if (regex[i] != '|' && regex[i] != ')')
364 newctx = new_regex_ctx(ctx->size);
365 newctx->s = GNUNET_strdup (tmp);
366 if (children != NULL)
367 GNUNET_memcpy (newctx->children,
369 sizeof (*children) * ctx->size);
370 ctx->children[c2i(tmp[0], ctx->size)] = newctx;
376 * Add a single regex to a context, splitting the exisiting state.
378 * We only had a partial match, split existing state, truncate the current node
379 * so it only contains the prefix, add suffix(es) as children.
381 * @param ctx Context to split.
382 * @param len Lenght of ctx->s
383 * @param prefix_l Lenght of common prefix of the new regex and @a ctx->s
386 regex_split (struct RegexCombineCtx *ctx,
388 unsigned int prefix_l)
390 struct RegexCombineCtx *newctx;
394 suffix = GNUNET_malloc (len - prefix_l + 1);
395 strncpy (suffix, &ctx->s[prefix_l], len - prefix_l + 1);
397 /* Suffix saved, truncate current node so it only contains the prefix,
398 * copy any children nodes to put as grandchildren and initialize new empty
401 ctx->s[prefix_l] = '\0';
403 /* If the suffix is an OR expression, add multiple children */
404 if ('(' == suffix[0])
406 struct RegexCombineCtx **tmp;
409 ctx->children = GNUNET_malloc (sizeof(*tmp) * ctx->size);
410 regex_add_multiple (ctx, suffix, tmp);
411 GNUNET_free (suffix);
416 /* The suffix is a normal string, add as one node */
417 newctx = new_regex_ctx (ctx->size);
419 move_children (newctx, ctx);
420 idx = c2i(suffix[0], ctx->size);
421 ctx->children[idx] = newctx;
426 * Add a single regex to a context, combining with exisiting regex by-prefix.
428 * @param ctx Context with 0 or more regexes.
429 * @param regex Regex to add.
432 regex_add (struct RegexCombineCtx *ctx, const char *regex)
434 struct RegexCombineCtx *p;
435 struct RegexCombineCtx *newctx;
437 unsigned int prefix_l;
443 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
444 "regex_add '%s' into '%s'\n",
450 /* If the regex is in the form of (a|b|c), add every character separately */
453 regex_add_multiple (ctx, regex, NULL);
457 p = get_longest_prefix (ctx, regex);
460 /* There is some prefix match, reduce regex and try again */
461 prefix_l = get_prefix_length (p->s, regex);
462 rest_s = &p->s[prefix_l];
463 rest_r = ®ex[prefix_l];
464 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l);
465 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s);
466 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r);
467 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s);
471 regex_split (p, len, prefix_l);
473 regex_add (p, rest_r);
477 /* There is no prefix match, add new */
478 idx = c2i(regex[0], ctx->size);
479 if (NULL == ctx->children[idx] && NULL != ctx->s)
481 /* this was the end before, add empty string */
482 newctx = new_regex_ctx (ctx->size);
483 newctx->s = GNUNET_strdup ("");
484 ctx->children[idx] = newctx;
486 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n");
487 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex);
488 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s);
489 newctx = new_regex_ctx(ctx->size);
490 newctx->s = GNUNET_strdup (regex);
491 ctx->children[idx] = newctx;
496 * Free all resources used by the context node and all its children.
498 * @param ctx Context to free.
501 regex_ctx_destroy (struct RegexCombineCtx *ctx)
508 for (i = 0; i < ctx->size; i++)
510 regex_ctx_destroy (ctx->children[i]);
512 GNUNET_free_non_null (ctx->s); /* 's' on root node is null */
513 GNUNET_free (ctx->children);
519 * Combine an array of regexes into a single prefix-shared regex.
520 * Returns a prefix-combine regex that matches the same strings as
521 * any of the original regexes.
523 * WARNING: only useful for reading specific regexes for specific applications,
524 * namely the gnunet-regex-profiler / gnunet-regex-daemon.
525 * This function DOES NOT support arbitrary regex combining.
527 * @param regexes A NULL-terminated array of regexes.
528 * @param alphabet_size Size of the alphabet the regex uses.
530 * @return A string with a single regex that matches any of the original regexes
533 REGEX_TEST_combine (char * const regexes[], unsigned int alphabet_size)
538 struct RegexCombineCtx *ctx;
540 ctx = new_regex_ctx (alphabet_size);
541 for (i = 0; regexes[i]; i++)
543 current = regexes[i];
544 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current);
545 regex_add (ctx, current);
548 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n");
551 combined = regex_combine (ctx);
553 regex_ctx_destroy (ctx);
560 * Read a set of regexes from a file, one per line and return them in an array
561 * suitable for REGEX_TEST_combine.
562 * The array must be free'd using REGEX_TEST_free_from_file.
564 * @param filename Name of the file containing the regexes.
566 * @return A newly allocated, NULL terminated array of regexes.
569 REGEX_TEST_read_from_file (const char *filename)
571 struct GNUNET_DISK_FileHandle *f;
580 f = GNUNET_DISK_file_open (filename,
581 GNUNET_DISK_OPEN_READ,
582 GNUNET_DISK_PERM_NONE);
585 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
586 "Can't open file %s for reading\n", filename);
589 if (GNUNET_OK != GNUNET_DISK_file_handle_size (f, &size))
591 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
592 "Can't get size of file %s\n", filename);
593 GNUNET_DISK_file_close (f);
596 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
597 "using file %s, size %llu\n",
598 filename, (unsigned long long) size);
600 buffer = GNUNET_malloc (size + 1);
601 GNUNET_DISK_file_read (f, buffer, size);
602 GNUNET_DISK_file_close (f);
603 regexes = GNUNET_malloc (sizeof (char *));
610 regex = GNUNET_malloc (size + 1);
611 len = (size_t) sscanf (&buffer[offset], "%s", regex);
614 len = strlen (regex);
619 regex = GNUNET_realloc (regex, len + 1);
620 GNUNET_array_grow (regexes, nr, nr + 1);
621 GNUNET_assert (NULL == regexes[nr - 2]);
622 regexes[nr - 2] = regex;
623 regexes[nr - 1] = NULL;
625 } while (offset < size);
626 GNUNET_free_non_null (regex);
627 GNUNET_free (buffer);
634 * Free all memory reserved for a set of regexes created by read_from_file.
636 * @param regexes NULL-terminated array of regexes.
639 REGEX_TEST_free_from_file (char **regexes)
643 for (i = 0; regexes[i]; i++)
644 GNUNET_free (regexes[i]);
645 GNUNET_free (regexes);
648 /* end of regex_test_lib.c */