2 * This file is part of GNUnet
3 * Copyright (C) 2012-2017 GNUnet e.V.
5 * GNUnet is free software: you can redistribute it and/or modify it
6 * under the terms of the GNU Affero General Public License as published
7 * by the Free Software Foundation, either version 3 of the License,
8 * or (at your option) any later version.
10 * GNUnet is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Affero General Public License for more details.
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 SPDX-License-Identifier: AGPL3.0-or-later
21 * @file src/regex/regex_test_lib.c
22 * @brief library to read regexes representing IP networks from a file.
23 * and simplyfinying the into one big regex, in order to run
24 * tests (regex performance, cadet profiler).
25 * @author Bartlomiej Polot
29 #include "gnunet_util_lib.h"
33 * Struct to hold the tree formed by prefix-combining the regexes.
35 struct RegexCombineCtx
38 * Child nodes with same prefix and token.
40 struct RegexCombineCtx **children;
43 * Alphabet size (how many @a children there are)
57 * Convert a character into its int value depending on the base used
60 * @param size base (2, 8 or 16(hex))
62 * @return Int in range [0, (base-1)]
65 c2i (char c, int size)
75 if ((c >= '0') && (c <= '9') )
77 else if ((c >= 'A') && (c <= 'F') )
79 else if ((c >= 'a') && (c <= 'f') )
83 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
84 "Cannot convert char %c in base %u\n",
97 * Printf spaces to indent the regex tree
99 * @param n Indentation level
104 for (int i = 0; i < n; i++)
105 fprintf (stderr, "| ");
110 * Printf the combined regex ctx.
112 * @param ctx The ctx to printf
113 * @param level Indentation level to start with
116 debugctx (struct RegexCombineCtx *ctx, int level)
122 fprintf (stderr, "%u:'%s'\n", c2i (ctx->s[0], ctx->size), ctx->s);
125 fprintf (stderr, "ROOT (base %u)\n", ctx->size);
126 for (unsigned int i = 0; i < ctx->size; i++)
128 if (NULL != ctx->children[i])
131 debugctx (ctx->children[i], level + 1);
140 * Add a single regex to a context, combining with exisiting regex by-prefix.
142 * @param ctx Context with 0 or more regexes.
143 * @param regex Regex to add.
146 regex_add (struct RegexCombineCtx *ctx,
151 * Create and initialize a new RegexCombineCtx.
153 * @param alphabet_size Size of the alphabet (and the Trie array)
155 static struct RegexCombineCtx *
156 new_regex_ctx (unsigned int alphabet_size)
158 struct RegexCombineCtx *ctx;
161 array_size = sizeof(struct RegexCombineCtx *) * alphabet_size;
162 ctx = GNUNET_new (struct RegexCombineCtx);
163 ctx->children = GNUNET_malloc (array_size);
164 ctx->size = alphabet_size;
171 move_children (struct RegexCombineCtx *dst,
172 const struct RegexCombineCtx *src)
176 array_size = sizeof(struct RegexCombineCtx *) * src->size;
177 GNUNET_memcpy (dst->children,
180 for (unsigned int i = 0; i < src->size; i++)
182 src->children[i] = NULL;
188 * Extract a string from all prefix-combined regexes.
190 * @param ctx Context with 0 or more regexes.
192 * @return Regex that matches any of the added regexes.
195 regex_combine (struct RegexCombineCtx *ctx)
197 struct RegexCombineCtx *p;
205 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "new combine %s\n", ctx->s);
206 regex = GNUNET_strdup ("");
208 for (i = 0; i < ctx->size; i++)
210 p = ctx->children[i];
213 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
214 "adding '%s' to innner %s\n",
216 s = regex_combine (p);
217 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s);
224 GNUNET_asprintf (&tmp, "%s%s|", regex, s);
225 GNUNET_free_non_null (regex);
228 GNUNET_free_non_null (s);
229 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " so far '%s' for inner %s\n", regex,
233 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "opt: %d, innner: '%s'\n", opt, regex);
234 len = strlen (regex);
237 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "empty, returning ''\n");
239 return NULL == ctx->s ? NULL : GNUNET_strdup (ctx->s);
242 if ('|' == regex[len - 1])
243 regex[len - 1] = '\0';
248 GNUNET_asprintf (&s, "%s(%s)?", ctx->s, regex);
250 GNUNET_asprintf (&s, "%s(%s)", ctx->s, regex);
255 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "partial: %s\n", regex);
261 * Get the number of matching characters on the prefix of both strings.
263 * @param s1 String 1.
264 * @param s2 String 2.
266 * @return Number of characters of matching prefix.
269 get_prefix_length (const char *s1, const char *s2)
278 limit = l1 > l2 ? l2 : l1;
280 for (i = 0; i < limit; i++)
290 * Return the child context with the longest prefix match with the regex.
291 * Usually only one child will match, search all just in case.
293 * @param ctx Context whose children to search.
294 * @param regex String to match.
296 * @return Child with the longest prefix, NULL if no child matches.
298 static struct RegexCombineCtx *
299 get_longest_prefix (struct RegexCombineCtx *ctx, const char *regex)
301 struct RegexCombineCtx *p;
302 struct RegexCombineCtx *best;
310 for (i = 0; i < ctx->size; i++)
312 p = ctx->children[i];
316 l = get_prefix_length (p->s, regex);
319 GNUNET_break (0 == best_l);
329 regex_add_multiple (struct RegexCombineCtx *ctx,
331 struct RegexCombineCtx **children)
336 struct RegexCombineCtx *newctx;
344 /* Does the regex cover *all* possible children? Then don't add any,
345 * as it will be covered by the post-regex "(a-z)*"
349 for (i = 1UL; i < l; i++)
351 if ((regex[i] != '|') && (regex[i] != ')') )
356 if (count == ctx->size)
361 /* Add every component as a child node */
363 for (i = 1UL; i < l; i++)
365 if ((regex[i] != '|') && (regex[i] != ')') )
368 newctx = new_regex_ctx (ctx->size);
369 newctx->s = GNUNET_strdup (tmp);
370 if (children != NULL)
371 GNUNET_memcpy (newctx->children,
373 sizeof(*children) * ctx->size);
374 ctx->children[c2i (tmp[0], ctx->size)] = newctx;
381 * Add a single regex to a context, splitting the exisiting state.
383 * We only had a partial match, split existing state, truncate the current node
384 * so it only contains the prefix, add suffix(es) as children.
386 * @param ctx Context to split.
387 * @param len Lenght of ctx->s
388 * @param prefix_l Lenght of common prefix of the new regex and @a ctx->s
391 regex_split (struct RegexCombineCtx *ctx,
393 unsigned int prefix_l)
395 struct RegexCombineCtx *newctx;
399 suffix = GNUNET_malloc (len - prefix_l + 1);
401 * We can use GNUNET_strlcpy because ctx->s is null-terminated
403 GNUNET_strlcpy (suffix, &ctx->s[prefix_l], len - prefix_l + 1);
405 /* Suffix saved, truncate current node so it only contains the prefix,
406 * copy any children nodes to put as grandchildren and initialize new empty
409 ctx->s[prefix_l] = '\0';
411 /* If the suffix is an OR expression, add multiple children */
412 if ('(' == suffix[0])
414 struct RegexCombineCtx **tmp;
417 ctx->children = GNUNET_malloc (sizeof(*tmp) * ctx->size);
418 regex_add_multiple (ctx, suffix, tmp);
419 GNUNET_free (suffix);
424 /* The suffix is a normal string, add as one node */
425 newctx = new_regex_ctx (ctx->size);
427 move_children (newctx, ctx);
428 idx = c2i (suffix[0], ctx->size);
429 ctx->children[idx] = newctx;
434 * Add a single regex to a context, combining with exisiting regex by-prefix.
436 * @param ctx Context with 0 or more regexes.
437 * @param regex Regex to add.
440 regex_add (struct RegexCombineCtx *ctx, const char *regex)
442 struct RegexCombineCtx *p;
443 struct RegexCombineCtx *newctx;
445 unsigned int prefix_l;
451 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
452 "regex_add '%s' into '%s'\n",
458 /* If the regex is in the form of (a|b|c), add every character separately */
461 regex_add_multiple (ctx, regex, NULL);
465 p = get_longest_prefix (ctx, regex);
468 /* There is some prefix match, reduce regex and try again */
469 prefix_l = get_prefix_length (p->s, regex);
470 rest_s = &p->s[prefix_l];
471 rest_r = ®ex[prefix_l];
472 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l);
473 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s);
474 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r);
475 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s);
479 regex_split (p, len, prefix_l);
481 regex_add (p, rest_r);
485 /* There is no prefix match, add new */
486 idx = c2i (regex[0], ctx->size);
487 if ((NULL == ctx->children[idx]) && (NULL != ctx->s))
489 /* this was the end before, add empty string */
490 newctx = new_regex_ctx (ctx->size);
491 newctx->s = GNUNET_strdup ("");
492 ctx->children[idx] = newctx;
494 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n");
495 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex);
496 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s);
497 newctx = new_regex_ctx (ctx->size);
498 newctx->s = GNUNET_strdup (regex);
499 ctx->children[idx] = newctx;
504 * Free all resources used by the context node and all its children.
506 * @param ctx Context to free.
509 regex_ctx_destroy (struct RegexCombineCtx *ctx)
516 for (i = 0; i < ctx->size; i++)
518 regex_ctx_destroy (ctx->children[i]);
520 GNUNET_free_non_null (ctx->s); /* 's' on root node is null */
521 GNUNET_free (ctx->children);
527 * Combine an array of regexes into a single prefix-shared regex.
528 * Returns a prefix-combine regex that matches the same strings as
529 * any of the original regexes.
531 * WARNING: only useful for reading specific regexes for specific applications,
532 * namely the gnunet-regex-profiler / gnunet-regex-daemon.
533 * This function DOES NOT support arbitrary regex combining.
535 * @param regexes A NULL-terminated array of regexes.
536 * @param alphabet_size Size of the alphabet the regex uses.
538 * @return A string with a single regex that matches any of the original regexes
541 REGEX_TEST_combine (char *const regexes[], unsigned int alphabet_size)
546 struct RegexCombineCtx *ctx;
548 ctx = new_regex_ctx (alphabet_size);
549 for (i = 0; regexes[i]; i++)
551 current = regexes[i];
552 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current);
553 regex_add (ctx, current);
556 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n");
559 combined = regex_combine (ctx);
561 regex_ctx_destroy (ctx);
568 * Read a set of regexes from a file, one per line and return them in an array
569 * suitable for REGEX_TEST_combine.
570 * The array must be free'd using REGEX_TEST_free_from_file.
572 * @param filename Name of the file containing the regexes.
574 * @return A newly allocated, NULL terminated array of regexes.
577 REGEX_TEST_read_from_file (const char *filename)
579 struct GNUNET_DISK_FileHandle *f;
588 f = GNUNET_DISK_file_open (filename,
589 GNUNET_DISK_OPEN_READ,
590 GNUNET_DISK_PERM_NONE);
593 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
594 "Can't open file %s for reading\n", filename);
597 if (GNUNET_OK != GNUNET_DISK_file_handle_size (f, &size))
599 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
600 "Can't get size of file %s\n", filename);
601 GNUNET_DISK_file_close (f);
604 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
605 "using file %s, size %llu\n",
606 filename, (unsigned long long) size);
608 buffer = GNUNET_malloc (size + 1);
609 GNUNET_DISK_file_read (f, buffer, size);
610 GNUNET_DISK_file_close (f);
611 regexes = GNUNET_malloc (sizeof(char *));
618 regex = GNUNET_malloc (size + 1);
619 len = (size_t) sscanf (&buffer[offset], "%s", regex);
622 len = strlen (regex);
627 regex = GNUNET_realloc (regex, len + 1);
628 GNUNET_array_grow (regexes, nr, nr + 1);
629 GNUNET_assert (NULL == regexes[nr - 2]);
630 regexes[nr - 2] = regex;
631 regexes[nr - 1] = NULL;
634 while (offset < size);
635 GNUNET_free_non_null (regex);
636 GNUNET_free (buffer);
643 * Free all memory reserved for a set of regexes created by read_from_file.
645 * @param regexes NULL-terminated array of regexes.
648 REGEX_TEST_free_from_file (char **regexes)
652 for (i = 0; regexes[i]; i++)
653 GNUNET_free (regexes[i]);
654 GNUNET_free (regexes);
658 /* end of regex_test_lib.c */