2 * This file is part of GNUnet
3 * Copyright (C) 2012-2017 GNUnet e.V.
5 * GNUnet is free software: you can redistribute it and/or modify it
6 * under the terms of the GNU Affero General Public License as published
7 * by the Free Software Foundation, either version 3 of the License,
8 * or (at your option) any later version.
10 * GNUnet is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Affero General Public License for more details.
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 SPDX-License-Identifier: AGPL3.0-or-later
21 * @file src/regex/regex_test_lib.c
22 * @brief library to read regexes representing IP networks from a file.
23 * and simplyfinying the into one big regex, in order to run
24 * tests (regex performance, cadet profiler).
25 * @author Bartlomiej Polot
29 #include "gnunet_util_lib.h"
33 * Struct to hold the tree formed by prefix-combining the regexes.
35 struct RegexCombineCtx {
37 * Child nodes with same prefix and token.
39 struct RegexCombineCtx **children;
42 * Alphabet size (how many @a children there are)
56 * Convert a character into its int value depending on the base used
59 * @param size base (2, 8 or 16(hex))
61 * @return Int in range [0, (base-1)]
74 if (c >= '0' && c <= '9')
76 else if (c >= 'A' && c <= 'F')
78 else if (c >= 'a' && c <= 'f')
82 GNUNET_log(GNUNET_ERROR_TYPE_ERROR,
83 "Cannot convert char %c in base %u\n",
96 * Printf spaces to indent the regex tree
98 * @param n Indentation level
103 for (int i = 0; i < n; i++)
104 fprintf(stderr, "| ");
109 * Printf the combined regex ctx.
111 * @param ctx The ctx to printf
112 * @param level Indentation level to start with
115 debugctx(struct RegexCombineCtx *ctx, int level)
121 fprintf(stderr, "%u:'%s'\n", c2i(ctx->s[0], ctx->size), ctx->s);
124 fprintf(stderr, "ROOT (base %u)\n", ctx->size);
125 for (unsigned int i = 0; i < ctx->size; i++)
127 if (NULL != ctx->children[i])
130 debugctx(ctx->children[i], level + 1);
139 * Add a single regex to a context, combining with exisiting regex by-prefix.
141 * @param ctx Context with 0 or more regexes.
142 * @param regex Regex to add.
145 regex_add(struct RegexCombineCtx *ctx,
150 * Create and initialize a new RegexCombineCtx.
152 * @param alphabet_size Size of the alphabet (and the Trie array)
154 static struct RegexCombineCtx *
155 new_regex_ctx(unsigned int alphabet_size)
157 struct RegexCombineCtx *ctx;
160 array_size = sizeof(struct RegexCombineCtx *) * alphabet_size;
161 ctx = GNUNET_new(struct RegexCombineCtx);
162 ctx->children = GNUNET_malloc(array_size);
163 ctx->size = alphabet_size;
170 move_children(struct RegexCombineCtx *dst,
171 const struct RegexCombineCtx *src)
175 array_size = sizeof(struct RegexCombineCtx *) * src->size;
176 GNUNET_memcpy(dst->children,
179 for (unsigned int i = 0; i < src->size; i++)
181 src->children[i] = NULL;
187 * Extract a string from all prefix-combined regexes.
189 * @param ctx Context with 0 or more regexes.
191 * @return Regex that matches any of the added regexes.
194 regex_combine(struct RegexCombineCtx *ctx)
196 struct RegexCombineCtx *p;
204 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "new combine %s\n", ctx->s);
205 regex = GNUNET_strdup("");
207 for (i = 0; i < ctx->size; i++)
209 p = ctx->children[i];
212 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG,
213 "adding '%s' to innner %s\n",
215 s = regex_combine(p);
216 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s);
223 GNUNET_asprintf(&tmp, "%s%s|", regex, s);
224 GNUNET_free_non_null(regex);
227 GNUNET_free_non_null(s);
228 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, " so far '%s' for inner %s\n", regex, ctx->s);
231 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "opt: %d, innner: '%s'\n", opt, regex);
235 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "empty, returning ''\n");
237 return NULL == ctx->s ? NULL : GNUNET_strdup(ctx->s);
240 if ('|' == regex[len - 1])
241 regex[len - 1] = '\0';
246 GNUNET_asprintf(&s, "%s(%s)?", ctx->s, regex);
248 GNUNET_asprintf(&s, "%s(%s)", ctx->s, regex);
253 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "partial: %s\n", regex);
259 * Get the number of matching characters on the prefix of both strings.
261 * @param s1 String 1.
262 * @param s2 String 2.
264 * @return Number of characters of matching prefix.
267 get_prefix_length(const char *s1, const char *s2)
276 limit = l1 > l2 ? l2 : l1;
278 for (i = 0; i < limit; i++)
288 * Return the child context with the longest prefix match with the regex.
289 * Usually only one child will match, search all just in case.
291 * @param ctx Context whose children to search.
292 * @param regex String to match.
294 * @return Child with the longest prefix, NULL if no child matches.
296 static struct RegexCombineCtx *
297 get_longest_prefix(struct RegexCombineCtx *ctx, const char *regex)
299 struct RegexCombineCtx *p;
300 struct RegexCombineCtx *best;
308 for (i = 0; i < ctx->size; i++)
310 p = ctx->children[i];
314 l = get_prefix_length(p->s, regex);
317 GNUNET_break(0 == best_l);
326 regex_add_multiple(struct RegexCombineCtx *ctx,
328 struct RegexCombineCtx **children)
333 struct RegexCombineCtx *newctx;
341 /* Does the regex cover *all* possible children? Then don't add any,
342 * as it will be covered by the post-regex "(a-z)*"
346 for (i = 1UL; i < l; i++)
348 if (regex[i] != '|' && regex[i] != ')')
353 if (count == ctx->size)
358 /* Add every component as a child node */
360 for (i = 1UL; i < l; i++)
362 if (regex[i] != '|' && regex[i] != ')')
365 newctx = new_regex_ctx(ctx->size);
366 newctx->s = GNUNET_strdup(tmp);
367 if (children != NULL)
368 GNUNET_memcpy(newctx->children,
370 sizeof(*children) * ctx->size);
371 ctx->children[c2i(tmp[0], ctx->size)] = newctx;
377 * Add a single regex to a context, splitting the exisiting state.
379 * We only had a partial match, split existing state, truncate the current node
380 * so it only contains the prefix, add suffix(es) as children.
382 * @param ctx Context to split.
383 * @param len Lenght of ctx->s
384 * @param prefix_l Lenght of common prefix of the new regex and @a ctx->s
387 regex_split(struct RegexCombineCtx *ctx,
389 unsigned int prefix_l)
391 struct RegexCombineCtx *newctx;
395 suffix = GNUNET_malloc(len - prefix_l + 1);
397 * We can use GNUNET_strlcpy because ctx->s is null-terminated
399 GNUNET_strlcpy(suffix, &ctx->s[prefix_l], len - prefix_l + 1);
401 /* Suffix saved, truncate current node so it only contains the prefix,
402 * copy any children nodes to put as grandchildren and initialize new empty
405 ctx->s[prefix_l] = '\0';
407 /* If the suffix is an OR expression, add multiple children */
408 if ('(' == suffix[0])
410 struct RegexCombineCtx **tmp;
413 ctx->children = GNUNET_malloc(sizeof(*tmp) * ctx->size);
414 regex_add_multiple(ctx, suffix, tmp);
420 /* The suffix is a normal string, add as one node */
421 newctx = new_regex_ctx(ctx->size);
423 move_children(newctx, ctx);
424 idx = c2i(suffix[0], ctx->size);
425 ctx->children[idx] = newctx;
430 * Add a single regex to a context, combining with exisiting regex by-prefix.
432 * @param ctx Context with 0 or more regexes.
433 * @param regex Regex to add.
436 regex_add(struct RegexCombineCtx *ctx, const char *regex)
438 struct RegexCombineCtx *p;
439 struct RegexCombineCtx *newctx;
441 unsigned int prefix_l;
447 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG,
448 "regex_add '%s' into '%s'\n",
454 /* If the regex is in the form of (a|b|c), add every character separately */
457 regex_add_multiple(ctx, regex, NULL);
461 p = get_longest_prefix(ctx, regex);
464 /* There is some prefix match, reduce regex and try again */
465 prefix_l = get_prefix_length(p->s, regex);
466 rest_s = &p->s[prefix_l];
467 rest_r = ®ex[prefix_l];
468 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l);
469 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s);
470 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r);
471 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s);
475 regex_split(p, len, prefix_l);
477 regex_add(p, rest_r);
481 /* There is no prefix match, add new */
482 idx = c2i(regex[0], ctx->size);
483 if (NULL == ctx->children[idx] && NULL != ctx->s)
485 /* this was the end before, add empty string */
486 newctx = new_regex_ctx(ctx->size);
487 newctx->s = GNUNET_strdup("");
488 ctx->children[idx] = newctx;
490 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, " no match\n");
491 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex);
492 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s);
493 newctx = new_regex_ctx(ctx->size);
494 newctx->s = GNUNET_strdup(regex);
495 ctx->children[idx] = newctx;
500 * Free all resources used by the context node and all its children.
502 * @param ctx Context to free.
505 regex_ctx_destroy(struct RegexCombineCtx *ctx)
512 for (i = 0; i < ctx->size; i++)
514 regex_ctx_destroy(ctx->children[i]);
516 GNUNET_free_non_null(ctx->s); /* 's' on root node is null */
517 GNUNET_free(ctx->children);
523 * Combine an array of regexes into a single prefix-shared regex.
524 * Returns a prefix-combine regex that matches the same strings as
525 * any of the original regexes.
527 * WARNING: only useful for reading specific regexes for specific applications,
528 * namely the gnunet-regex-profiler / gnunet-regex-daemon.
529 * This function DOES NOT support arbitrary regex combining.
531 * @param regexes A NULL-terminated array of regexes.
532 * @param alphabet_size Size of the alphabet the regex uses.
534 * @return A string with a single regex that matches any of the original regexes
537 REGEX_TEST_combine(char * const regexes[], unsigned int alphabet_size)
542 struct RegexCombineCtx *ctx;
544 ctx = new_regex_ctx(alphabet_size);
545 for (i = 0; regexes[i]; i++)
547 current = regexes[i];
548 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current);
549 regex_add(ctx, current);
552 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n");
555 combined = regex_combine(ctx);
557 regex_ctx_destroy(ctx);
564 * Read a set of regexes from a file, one per line and return them in an array
565 * suitable for REGEX_TEST_combine.
566 * The array must be free'd using REGEX_TEST_free_from_file.
568 * @param filename Name of the file containing the regexes.
570 * @return A newly allocated, NULL terminated array of regexes.
573 REGEX_TEST_read_from_file(const char *filename)
575 struct GNUNET_DISK_FileHandle *f;
584 f = GNUNET_DISK_file_open(filename,
585 GNUNET_DISK_OPEN_READ,
586 GNUNET_DISK_PERM_NONE);
589 GNUNET_log(GNUNET_ERROR_TYPE_ERROR,
590 "Can't open file %s for reading\n", filename);
593 if (GNUNET_OK != GNUNET_DISK_file_handle_size(f, &size))
595 GNUNET_log(GNUNET_ERROR_TYPE_ERROR,
596 "Can't get size of file %s\n", filename);
597 GNUNET_DISK_file_close(f);
600 GNUNET_log(GNUNET_ERROR_TYPE_DEBUG,
601 "using file %s, size %llu\n",
602 filename, (unsigned long long)size);
604 buffer = GNUNET_malloc(size + 1);
605 GNUNET_DISK_file_read(f, buffer, size);
606 GNUNET_DISK_file_close(f);
607 regexes = GNUNET_malloc(sizeof(char *));
614 regex = GNUNET_malloc(size + 1);
615 len = (size_t)sscanf(&buffer[offset], "%s", regex);
623 regex = GNUNET_realloc(regex, len + 1);
624 GNUNET_array_grow(regexes, nr, nr + 1);
625 GNUNET_assert(NULL == regexes[nr - 2]);
626 regexes[nr - 2] = regex;
627 regexes[nr - 1] = NULL;
630 while (offset < size);
631 GNUNET_free_non_null(regex);
639 * Free all memory reserved for a set of regexes created by read_from_file.
641 * @param regexes NULL-terminated array of regexes.
644 REGEX_TEST_free_from_file(char **regexes)
648 for (i = 0; regexes[i]; i++)
649 GNUNET_free(regexes[i]);
650 GNUNET_free(regexes);
653 /* end of regex_test_lib.c */