2 * This file is part of GNUnet
3 * Copyright (C) 2012-2017 GNUnet e.V.
5 * GNUnet is free software: you can redistribute it and/or modify it
6 * under the terms of the GNU Affero General Public License as published
7 * by the Free Software Foundation, either version 3 of the License,
8 * or (at your option) any later version.
10 * GNUnet is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Affero General Public License for more details.
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 * @file src/regex/regex_test_lib.c
20 * @brief library to read regexes representing IP networks from a file
21 * and simplify them into one big regex, in order to run
22 * tests (regex performance, cadet profiler).
23 * @author Bartlomiej Polot
27 #include "gnunet_util_lib.h"
31 * Struct to hold the tree formed by prefix-combining the regexes.
33 struct RegexCombineCtx
36 * Child nodes with same prefix and token.
38 struct RegexCombineCtx **children;
41 * Alphabet size (how many @a children there are)
55 * Convert a character into its int value depending on the base used
58 * @param size base (2, 8 or 16(hex))
60 * @return Int in range [0, (base-1)]
63 c2i (char c, int size)
72 if (c >= '0' && c <= '9')
74 else if (c >= 'A' && c <= 'F')
76 else if (c >= 'a' && c <= 'f')
80 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
81 "Cannot convert char %c in base %u\n",
93 * Print "| " indentation markers to stderr to indent the regex tree
95 * @param n Indentation level
100 for (int i = 0; i < n; i++)
101 fprintf (stderr, "| ");
106 * Print the combined regex ctx to stderr.
108 * @param ctx The ctx to print
109 * @param level Indentation level to start with
112 debugctx (struct RegexCombineCtx *ctx, int level)
118 fprintf (stderr, "%u:'%s'\n", c2i(ctx->s[0], ctx->size), ctx->s);
121 fprintf (stderr, "ROOT (base %u)\n", ctx->size);
122 for (unsigned int i = 0; i < ctx->size; i++)
124 if (NULL != ctx->children[i])
127 debugctx (ctx->children[i], level + 1);
136 * Add a single regex to a context, combining with existing regexes by prefix.
138 * @param ctx Context with 0 or more regexes.
139 * @param regex Regex to add.
142 regex_add (struct RegexCombineCtx *ctx,
147 * Create and initialize a new RegexCombineCtx.
149 * @param alphabet_size Size of the alphabet (and the Trie array)
151 static struct RegexCombineCtx *
152 new_regex_ctx (unsigned int alphabet_size)
154 struct RegexCombineCtx *ctx;
157 array_size = sizeof(struct RegexCombineCtx *) * alphabet_size;
158 ctx = GNUNET_new (struct RegexCombineCtx);
159 ctx->children = GNUNET_malloc (array_size);
160 ctx->size = alphabet_size;
167 move_children (struct RegexCombineCtx *dst,
168 const struct RegexCombineCtx *src)
172 array_size = sizeof(struct RegexCombineCtx *) * src->size;
173 GNUNET_memcpy (dst->children,
176 for (unsigned int i = 0; i < src->size; i++)
178 src->children[i] = NULL;
184 * Extract a string from all prefix-combined regexes.
186 * @param ctx Context with 0 or more regexes.
188 * @return Regex that matches any of the added regexes.
191 regex_combine (struct RegexCombineCtx *ctx)
193 struct RegexCombineCtx *p;
201 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "new combine %s\n", ctx->s);
202 regex = GNUNET_strdup ("");
204 for (i = 0; i < ctx->size; i++)
206 p = ctx->children[i];
209 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
210 "adding '%s' to innner %s\n",
212 s = regex_combine (p);
213 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s);
220 GNUNET_asprintf (&tmp, "%s%s|", regex, s);
221 GNUNET_free_non_null (regex);
224 GNUNET_free_non_null (s);
225 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " so far '%s' for inner %s\n", regex, ctx->s);
228 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "opt: %d, innner: '%s'\n", opt, regex);
229 len = strlen (regex);
232 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "empty, returning ''\n");
234 return NULL == ctx->s ? NULL : GNUNET_strdup (ctx->s);
237 if ('|' == regex[len - 1])
238 regex[len - 1] = '\0';
243 GNUNET_asprintf (&s, "%s(%s)?", ctx->s, regex);
245 GNUNET_asprintf (&s, "%s(%s)", ctx->s, regex);
250 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "partial: %s\n", regex);
256 * Get the number of matching characters on the prefix of both strings.
258 * @param s1 String 1.
259 * @param s2 String 2.
261 * @return Number of characters of matching prefix.
264 get_prefix_length (const char *s1, const char *s2)
273 limit = l1 > l2 ? l2 : l1;
275 for (i = 0; i < limit; i++)
285 * Return the child context with the longest prefix match with the regex.
286 * Usually only one child will match, search all just in case.
288 * @param ctx Context whose children to search.
289 * @param regex String to match.
291 * @return Child with the longest prefix, NULL if no child matches.
293 static struct RegexCombineCtx *
294 get_longest_prefix (struct RegexCombineCtx *ctx, const char *regex)
296 struct RegexCombineCtx *p;
297 struct RegexCombineCtx *best;
305 for (i = 0; i < ctx->size; i++)
307 p = ctx->children[i];
311 l = get_prefix_length (p->s, regex);
314 GNUNET_break (0 == best_l);
323 regex_add_multiple (struct RegexCombineCtx *ctx,
325 struct RegexCombineCtx **children)
330 struct RegexCombineCtx *newctx;
338 /* Does the regex cover *all* possible children? Then don't add any,
339 * as it will be covered by the post-regex "(a-z)*"
343 for (i = 1UL; i < l; i++)
345 if (regex[i] != '|' && regex[i] != ')')
350 if (count == ctx->size)
355 /* Add every component as a child node */
357 for (i = 1UL; i < l; i++)
359 if (regex[i] != '|' && regex[i] != ')')
362 newctx = new_regex_ctx(ctx->size);
363 newctx->s = GNUNET_strdup (tmp);
364 if (children != NULL)
365 GNUNET_memcpy (newctx->children,
367 sizeof (*children) * ctx->size);
368 ctx->children[c2i(tmp[0], ctx->size)] = newctx;
374 * Add a single regex to a context, splitting the existing state.
376 * We only had a partial match, split existing state, truncate the current node
377 * so it only contains the prefix, add suffix(es) as children.
379 * @param ctx Context to split.
380 * @param len Length of ctx->s
381 * @param prefix_l Length of common prefix of the new regex and @a ctx->s
384 regex_split (struct RegexCombineCtx *ctx,
386 unsigned int prefix_l)
388 struct RegexCombineCtx *newctx;
392 suffix = GNUNET_malloc (len - prefix_l + 1);
393 strncpy (suffix, &ctx->s[prefix_l], len - prefix_l + 1);
395 /* Suffix saved, truncate current node so it only contains the prefix,
396 * copy any children nodes to put as grandchildren and initialize new empty
399 ctx->s[prefix_l] = '\0';
401 /* If the suffix is an OR expression, add multiple children */
402 if ('(' == suffix[0])
404 struct RegexCombineCtx **tmp;
407 ctx->children = GNUNET_malloc (sizeof(*tmp) * ctx->size);
408 regex_add_multiple (ctx, suffix, tmp);
409 GNUNET_free (suffix);
414 /* The suffix is a normal string, add as one node */
415 newctx = new_regex_ctx (ctx->size);
417 move_children (newctx, ctx);
418 idx = c2i(suffix[0], ctx->size);
419 ctx->children[idx] = newctx;
424 * Add a single regex to a context, combining with existing regexes by prefix.
426 * @param ctx Context with 0 or more regexes.
427 * @param regex Regex to add.
430 regex_add (struct RegexCombineCtx *ctx, const char *regex)
432 struct RegexCombineCtx *p;
433 struct RegexCombineCtx *newctx;
435 unsigned int prefix_l;
441 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
442 "regex_add '%s' into '%s'\n",
448 /* If the regex is in the form of (a|b|c), add every character separately */
451 regex_add_multiple (ctx, regex, NULL);
455 p = get_longest_prefix (ctx, regex);
458 /* There is some prefix match, reduce regex and try again */
459 prefix_l = get_prefix_length (p->s, regex);
460 rest_s = &p->s[prefix_l];
461 rest_r = ®ex[prefix_l];
462 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l);
463 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s);
464 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r);
465 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s);
469 regex_split (p, len, prefix_l);
471 regex_add (p, rest_r);
475 /* There is no prefix match, add new */
476 idx = c2i(regex[0], ctx->size);
477 if (NULL == ctx->children[idx] && NULL != ctx->s)
479 /* this was the end before, add empty string */
480 newctx = new_regex_ctx (ctx->size);
481 newctx->s = GNUNET_strdup ("");
482 ctx->children[idx] = newctx;
484 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n");
485 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex);
486 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s);
487 newctx = new_regex_ctx(ctx->size);
488 newctx->s = GNUNET_strdup (regex);
489 ctx->children[idx] = newctx;
494 * Free all resources used by the context node and all its children.
496 * @param ctx Context to free.
499 regex_ctx_destroy (struct RegexCombineCtx *ctx)
506 for (i = 0; i < ctx->size; i++)
508 regex_ctx_destroy (ctx->children[i]);
510 GNUNET_free_non_null (ctx->s); /* 's' on root node is null */
511 GNUNET_free (ctx->children);
517 * Combine an array of regexes into a single prefix-shared regex.
518 * Returns a prefix-combined regex that matches the same strings as
519 * any of the original regexes.
521 * WARNING: only useful for reading specific regexes for specific applications,
522 * namely the gnunet-regex-profiler / gnunet-regex-daemon.
523 * This function DOES NOT support arbitrary regex combining.
525 * @param regexes A NULL-terminated array of regexes.
526 * @param alphabet_size Size of the alphabet the regex uses.
528 * @return A string with a single regex that matches any of the original regexes
531 REGEX_TEST_combine (char * const regexes[], unsigned int alphabet_size)
536 struct RegexCombineCtx *ctx;
538 ctx = new_regex_ctx (alphabet_size);
539 for (i = 0; regexes[i]; i++)
541 current = regexes[i];
542 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current);
543 regex_add (ctx, current);
546 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n");
549 combined = regex_combine (ctx);
551 regex_ctx_destroy (ctx);
558 * Read a set of regexes from a file, one per line, and return them in an array
559 * suitable for REGEX_TEST_combine.
560 * The array must be freed using REGEX_TEST_free_from_file.
562 * @param filename Name of the file containing the regexes.
564 * @return A newly allocated, NULL terminated array of regexes.
567 REGEX_TEST_read_from_file (const char *filename)
569 struct GNUNET_DISK_FileHandle *f;
578 f = GNUNET_DISK_file_open (filename,
579 GNUNET_DISK_OPEN_READ,
580 GNUNET_DISK_PERM_NONE);
583 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
584 "Can't open file %s for reading\n", filename);
587 if (GNUNET_OK != GNUNET_DISK_file_handle_size (f, &size))
589 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
590 "Can't get size of file %s\n", filename);
591 GNUNET_DISK_file_close (f);
594 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
595 "using file %s, size %llu\n",
596 filename, (unsigned long long) size);
598 buffer = GNUNET_malloc (size + 1);
599 GNUNET_DISK_file_read (f, buffer, size);
600 GNUNET_DISK_file_close (f);
601 regexes = GNUNET_malloc (sizeof (char *));
608 regex = GNUNET_malloc (size + 1);
609 len = (size_t) sscanf (&buffer[offset], "%s", regex);
612 len = strlen (regex);
617 regex = GNUNET_realloc (regex, len + 1);
618 GNUNET_array_grow (regexes, nr, nr + 1);
619 GNUNET_assert (NULL == regexes[nr - 2]);
620 regexes[nr - 2] = regex;
621 regexes[nr - 1] = NULL;
623 } while (offset < size);
624 GNUNET_free_non_null (regex);
625 GNUNET_free (buffer);
632 * Free all memory reserved for a set of regexes created by read_from_file.
634 * @param regexes NULL-terminated array of regexes.
637 REGEX_TEST_free_from_file (char **regexes)
641 for (i = 0; regexes[i]; i++)
642 GNUNET_free (regexes[i]);
643 GNUNET_free (regexes);
646 /* end of regex_test_lib.c */