2 This file is part of GNUnet
3 (C) 2012 Christian Grothoff (and other contributing authors)
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
21 * @file src/regex/regex.c
22 * @brief library to create Deterministic Finite Automatons (DFAs) from regular
23 * expressions (regexes).
24 * @author Maximilian Szengel
27 #include "gnunet_container_lib.h"
28 #include "gnunet_crypto_lib.h"
29 #include "gnunet_regex_lib.h"
30 #include "regex_internal.h"
33 * Set this to GNUNET_YES to enable state naming. Used to debug NFA->DFA
34 * creation. Disabled by default for better performance.
36 #define REGEX_DEBUG_DFA GNUNET_NO
39 * Set of states using MDLL API.
41 struct GNUNET_REGEX_StateSet_MDLL
46 struct GNUNET_REGEX_State *head;
51 struct GNUNET_REGEX_State *tail;
61 * Append state to the given StateSet '
63 * @param set set to be modified
64 * @param state state to be appended
67 state_set_append (struct GNUNET_REGEX_StateSet *set,
68 struct GNUNET_REGEX_State *state)
70 if (set->off == set->size)
71 GNUNET_array_grow (set->states, set->size, set->size * 2 + 4);
72 set->states[set->off++] = state;
77 * Compare two strings for equality. If either is NULL they are not equal.
79 * @param str1 first string for comparison.
80 * @param str2 second string for comparison.
82 * @return 0 if the strings are the same or both NULL, 1 or -1 if not.
85 nullstrcmp (const char *str1, const char *str2)
87 if ((NULL == str1) != (NULL == str2))
89 if ((NULL == str1) && (NULL == str2))
92 return strcmp (str1, str2);
97 * Adds a transition from one state to another on 'label'. Does not add
101 * @param from_state starting state for the transition
102 * @param label transition label
103 * @param to_state state to where the transition should point to
106 state_add_transition (struct GNUNET_REGEX_Context *ctx,
107 struct GNUNET_REGEX_State *from_state, const char *label,
108 struct GNUNET_REGEX_State *to_state)
110 struct GNUNET_REGEX_Transition *t;
111 struct GNUNET_REGEX_Transition *oth;
113 if (NULL == from_state)
115 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not create Transition.\n");
119 /* Do not add duplicate state transitions */
120 for (t = from_state->transitions_head; NULL != t; t = t->next)
122 if (t->to_state == to_state && 0 == nullstrcmp (t->label, label) &&
123 t->from_state == from_state)
127 /* sort transitions by label */
128 for (oth = from_state->transitions_head; NULL != oth; oth = oth->next)
130 if (0 < nullstrcmp (oth->label, label))
134 t = GNUNET_malloc (sizeof (struct GNUNET_REGEX_Transition));
136 t->id = ctx->transition_id++;
138 t->label = GNUNET_strdup (label);
141 t->to_state = to_state;
142 t->from_state = from_state;
144 /* Add outgoing transition to 'from_state' */
145 from_state->transition_count++;
146 GNUNET_CONTAINER_DLL_insert_before (from_state->transitions_head,
147 from_state->transitions_tail, oth, t);
152 * Remove a 'transition' from 'state'.
154 * @param state state from which the to-be-removed transition originates.
155 * @param transition transition that should be removed from state 'state'.
158 state_remove_transition (struct GNUNET_REGEX_State *state,
159 struct GNUNET_REGEX_Transition *transition)
161 if (NULL == state || NULL == transition)
164 if (transition->from_state != state)
167 GNUNET_free_non_null (transition->label);
169 state->transition_count--;
170 GNUNET_CONTAINER_DLL_remove (state->transitions_head, state->transitions_tail,
173 GNUNET_free (transition);
178 * Compare two states. Used for sorting.
180 * @param a first state
181 * @param b second state
183 * @return an integer less than, equal to, or greater than zero
184 * if the first argument is considered to be respectively
185 * less than, equal to, or greater than the second.
188 state_compare (const void *a, const void *b)
190 struct GNUNET_REGEX_State **s1 = (struct GNUNET_REGEX_State **) a;
191 struct GNUNET_REGEX_State **s2 = (struct GNUNET_REGEX_State **) b;
193 return (*s1)->id - (*s2)->id;
198 * Get all edges leaving state 's'.
201 * @param edges all edges leaving 's', expected to be allocated and have enough
202 * space for s->transitions_count elements.
204 * @return number of edges.
207 state_get_edges (struct GNUNET_REGEX_State *s, struct GNUNET_REGEX_Edge *edges)
209 struct GNUNET_REGEX_Transition *t;
217 for (t = s->transitions_head; NULL != t; t = t->next)
219 if (NULL != t->to_state)
221 edges[count].label = t->label;
222 edges[count].destination = t->to_state->hash;
231 * Compare to state sets by comparing the id's of the states that are contained
232 * in each set. Both sets are expected to be sorted by id!
234 * @param sset1 first state set
235 * @param sset2 second state set
236 * @return 0 if the sets are equal, otherwise non-zero
239 state_set_compare (struct GNUNET_REGEX_StateSet *sset1,
240 struct GNUNET_REGEX_StateSet *sset2)
245 if (NULL == sset1 || NULL == sset2)
248 result = sset1->off - sset2->off;
253 for (i = 0; i < sset1->off; i++)
254 if (0 != (result = state_compare (&sset1->states[i], &sset2->states[i])))
261 * Clears the given StateSet 'set'
263 * @param set set to be cleared
266 state_set_clear (struct GNUNET_REGEX_StateSet *set)
268 GNUNET_array_grow (set->states, set->size, 0);
274 * Clears an automaton fragment. Does not destroy the states inside the
277 * @param a automaton to be cleared
280 automaton_fragment_clear (struct GNUNET_REGEX_Automaton *a)
287 a->states_head = NULL;
288 a->states_tail = NULL;
295 * Frees the memory used by State 's'
297 * @param s state that should be destroyed
300 automaton_destroy_state (struct GNUNET_REGEX_State *s)
302 struct GNUNET_REGEX_Transition *t;
303 struct GNUNET_REGEX_Transition *next_t;
308 GNUNET_free_non_null (s->name);
309 GNUNET_free_non_null (s->proof);
310 state_set_clear (&s->nfa_set);
311 for (t = s->transitions_head; NULL != t; t = next_t)
314 state_remove_transition (s, t);
322 * Remove a state from the given automaton 'a'. Always use this function when
323 * altering the states of an automaton. Will also remove all transitions leading
324 * to this state, before destroying it.
327 * @param s state to remove
330 automaton_remove_state (struct GNUNET_REGEX_Automaton *a,
331 struct GNUNET_REGEX_State *s)
333 struct GNUNET_REGEX_State *s_check;
334 struct GNUNET_REGEX_Transition *t_check;
335 struct GNUNET_REGEX_Transition *t_check_next;
337 if (NULL == a || NULL == s)
340 /* remove all transitions leading to this state */
341 for (s_check = a->states_head; NULL != s_check; s_check = s_check->next)
343 for (t_check = s_check->transitions_head; NULL != t_check;
344 t_check = t_check_next)
346 t_check_next = t_check->next;
347 if (t_check->to_state == s)
348 state_remove_transition (s_check, t_check);
353 GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s);
356 automaton_destroy_state (s);
361 * Merge two states into one. Will merge 's1' and 's2' into 's1' and destroy
362 * 's2'. 's1' will contain all (non-duplicate) outgoing transitions of 's2'.
366 * @param s1 first state
367 * @param s2 second state, will be destroyed
370 automaton_merge_states (struct GNUNET_REGEX_Context *ctx,
371 struct GNUNET_REGEX_Automaton *a,
372 struct GNUNET_REGEX_State *s1,
373 struct GNUNET_REGEX_State *s2)
375 struct GNUNET_REGEX_State *s_check;
376 struct GNUNET_REGEX_Transition *t_check;
377 struct GNUNET_REGEX_Transition *t;
378 struct GNUNET_REGEX_Transition *t_next;
384 /* 1. Make all transitions pointing to s2 point to s1, unless this transition
385 * does not already exists, if it already exists remove transition. */
386 for (s_check = a->states_head; NULL != s_check; s_check = s_check->next)
388 for (t_check = s_check->transitions_head; NULL != t_check; t_check = t_next)
390 t_next = t_check->next;
392 if (s2 == t_check->to_state)
395 for (t = t_check->from_state->transitions_head; NULL != t; t = t->next)
397 if (t->to_state == s1 && 0 == strcmp (t_check->label, t->label))
400 if (GNUNET_NO == is_dup)
401 t_check->to_state = s1;
403 state_remove_transition (t_check->from_state, t_check);
408 /* 2. Add all transitions from s2 to sX to s1 */
409 for (t_check = s2->transitions_head; NULL != t_check; t_check = t_check->next)
411 if (t_check->to_state != s1)
412 state_add_transition (ctx, s1, t_check->label, t_check->to_state);
415 /* 3. Rename s1 to {s1,s2} */
420 GNUNET_asprintf (&s1->name, "{%s,%s}", new_name, s2->name);
421 GNUNET_free (new_name);
425 GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s2);
427 automaton_destroy_state (s2);
432 * Add a state to the automaton 'a', always use this function to alter the
433 * states DLL of the automaton.
435 * @param a automaton to add the state to
436 * @param s state that should be added
439 automaton_add_state (struct GNUNET_REGEX_Automaton *a,
440 struct GNUNET_REGEX_State *s)
442 GNUNET_CONTAINER_DLL_insert (a->states_head, a->states_tail, s);
448 * Depth-first traversal (DFS) of all states that are reachable from state
449 * 's'. Performs 'action' on each visited state.
451 * @param s start state.
452 * @param marks an array of size a->state_count to remember which state was
454 * @param count current count of the state.
455 * @param check function that is checked before advancing on each transition
457 * @param check_cls closure for check.
458 * @param action action to be performed on each state.
459 * @param action_cls closure for action.
462 automaton_state_traverse (struct GNUNET_REGEX_State *s, int *marks,
464 GNUNET_REGEX_traverse_check check, void *check_cls,
465 GNUNET_REGEX_traverse_action action, void *action_cls)
467 struct GNUNET_REGEX_Transition *t;
469 if (GNUNET_YES == marks[s->traversal_id])
472 marks[s->traversal_id] = GNUNET_YES;
475 action (action_cls, *count, s);
479 for (t = s->transitions_head; NULL != t; t = t->next)
482 (NULL != check && GNUNET_YES == check (check_cls, s, t)))
484 automaton_state_traverse (t->to_state, marks, count, check, check_cls,
492 * Traverses the given automaton using depth-first-search (DFS) from it's start
493 * state, visiting all reachable states and calling 'action' on each one of
496 * @param a automaton to be traversed.
497 * @param start start state, pass a->start or NULL to traverse the whole automaton.
498 * @param check function that is checked before advancing on each transition
500 * @param check_cls closure for check.
501 * @param action action to be performed on each state.
502 * @param action_cls closure for action
505 GNUNET_REGEX_automaton_traverse (const struct GNUNET_REGEX_Automaton *a,
506 struct GNUNET_REGEX_State *start,
507 GNUNET_REGEX_traverse_check check,
509 GNUNET_REGEX_traverse_action action,
513 struct GNUNET_REGEX_State *s;
515 if (NULL == a || 0 == a->state_count)
518 int marks[a->state_count];
520 for (count = 0, s = a->states_head; NULL != s && count < a->state_count;
521 s = s->next, count++)
523 s->traversal_id = count;
524 marks[s->traversal_id] = GNUNET_NO;
534 automaton_state_traverse (s, marks, &count, check, check_cls, action,
540 * String container for faster string operations.
545 * Buffer holding the string (may start in the middle!);
556 * Length of the string in the buffer.
561 * Number of bytes allocated for 'sbuf'
566 * Buffer currently represents "NULL" (not the empty string!)
571 * If this entry is part of the last/current generation array,
572 * this flag is GNUNET_YES if the last and current generation are
573 * identical (and thus copying is unnecessary if the value didn't
574 * change). This is used in an optimization that improves
575 * performance by about 1% --- if we use int16_t here. With just
576 * "int" for both flags, performance drops (on my system) significantly,
577 * most likely due to increased cache misses.
585 * Compare two strings for equality. If either is NULL they are not equal.
587 * @param s1 first string for comparison.
588 * @param s2 second string for comparison.
590 * @return 0 if the strings are the same or both NULL, 1 or -1 if not.
593 sb_nullstrcmp (const struct StringBuffer *s1,
594 const struct StringBuffer *s2)
596 if ( (GNUNET_YES == s1->null_flag) &&
597 (GNUNET_YES == s2->null_flag) )
599 if ( (GNUNET_YES == s1->null_flag) ||
600 (GNUNET_YES == s2->null_flag) )
602 if (s1->slen != s2->slen)
604 return memcmp (s1->sbuf, s2->sbuf, s1->slen);
609 * Compare two strings for equality.
611 * @param s1 first string for comparison.
612 * @param s2 second string for comparison.
614 * @return 0 if the strings are the same, 1 or -1 if not.
617 sb_strcmp (const struct StringBuffer *s1,
618 const struct StringBuffer *s2)
620 if (s1->slen != s2->slen)
622 return memcmp (s1->sbuf, s2->sbuf, s1->slen);
627 * Reallocate the buffer of 'ret' to fit 'nlen' characters;
628 * move the existing string to the beginning of the new buffer.
630 * @param ret current buffer, to be updated
631 * @param nlen target length for the buffer, must be at least ret->slen
634 sb_realloc (struct StringBuffer *ret,
639 GNUNET_assert (nlen >= ret->slen);
641 ret->abuf = GNUNET_malloc (nlen);
646 ret->sbuf = ret->abuf;
647 GNUNET_free_non_null (old);
654 * @param ret where to write the result
655 * @param sarg string to append
658 sb_append (struct StringBuffer *ret,
659 const struct StringBuffer *sarg)
661 if (GNUNET_YES == ret->null_flag)
663 ret->null_flag = GNUNET_NO;
664 if (ret->blen < sarg->slen + ret->slen)
665 sb_realloc (ret, ret->blen + sarg->slen + 128);
666 memcpy (&ret->sbuf[ret->slen],
669 ret->slen += sarg->slen;
676 * @param ret where to write the result
677 * @param cstr string to append
680 sb_append_cstr (struct StringBuffer *ret,
683 size_t cstr_len = strlen (cstr);
685 if (GNUNET_YES == ret->null_flag)
687 ret->null_flag = GNUNET_NO;
688 if (ret->blen < cstr_len + ret->slen)
689 sb_realloc (ret, ret->blen + cstr_len + 128);
690 memcpy (&ret->sbuf[ret->slen],
693 ret->slen += cstr_len;
698 * Wrap a string buffer, that is, set ret to the format string
699 * which contains an "%s" which is to be replaced with the original
700 * content of 'ret'. Note that optimizing this function is not
701 * really worth it, it is rarely called.
703 * @param ret where to write the result and take the input for %.*s from
704 * @param format format string, fprintf-style, with exactly one "%.*s"
705 * @param extra_chars how long will the result be, in addition to 'sarg' length
708 sb_wrap (struct StringBuffer *ret,
714 if (GNUNET_YES == ret->null_flag)
716 ret->null_flag = GNUNET_NO;
717 temp = GNUNET_malloc (ret->slen + extra_chars + 1);
718 GNUNET_snprintf (temp,
719 ret->slen + extra_chars + 1,
723 GNUNET_free_non_null (ret->abuf);
726 ret->blen = ret->slen + extra_chars + 1;
727 ret->slen = ret->slen + extra_chars;
732 * Format a string buffer. Note that optimizing this function is not
733 * really worth it, it is rarely called.
735 * @param ret where to write the result
736 * @param format format string, fprintf-style, with exactly one "%.*s"
737 * @param extra_chars how long will the result be, in addition to 'sarg' length
738 * @param sarg string to print into the format
741 sb_printf1 (struct StringBuffer *ret,
744 const struct StringBuffer *sarg)
746 if (ret->blen < sarg->slen + extra_chars + 1)
748 sarg->slen + extra_chars + 1);
749 ret->null_flag = GNUNET_NO;
750 ret->sbuf = ret->abuf;
751 ret->slen = sarg->slen + extra_chars;
752 GNUNET_snprintf (ret->sbuf,
761 * Format a string buffer.
763 * @param ret where to write the result
764 * @param format format string, fprintf-style, with exactly two "%.*s"
765 * @param extra_chars how long will the result be, in addition to 'sarg1/2' length
766 * @param sarg1 first string to print into the format
767 * @param sarg2 second string to print into the format
770 sb_printf2 (struct StringBuffer *ret,
773 const struct StringBuffer *sarg1,
774 const struct StringBuffer *sarg2)
776 if (ret->blen < sarg1->slen + sarg2->slen + extra_chars + 1)
778 sarg1->slen + sarg2->slen + extra_chars + 1);
779 ret->null_flag = GNUNET_NO;
780 ret->slen = sarg1->slen + sarg2->slen + extra_chars;
781 ret->sbuf = ret->abuf;
782 GNUNET_snprintf (ret->sbuf,
793 * Format a string buffer. Note that optimizing this function is not
794 * really worth it, it is rarely called.
796 * @param ret where to write the result
797 * @param format format string, fprintf-style, with exactly three "%.*s"
798 * @param extra_chars how long will the result be, in addition to 'sarg1/2/3' length
799 * @param sarg1 first string to print into the format
800 * @param sarg2 second string to print into the format
801 * @param sarg3 third string to print into the format
804 sb_printf3 (struct StringBuffer *ret,
807 const struct StringBuffer *sarg1,
808 const struct StringBuffer *sarg2,
809 const struct StringBuffer *sarg3)
811 if (ret->blen < sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1)
813 sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1);
814 ret->null_flag = GNUNET_NO;
815 ret->slen = sarg1->slen + sarg2->slen + sarg3->slen + extra_chars;
816 ret->sbuf = ret->abuf;
817 GNUNET_snprintf (ret->sbuf,
830 * Free resources of the given string buffer.
832 * @param sb buffer to free (actual pointer is not freed, as they
833 * should not be individually allocated)
836 sb_free (struct StringBuffer *sb)
838 GNUNET_array_grow (sb->abuf,
843 sb->null_flag= GNUNET_YES;
848 * Copy the given string buffer from 'in' to 'out'.
850 * @param in input string
851 * @param out output string
854 sb_strdup (struct StringBuffer *out,
855 const struct StringBuffer *in)
858 out->null_flag = in->null_flag;
859 if (GNUNET_YES == out->null_flag)
861 if (out->blen < in->slen)
863 GNUNET_array_grow (out->abuf,
867 out->sbuf = out->abuf;
868 out->slen = in->slen;
869 memcpy (out->sbuf, in->sbuf, out->slen);
874 * Copy the given string buffer from 'in' to 'out'.
876 * @param cstr input string
877 * @param out output string
880 sb_strdup_cstr (struct StringBuffer *out,
885 out->null_flag = GNUNET_YES;
888 out->null_flag = GNUNET_NO;
889 out->slen = strlen (cstr);
890 if (out->blen < out->slen)
892 GNUNET_array_grow (out->abuf,
896 out->sbuf = out->abuf;
897 memcpy (out->sbuf, cstr, out->slen);
902 * Check if the given string 'str' needs parentheses around it when
903 * using it to generate a regex.
907 * @return GNUNET_YES if parentheses are needed, GNUNET_NO otherwise
910 needs_parentheses (const struct StringBuffer *str)
919 if ((GNUNET_YES == str->null_flag) || ((slen = str->slen) < 2))
924 end = str->sbuf + slen;
929 cl = memchr (pos, ')', end - pos);
935 /* while '(' before ')', count opening parens */
936 while ( (NULL != (op = memchr (pos, '(', end - pos))) &&
946 return (*pos == '\0') ? GNUNET_NO : GNUNET_YES;
951 * Remove parentheses surrounding string 'str'.
952 * Example: "(a)" becomes "a", "(a|b)|(a|c)" stays the same.
953 * You need to GNUNET_free the returned string.
955 * @param str string, modified to contain a
956 * @return string without surrounding parentheses, string 'str' if no preceding
957 * epsilon could be found, NULL if 'str' was NULL
960 remove_parentheses (struct StringBuffer *str)
973 if ( (GNUNET_YES == str->null_flag) ||
974 (1 >= (slen = str->slen)) ||
975 ('(' != str->sbuf[0]) ||
976 (')' != str->sbuf[slen - 1]) )
980 end = &sbuf[slen - 1];
981 op = memchr (pos, '(', end - pos);
982 cp = memchr (pos, ')', end - pos);
985 while ( (NULL != op) &&
990 op = memchr (pos, '(', end - pos);
992 while ( (NULL != cp) &&
997 return; /* can't strip parens */
1000 cp = memchr (pos, ')', end - pos);
1014 * Check if the string 'str' starts with an epsilon (empty string).
1015 * Example: "(|a)" is starting with an epsilon.
1017 * @param str string to test
1019 * @return 0 if str has no epsilon, 1 if str starts with '(|' and ends with ')'
1022 has_epsilon (const struct StringBuffer *str)
1025 (GNUNET_YES != str->null_flag) &&
1027 ('(' == str->sbuf[0]) &&
1028 ('|' == str->sbuf[1]) &&
1029 (')' == str->sbuf[str->slen - 1]);
1034 * Remove an epsilon from the string str. Where epsilon is an empty string
1035 * Example: str = "(|a|b|c)", result: "a|b|c"
1036 * The returned string needs to be freed.
1038 * @param str original string
1039 * @param ret where to return string without preceding epsilon, string 'str' if no preceding
1040 * epsilon could be found, NULL if 'str' was NULL
1043 remove_epsilon (const struct StringBuffer *str,
1044 struct StringBuffer *ret)
1046 if (GNUNET_YES == str->null_flag)
1048 ret->null_flag = GNUNET_YES;
1051 if ( (str->slen > 1) &&
1052 ('(' == str->sbuf[0]) &&
1053 ('|' == str->sbuf[1]) &&
1054 (')' == str->sbuf[str->slen - 1]) )
1056 /* remove epsilon */
1057 if (ret->blen < str->slen - 3)
1059 GNUNET_array_grow (ret->abuf,
1063 ret->sbuf = ret->abuf;
1064 ret->slen = str->slen - 3;
1065 memcpy (ret->sbuf, &str->sbuf[2], ret->slen);
1068 sb_strdup (ret, str);
1073 * Compare n bytes of 'str1' and 'str2'
1075 * @param str1 first string to compare
1076 * @param str2 second string for comparison
1077 * @param n number of bytes to compare
1079 * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
1082 sb_strncmp (const struct StringBuffer *str1,
1083 const struct StringBuffer *str2, size_t n)
1087 if ( (str1->slen != str2->slen) &&
1088 ( (str1->slen < n) ||
1089 (str2->slen < n) ) )
1091 max = GNUNET_MAX (str1->slen, str2->slen);
1094 return memcmp (str1->sbuf, str2->sbuf, max);
1099 * Compare n bytes of 'str1' and 'str2'
1101 * @param str1 first string to compare
1102 * @param str2 second C string for comparison
1103 * @param n number of bytes to compare (and length of str2)
1105 * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
1108 sb_strncmp_cstr (const struct StringBuffer *str1,
1109 const char *str2, size_t n)
1113 return memcmp (str1->sbuf, str2, n);
1118 * Initialize string buffer for storing strings of up to n
1121 * @param sb buffer to initialize
1122 * @param n desired target length
1125 sb_init (struct StringBuffer *sb,
1128 sb->null_flag = GNUNET_NO;
1129 sb->abuf = sb->sbuf = (0 == n) ? NULL : GNUNET_malloc (n);
1136 * Compare 'str1', starting from position 'k', with whole 'str2'
1138 * @param str1 first string to compare, starting from position 'k'
1139 * @param str2 second string for comparison
1140 * @param k starting position in 'str1'
1142 * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
1145 sb_strkcmp (const struct StringBuffer *str1,
1146 const struct StringBuffer *str2, size_t k)
1148 if ( (GNUNET_YES == str1->null_flag) ||
1149 (GNUNET_YES == str2->null_flag) ||
1151 (str1->slen - k != str2->slen) )
1153 return memcmp (&str1->sbuf[k], str2->sbuf, str2->slen);
1158 * Helper function used as 'action' in 'GNUNET_REGEX_automaton_traverse'
1159 * function to create the depth-first numbering of the states.
1161 * @param cls states array.
1162 * @param count current state counter.
1163 * @param s current state.
1166 number_states (void *cls, const unsigned int count,
1167 struct GNUNET_REGEX_State *s)
1169 struct GNUNET_REGEX_State **states = cls;
1179 ((GNUNET_YES == a.null_flag) ? 6 : (int) a.slen), \
1180 ((GNUNET_YES == a.null_flag) ? "(null)" : a.sbuf)
1184 * Construct the regular expression given the inductive step,
1185 * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^*
1186 * R^{(k-1)}_{kj}, and simplify the resulting expression saved in R_cur_ij.
1188 * @param R_last_ij value of $R^{(k-1)_{ij}.
1189 * @param R_last_ik value of $R^{(k-1)_{ik}.
1190 * @param R_last_kk value of $R^{(k-1)_{kk}.
1191 * @param R_last_kj value of $R^{(k-1)_{kj}.
1192 * @param R_cur_ij result for this inductive step is saved in R_cur_ij, R_cur_ij
1193 * is expected to be NULL when called!
1194 * @param R_cur_l optimization -- kept between iterations to avoid realloc
1195 * @param R_cur_r optimization -- kept between iterations to avoid realloc
1198 automaton_create_proofs_simplify (const struct StringBuffer *R_last_ij,
1199 const struct StringBuffer *R_last_ik,
1200 const struct StringBuffer *R_last_kk,
1201 const struct StringBuffer *R_last_kj,
1202 struct StringBuffer *R_cur_ij,
1203 struct StringBuffer *R_cur_l,
1204 struct StringBuffer *R_cur_r)
1206 struct StringBuffer R_temp_ij;
1207 struct StringBuffer R_temp_ik;
1208 struct StringBuffer R_temp_kj;
1209 struct StringBuffer R_temp_kk;
1215 int clean_ik_kk_cmp;
1216 int clean_kk_kj_cmp;
1222 * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
1223 * R_last == R^{(k-1)}, R_cur == R^{(k)}
1224 * R_cur_ij = R_cur_l | R_cur_r
1225 * R_cur_l == R^{(k-1)}_{ij}
1226 * R_cur_r == R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
1229 if ( (GNUNET_YES == R_last_ij->null_flag) &&
1230 ( (GNUNET_YES == R_last_ik->null_flag) ||
1231 (GNUNET_YES == R_last_kj->null_flag)))
1233 /* R^{(k)}_{ij} = N | N */
1234 R_cur_ij->null_flag = GNUNET_YES;
1235 R_cur_ij->synced = GNUNET_NO;
1239 if ( (GNUNET_YES == R_last_ik->null_flag) ||
1240 (GNUNET_YES == R_last_kj->null_flag) )
1242 /* R^{(k)}_{ij} = R^{(k-1)}_{ij} | N */
1243 if (GNUNET_YES == R_last_ij->synced)
1245 R_cur_ij->synced = GNUNET_YES;
1246 R_cur_ij->null_flag = GNUNET_NO;
1249 R_cur_ij->synced = GNUNET_YES;
1250 sb_strdup (R_cur_ij, R_last_ij);
1253 R_cur_ij->synced = GNUNET_NO;
1255 /* $R^{(k)}_{ij} = N | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} OR
1256 * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} */
1258 R_cur_r->null_flag = GNUNET_YES;
1260 R_cur_l->null_flag = GNUNET_YES;
1263 /* cache results from strcmp, we might need these many times */
1264 ij_kj_cmp = sb_nullstrcmp (R_last_ij, R_last_kj);
1265 ij_ik_cmp = sb_nullstrcmp (R_last_ij, R_last_ik);
1266 ik_kk_cmp = sb_nullstrcmp (R_last_ik, R_last_kk);
1267 kk_kj_cmp = sb_nullstrcmp (R_last_kk, R_last_kj);
1269 /* Assign R_temp_(ik|kk|kj) to R_last[][] and remove epsilon as well
1270 * as parentheses, so we can better compare the contents */
1272 memset (&R_temp_ij, 0, sizeof (struct StringBuffer));
1273 memset (&R_temp_ik, 0, sizeof (struct StringBuffer));
1274 memset (&R_temp_kk, 0, sizeof (struct StringBuffer));
1275 memset (&R_temp_kj, 0, sizeof (struct StringBuffer));
1276 remove_epsilon (R_last_ik, &R_temp_ik);
1277 remove_epsilon (R_last_kk, &R_temp_kk);
1278 remove_epsilon (R_last_kj, &R_temp_kj);
1279 remove_parentheses (&R_temp_ik);
1280 remove_parentheses (&R_temp_kk);
1281 remove_parentheses (&R_temp_kj);
1282 clean_ik_kk_cmp = sb_nullstrcmp (R_last_ik, &R_temp_kk);
1283 clean_kk_kj_cmp = sb_nullstrcmp (&R_temp_kk, R_last_kj);
1285 /* construct R_cur_l (and, if necessary R_cur_r) */
1286 if (GNUNET_YES != R_last_ij->null_flag)
1288 /* Assign R_temp_ij to R_last_ij and remove epsilon as well
1289 * as parentheses, so we can better compare the contents */
1290 remove_epsilon (R_last_ij, &R_temp_ij);
1291 remove_parentheses (&R_temp_ij);
1293 if ( (0 == sb_strcmp (&R_temp_ij, &R_temp_ik)) &&
1294 (0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) &&
1295 (0 == sb_strcmp (&R_temp_kk, &R_temp_kj)) )
1297 if (0 == R_temp_ij.slen)
1299 R_cur_r->null_flag = GNUNET_NO;
1301 else if ((0 == sb_strncmp_cstr (R_last_ij, "(|", 2)) ||
1302 (0 == sb_strncmp_cstr (R_last_ik, "(|", 2) &&
1303 0 == sb_strncmp_cstr (R_last_kj, "(|", 2)))
1306 * a|(e|a)a*(e|a) = a*
1307 * a|(e|a)(e|a)*(e|a) = a*
1309 * (e|a)|aa*(e|a) = a*
1310 * (e|a)|(e|a)a*a = a*
1311 * (e|a)|(e|a)a*(e|a) = a*
1312 * (e|a)|(e|a)(e|a)*(e|a) = a*
1314 if (GNUNET_YES == needs_parentheses (&R_temp_ij))
1315 sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_ij);
1317 sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_ij);
1325 * a|(e|a)(e|a)*a = a+
1326 * a|a(e|a)*(e|a) = a+
1328 if (GNUNET_YES == needs_parentheses (&R_temp_ij))
1329 sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_ij);
1331 sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_ij);
1334 else if ( (0 == ij_ik_cmp) && (0 == clean_kk_kj_cmp) && (0 != clean_ik_kk_cmp) )
1337 if (0 == R_last_kk->slen)
1338 sb_strdup (R_cur_r, R_last_ij);
1339 else if (GNUNET_YES == needs_parentheses (&R_temp_kk))
1340 sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk);
1342 sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, R_last_kk);
1343 R_cur_l->null_flag = GNUNET_YES;
1345 else if ( (0 == ij_kj_cmp) && (0 == clean_ik_kk_cmp) && (0 != clean_kk_kj_cmp))
1348 if (R_last_kk->slen < 1)
1350 sb_strdup (R_cur_r, R_last_kj);
1352 else if (GNUNET_YES == needs_parentheses (&R_temp_kk))
1353 sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj);
1355 sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj);
1357 R_cur_l->null_flag = GNUNET_YES;
1359 else if ( (0 == ij_ik_cmp) && (0 == kk_kj_cmp) && (! has_epsilon (R_last_ij)) &&
1360 has_epsilon (R_last_kk))
1362 /* a|a(e|b)*(e|b) = a|ab* = a|a|ab|abb|abbb|... = ab* */
1363 if (needs_parentheses (&R_temp_kk))
1364 sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk);
1366 sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, &R_temp_kk);
1367 R_cur_l->null_flag = GNUNET_YES;
1369 else if ( (0 == ij_kj_cmp) && (0 == ik_kk_cmp) && (! has_epsilon (R_last_ij)) &&
1370 has_epsilon (R_last_kk))
1372 /* a|(e|b)(e|b)*a = a|b*a = a|a|ba|bba|bbba|... = b*a */
1373 if (needs_parentheses (&R_temp_kk))
1374 sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_ij);
1376 sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_ij);
1377 R_cur_l->null_flag = GNUNET_YES;
1381 sb_strdup (R_cur_l, R_last_ij);
1382 remove_parentheses (R_cur_l);
1387 /* we have no left side */
1388 R_cur_l->null_flag = GNUNET_YES;
1391 /* construct R_cur_r, if not already constructed */
1392 if (GNUNET_YES == R_cur_r->null_flag)
1394 length = R_temp_kk.slen - R_last_ik->slen;
1396 /* a(ba)*bx = (ab)+x */
1397 if ( (length > 0) &&
1398 (GNUNET_YES != R_last_kk->null_flag) &&
1399 (0 < R_last_kk->slen) &&
1400 (GNUNET_YES != R_last_kj->null_flag) &&
1401 (0 < R_last_kj->slen) &&
1402 (GNUNET_YES != R_last_ik->null_flag) &&
1403 (0 < R_last_ik->slen) &&
1404 (0 == sb_strkcmp (&R_temp_kk, R_last_ik, length)) &&
1405 (0 == sb_strncmp (&R_temp_kk, R_last_kj, length)) )
1407 struct StringBuffer temp_a;
1408 struct StringBuffer temp_b;
1410 sb_init (&temp_a, length);
1411 sb_init (&temp_b, R_last_kj->slen - length);
1414 temp_a.sbuf = temp_a.abuf;
1415 memcpy (temp_a.sbuf, R_last_kj->sbuf, length_l);
1416 temp_a.slen = length_l;
1418 length_r = R_last_kj->slen - length;
1419 temp_b.sbuf = temp_b.abuf;
1420 memcpy (temp_b.sbuf, &R_last_kj->sbuf[length], length_r);
1421 temp_b.slen = length_r;
1423 /* e|(ab)+ = (ab)* */
1424 if ( (GNUNET_YES != R_cur_l->null_flag) &&
1425 (0 == R_cur_l->slen) &&
1426 (0 == temp_b.slen) )
1428 sb_printf2 (R_cur_r, "(%.*s%.*s)*", 3, R_last_ik, &temp_a);
1430 R_cur_l->null_flag = GNUNET_YES;
1434 sb_printf3 (R_cur_r, "(%.*s%.*s)+%.*s", 3, R_last_ik, &temp_a, &temp_b);
1439 else if (0 == sb_strcmp (&R_temp_ik, &R_temp_kk) &&
1440 0 == sb_strcmp (&R_temp_kk, &R_temp_kj))
1444 * (e|a)(e|a)*(e|a) = a*
1446 if (has_epsilon (R_last_ik) && has_epsilon (R_last_kj))
1448 if (needs_parentheses (&R_temp_kk))
1449 sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_kk);
1451 sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_kk);
1454 else if ( (0 == clean_ik_kk_cmp) &&
1455 (0 == clean_kk_kj_cmp) &&
1456 (! has_epsilon (R_last_ik)) )
1458 if (needs_parentheses (&R_temp_kk))
1459 sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, &R_temp_kk);
1461 sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, &R_temp_kk);
1472 (has_epsilon (R_last_ik) + has_epsilon (R_last_kk) +
1473 has_epsilon (R_last_kj));
1477 if (needs_parentheses (&R_temp_kk))
1478 sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_kk);
1480 sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_kk);
1486 * (e|a)(e|a)*b = a*b
1488 else if (0 == sb_strcmp (&R_temp_ik, &R_temp_kk))
1490 if (has_epsilon (R_last_ik))
1492 if (needs_parentheses (&R_temp_kk))
1493 sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj);
1495 sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj);
1499 if (needs_parentheses (&R_temp_kk))
1500 sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, R_last_kj);
1502 sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, R_last_kj);
1507 * b(e|a)*(e|a) = ba*
1509 else if (0 == sb_strcmp (&R_temp_kk, &R_temp_kj))
1511 if (has_epsilon (R_last_kj))
1513 if (needs_parentheses (&R_temp_kk))
1514 sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ik, &R_temp_kk);
1516 sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ik, &R_temp_kk);
1520 if (needs_parentheses (&R_temp_kk))
1521 sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, R_last_ik, &R_temp_kk);
1523 sb_printf2 (R_cur_r, "%.*s+%.*s", 1, R_last_ik, &R_temp_kk);
1528 if (0 < R_temp_kk.slen)
1530 if (needs_parentheses (&R_temp_kk))
1532 sb_printf3 (R_cur_r, "%.*s(%.*s)*%.*s", 3, R_last_ik, &R_temp_kk,
1537 sb_printf3 (R_cur_r, "%.*s%.*s*%.*s", 1, R_last_ik, &R_temp_kk,
1543 sb_printf2 (R_cur_r, "%.*s%.*s", 0, R_last_ik, R_last_kj);
1547 sb_free (&R_temp_ij);
1548 sb_free (&R_temp_ik);
1549 sb_free (&R_temp_kk);
1550 sb_free (&R_temp_kj);
1552 if ( (GNUNET_YES == R_cur_l->null_flag) &&
1553 (GNUNET_YES == R_cur_r->null_flag) )
1555 R_cur_ij->null_flag = GNUNET_YES;
1559 if ( (GNUNET_YES != R_cur_l->null_flag) &&
1560 (GNUNET_YES == R_cur_r->null_flag) )
1562 struct StringBuffer tmp;
1565 *R_cur_ij = *R_cur_l;
1570 if ( (GNUNET_YES == R_cur_l->null_flag) &&
1571 (GNUNET_YES != R_cur_r->null_flag) )
1573 struct StringBuffer tmp;
1576 *R_cur_ij = *R_cur_r;
1581 if (0 == sb_nullstrcmp (R_cur_l, R_cur_r))
1583 struct StringBuffer tmp;
1586 *R_cur_ij = *R_cur_l;
1590 sb_printf2 (R_cur_ij, "(%.*s|%.*s)", 3, R_cur_l, R_cur_r);
1595 * Create proofs for all states in the given automaton. Implementation of the
1596 * algorithm descriped in chapter 3.2.1 of "Automata Theory, Languages, and
1597 * Computation 3rd Edition" by Hopcroft, Motwani and Ullman.
1599 * Each state in the automaton gets assigned 'proof' and 'hash' (hash of the
1600 * proof) fields. The starting state will only have a valid proof/hash if it has
1601 * any incoming transitions.
1603 * @param a automaton for which to assign proofs and hashes, must not be NULL
1606 automaton_create_proofs (struct GNUNET_REGEX_Automaton *a)
1608 unsigned int n = a->state_count;
1609 struct GNUNET_REGEX_State *states[n];
1610 struct StringBuffer *R_last;
1611 struct StringBuffer *R_cur;
1612 struct StringBuffer R_cur_r;
1613 struct StringBuffer R_cur_l;
1614 struct StringBuffer *R_swap;
1615 struct GNUNET_REGEX_Transition *t;
1616 struct StringBuffer complete_regex;
1621 R_last = GNUNET_malloc_large (sizeof (struct StringBuffer) * n * n);
1622 R_cur = GNUNET_malloc_large (sizeof (struct StringBuffer) * n * n);
1623 if ( (NULL == R_last) ||
1626 GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc");
1627 GNUNET_free_non_null (R_cur);
1628 GNUNET_free_non_null (R_last);
1629 return GNUNET_SYSERR;
1632 /* create depth-first numbering of the states, initializes 'state' */
1633 GNUNET_REGEX_automaton_traverse (a, a->start, NULL, NULL, &number_states,
1636 for (i = 0; i < n; i++)
1637 GNUNET_assert (NULL != states[i]);
1638 for (i = 0; i < n; i++)
1639 for (j = 0; j < n; j++)
1640 R_last[i *n + j].null_flag = GNUNET_YES;
1642 /* Compute regular expressions of length "1" between each pair of states */
1643 for (i = 0; i < n; i++)
1645 for (t = states[i]->transitions_head; NULL != t; t = t->next)
1647 j = t->to_state->dfs_id;
1648 if (GNUNET_YES == R_last[i * n + j].null_flag)
1650 sb_strdup_cstr (&R_last[i * n + j], t->label);
1654 sb_append_cstr (&R_last[i * n + j], "|");
1655 sb_append_cstr (&R_last[i * n + j], t->label);
1658 /* add self-loop: i is reachable from i via epsilon-transition */
1659 if (GNUNET_YES == R_last[i * n + i].null_flag)
1661 R_last[i * n + i].slen = 0;
1662 R_last[i * n + i].null_flag = GNUNET_NO;
1666 sb_wrap (&R_last[i * n + i], "(|%.*s)", 3);
1669 for (i = 0; i < n; i++)
1670 for (j = 0; j < n; j++)
1671 if (needs_parentheses (&R_last[i * n + j]))
1672 sb_wrap (&R_last[i * n + j], "(%.*s)", 2);
1673 /* Compute regular expressions of length "k" between each pair of states per
1675 memset (&R_cur_l, 0, sizeof (struct StringBuffer));
1676 memset (&R_cur_r, 0, sizeof (struct StringBuffer));
1677 for (k = 0; k < n; k++)
1679 for (i = 0; i < n; i++)
1681 for (j = 0; j < n; j++)
1683 /* Basis for the recursion:
1684 * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
1685 * R_last == R^{(k-1)}, R_cur == R^{(k)}
1688 /* Create R_cur[i][j] and simplify the expression */
1689 automaton_create_proofs_simplify (&R_last[i * n + j], &R_last[i * n + k],
1690 &R_last[k * n + k], &R_last[k * n + j],
1692 &R_cur_l, &R_cur_r);
1695 /* set R_last = R_cur */
1699 /* clear 'R_cur' for next iteration */
1700 for (i = 0; i < n; i++)
1701 for (j = 0; j < n; j++)
1702 R_cur[i * n + j].null_flag = GNUNET_YES;
1706 /* assign proofs and hashes */
1707 for (i = 0; i < n; i++)
1709 if (GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag)
1711 states[i]->proof = GNUNET_strndup (R_last[a->start->dfs_id * n + i].sbuf,
1712 R_last[a->start->dfs_id * n + i].slen);
1713 GNUNET_CRYPTO_hash (states[i]->proof, strlen (states[i]->proof),
1718 /* complete regex for whole DFA: union of all pairs (start state/accepting
1720 sb_init (&complete_regex, 16 * n);
1721 for (i = 0; i < n; i++)
1723 if (states[i]->accepting)
1725 if ( (0 == complete_regex.slen) &&
1726 (0 < R_last[a->start->dfs_id * n + i].slen) )
1728 sb_append (&complete_regex,
1729 &R_last[a->start->dfs_id * n + i]);
1731 else if ( (GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag) &&
1732 (0 < R_last[a->start->dfs_id * n + i].slen) )
1734 sb_append_cstr (&complete_regex, "|");
1735 sb_append (&complete_regex,
1736 &R_last[a->start->dfs_id * n + i]);
1740 a->canonical_regex = GNUNET_strndup (complete_regex.sbuf, complete_regex.slen);
1743 sb_free (&complete_regex);
1744 for (i = 0; i < n; i++)
1745 for (j = 0; j < n; j++)
1747 sb_free (&R_cur[i * n + j]);
1748 sb_free (&R_last[i * n + j]);
1750 GNUNET_free (R_cur);
1751 GNUNET_free (R_last);
1757 * Creates a new DFA state based on a set of NFA states. Needs to be freed using
1758 * automaton_destroy_state.
1760 * @param ctx context
1761 * @param nfa_states set of NFA states on which the DFA should be based on
1763 * @return new DFA state
1765 static struct GNUNET_REGEX_State *
1766 dfa_state_create (struct GNUNET_REGEX_Context *ctx,
1767 struct GNUNET_REGEX_StateSet *nfa_states)
1769 struct GNUNET_REGEX_State *s;
1772 struct GNUNET_REGEX_State *cstate;
1773 struct GNUNET_REGEX_Transition *ctran;
1776 s = GNUNET_malloc (sizeof (struct GNUNET_REGEX_State));
1777 s->id = ctx->state_id++;
1781 if (NULL == nfa_states)
1783 GNUNET_asprintf (&s->name, "s%i", s->id);
1787 s->nfa_set = *nfa_states;
1789 if (nfa_states->off < 1)
1792 /* Create a name based on 'nfa_states' */
1793 len = nfa_states->off * 14 + 4;
1794 s->name = GNUNET_malloc (len);
1795 strcat (s->name, "{");
1798 for (i = 0; i < nfa_states->off; i++)
1800 cstate = nfa_states->states[i];
1801 GNUNET_snprintf (pos, pos - s->name + len,
1803 pos += strlen (pos);
1805 /* Add a transition for each distinct label to NULL state */
1806 for (ctran = cstate->transitions_head; NULL != ctran; ctran = ctran->next)
1807 if (NULL != ctran->label)
1808 state_add_transition (ctx, s, ctran->label, NULL);
1810 /* If the nfa_states contain an accepting state, the new dfa state is also
1812 if (cstate->accepting)
1816 s->name = GNUNET_realloc (s->name, strlen (s->name) + 1);
1818 memset (nfa_states, 0, sizeof (struct GNUNET_REGEX_StateSet));
1824 * Move from the given state 's' to the next state on transition 'str'. Consumes
1825 * as much of the given 'str' as possible (usefull for strided DFAs). On return
1826 * 's' will point to the next state, and the length of the substring used for
1827 * this transition will be returned. If no transition possible 0 is returned and
1828 * 's' points to NULL.
1830 * @param s starting state, will point to the next state or NULL (if no
1831 * transition possible)
1832 * @param str edge label to follow (will match longest common prefix)
1834 * @return length of the substring comsumed from 'str'
1837 dfa_move (struct GNUNET_REGEX_State **s, const char *str)
1839 struct GNUNET_REGEX_Transition *t;
1840 struct GNUNET_REGEX_State *new_s;
1842 unsigned int max_len;
1849 for (t = (*s)->transitions_head; NULL != t; t = t->next)
1851 len = strlen (t->label);
1853 if (0 == strncmp (t->label, str, len))
1858 new_s = t->to_state;
1869 * Set the given state 'marked' to GNUNET_YES. Used by the
1870 * 'dfa_remove_unreachable_states' function to detect unreachable states in the
1873 * @param cls closure, not used.
1874 * @param count count, not used.
1875 * @param s state where the marked attribute will be set to GNUNET_YES.
1878 mark_states (void *cls, const unsigned int count, struct GNUNET_REGEX_State *s)
1880 s->marked = GNUNET_YES;
1885 * Remove all unreachable states from DFA 'a'. Unreachable states are those
1886 * states that are not reachable from the starting state.
1888 * @param a DFA automaton
1891 dfa_remove_unreachable_states (struct GNUNET_REGEX_Automaton *a)
1893 struct GNUNET_REGEX_State *s;
1894 struct GNUNET_REGEX_State *s_next;
1896 /* 1. unmark all states */
1897 for (s = a->states_head; NULL != s; s = s->next)
1898 s->marked = GNUNET_NO;
1900 /* 2. traverse dfa from start state and mark all visited states */
1901 GNUNET_REGEX_automaton_traverse (a, a->start, NULL, NULL, &mark_states, NULL);
1903 /* 3. delete all states that were not visited */
1904 for (s = a->states_head; NULL != s; s = s_next)
1907 if (GNUNET_NO == s->marked)
1908 automaton_remove_state (a, s);
1914 * Remove all dead states from the DFA 'a'. Dead states are those states that do
1915 * not transition to any other state but themselves.
1917 * @param a DFA automaton
1920 dfa_remove_dead_states (struct GNUNET_REGEX_Automaton *a)
1922 struct GNUNET_REGEX_State *s;
1923 struct GNUNET_REGEX_State *s_next;
1924 struct GNUNET_REGEX_Transition *t;
1927 GNUNET_assert (DFA == a->type);
1929 for (s = a->states_head; NULL != s; s = s_next)
1937 for (t = s->transitions_head; NULL != t; t = t->next)
1939 if (NULL != t->to_state && t->to_state != s)
1949 /* state s is dead, remove it */
1950 automaton_remove_state (a, s);
1956 * Merge all non distinguishable states in the DFA 'a'
1958 * @param ctx context
1959 * @param a DFA automaton
1960 * @return GNUNET_OK on success
1963 dfa_merge_nondistinguishable_states (struct GNUNET_REGEX_Context *ctx,
1964 struct GNUNET_REGEX_Automaton *a)
1967 struct GNUNET_REGEX_State *s1;
1968 struct GNUNET_REGEX_State *s2;
1969 struct GNUNET_REGEX_Transition *t1;
1970 struct GNUNET_REGEX_Transition *t2;
1971 struct GNUNET_REGEX_State *s1_next;
1972 struct GNUNET_REGEX_State *s2_next;
1974 unsigned int num_equal_edges;
1976 unsigned int state_cnt;
1977 unsigned long long idx;
1978 unsigned long long idx1;
1980 if ( (NULL == a) || (0 == a->state_count) )
1982 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
1983 "Could not merge nondistinguishable states, automaton was NULL.\n");
1984 return GNUNET_SYSERR;
1987 state_cnt = a->state_count;
1988 table = GNUNET_malloc_large ((sizeof (uint32_t) * state_cnt * state_cnt / 32) + sizeof (uint32_t));
1991 GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc");
1992 return GNUNET_SYSERR;
1995 for (i = 0, s1 = a->states_head; NULL != s1; s1 = s1->next)
1998 /* Mark all pairs of accepting/!accepting states */
1999 for (s1 = a->states_head; NULL != s1; s1 = s1->next)
2000 for (s2 = a->states_head; NULL != s2; s2 = s2->next)
2001 if ( (s1->accepting && !s2->accepting) ||
2002 (!s1->accepting && s2->accepting) )
2004 idx = s1->marked * state_cnt + s2->marked;
2005 table[idx / 32] |= (1 << (idx % 32));
2008 /* Find all equal states */
2013 for (s1 = a->states_head; NULL != s1; s1 = s1->next)
2015 for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2->next)
2017 idx = s1->marked * state_cnt + s2->marked;
2018 if (0 != (table[idx / 32] & (1 << (idx % 32))))
2020 num_equal_edges = 0;
2021 for (t1 = s1->transitions_head; NULL != t1; t1 = t1->next)
2023 for (t2 = s2->transitions_head; NULL != t2; t2 = t2->next)
2025 if (0 == strcmp (t1->label, t2->label))
2028 /* same edge, but targets definitively different, so we're different
2030 if (t1->to_state->marked > t2->to_state->marked)
2031 idx1 = t1->to_state->marked * state_cnt + t2->to_state->marked;
2033 idx1 = t2->to_state->marked * state_cnt + t1->to_state->marked;
2034 if (0 != (table[idx1 / 32] & (1 << (idx1 % 32))))
2036 table[idx / 32] |= (1 << (idx % 32));
2037 change = 1; /* changed a marker, need to run again */
2042 if ( (num_equal_edges != s1->transition_count) ||
2043 (num_equal_edges != s2->transition_count) )
2045 /* Make sure ALL edges of possible equal states are the same */
2046 table[idx / 32] |= (1 << (idx % 32));
2047 change = 1; /* changed a marker, need to run again */
2053 /* Merge states that are equal */
2054 for (s1 = a->states_head; NULL != s1; s1 = s1_next)
2057 for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2_next)
2060 idx = s1->marked * state_cnt + s2->marked;
2061 if (0 == (table[idx / 32] & (1 << (idx % 32))))
2062 automaton_merge_states (ctx, a, s1, s2);
2066 GNUNET_free (table);
2072 * Minimize the given DFA 'a' by removing all unreachable states, removing all
2073 * dead states and merging all non distinguishable states
2075 * @param ctx context
2076 * @param a DFA automaton
2077 * @return GNUNET_OK on success
2080 dfa_minimize (struct GNUNET_REGEX_Context *ctx,
2081 struct GNUNET_REGEX_Automaton *a)
2084 return GNUNET_SYSERR;
2086 GNUNET_assert (DFA == a->type);
2088 /* 1. remove unreachable states */
2089 dfa_remove_unreachable_states (a);
2091 /* 2. remove dead states */
2092 dfa_remove_dead_states (a);
2094 /* 3. Merge nondistinguishable states */
2095 if (GNUNET_OK != dfa_merge_nondistinguishable_states (ctx, a))
2096 return GNUNET_SYSERR;
2102 * Context for adding strided transitions to a DFA.
2104 struct GNUNET_REGEX_Strided_Context
2107 * Length of the strides.
2109 const unsigned int stride;
2112 * Strided transitions DLL. New strided transitions will be stored in this DLL
2113 * and afterwards added to the DFA.
2115 struct GNUNET_REGEX_Transition *transitions_head;
2118 * Strided transitions DLL.
2120 struct GNUNET_REGEX_Transition *transitions_tail;
2125 * Recursive helper function to add strides to a DFA.
2127 * @param cls context, contains stride length and strided transitions DLL.
2128 * @param depth current depth of the depth-first traversal of the graph.
2129 * @param label current label, string that contains all labels on the path from
2131 * @param start start state for the depth-first traversal of the graph.
2132 * @param s current state in the depth-first traversal
2135 dfa_add_multi_strides_helper (void *cls, const unsigned int depth, char *label,
2136 struct GNUNET_REGEX_State *start,
2137 struct GNUNET_REGEX_State *s)
2139 struct GNUNET_REGEX_Strided_Context *ctx = cls;
2140 struct GNUNET_REGEX_Transition *t;
2143 if (depth == ctx->stride)
2145 t = GNUNET_malloc (sizeof (struct GNUNET_REGEX_Transition));
2146 t->label = GNUNET_strdup (label);
2148 t->from_state = start;
2149 GNUNET_CONTAINER_DLL_insert (ctx->transitions_head, ctx->transitions_tail,
2154 for (t = s->transitions_head; NULL != t; t = t->next)
2156 /* Do not consider self-loops, because it end's up in too many
2158 if (t->to_state == t->from_state)
2163 GNUNET_asprintf (&new_label, "%s%s", label, t->label);
2166 new_label = GNUNET_strdup (t->label);
2168 dfa_add_multi_strides_helper (cls, (depth + 1), new_label, start,
2172 GNUNET_free_non_null (label);
2177 * Function called for each state in the DFA. Starts a traversal of depth set in
2178 * context starting from state 's'.
2180 * @param cls context.
2181 * @param count not used.
2182 * @param s current state.
2185 dfa_add_multi_strides (void *cls, const unsigned int count,
2186 struct GNUNET_REGEX_State *s)
2188 dfa_add_multi_strides_helper (cls, 0, NULL, s, s);
2193 * Adds multi-strided transitions to the given 'dfa'.
2195 * @param regex_ctx regex context needed to add transitions to the automaton.
2196 * @param dfa DFA to which the multi strided transitions should be added.
2197 * @param stride_len length of the strides.
2200 GNUNET_REGEX_dfa_add_multi_strides (struct GNUNET_REGEX_Context *regex_ctx,
2201 struct GNUNET_REGEX_Automaton *dfa,
2202 const unsigned int stride_len)
2204 struct GNUNET_REGEX_Strided_Context ctx = { stride_len, NULL, NULL };
2205 struct GNUNET_REGEX_Transition *t;
2206 struct GNUNET_REGEX_Transition *t_next;
2208 if (1 > stride_len || GNUNET_YES == dfa->is_multistrided)
2211 /* Compute the new transitions of given stride_len */
2212 GNUNET_REGEX_automaton_traverse (dfa, dfa->start, NULL, NULL,
2213 &dfa_add_multi_strides, &ctx);
2215 /* Add all the new transitions to the automaton. */
2216 for (t = ctx.transitions_head; NULL != t; t = t_next)
2219 state_add_transition (regex_ctx, t->from_state, t->label, t->to_state);
2220 GNUNET_CONTAINER_DLL_remove (ctx.transitions_head, ctx.transitions_tail, t);
2221 GNUNET_free_non_null (t->label);
2225 /* Mark this automaton as multistrided */
2226 dfa->is_multistrided = GNUNET_YES;
2230 * Recursive Helper function for DFA path compression. Does DFS on the DFA graph
2231 * and adds new transitions to the given transitions DLL and marks states that
2232 * should be removed by setting state->contained to GNUNET_YES.
2234 * @param dfa DFA for which the paths should be compressed.
2235 * @param start starting state for linear path search.
2236 * @param cur current state in the recursive DFS.
2237 * @param label current label (string of traversed labels).
2238 * @param max_len maximal path compression length.
2239 * @param transitions_head transitions DLL.
2240 * @param transitions_tail transitions DLL.
2243 dfa_compress_paths_helper (struct GNUNET_REGEX_Automaton *dfa,
2244 struct GNUNET_REGEX_State *start,
2245 struct GNUNET_REGEX_State *cur, char *label,
2246 unsigned int max_len,
2247 struct GNUNET_REGEX_Transition **transitions_head,
2248 struct GNUNET_REGEX_Transition **transitions_tail)
2250 struct GNUNET_REGEX_Transition *t;
2254 if (NULL != label &&
2255 ((cur->incoming_transition_count > 1 || GNUNET_YES == cur->accepting ||
2256 GNUNET_YES == cur->marked) || (start != dfa->start && max_len > 0 &&
2257 max_len == strlen (label)) ||
2258 (start == dfa->start && GNUNET_REGEX_INITIAL_BYTES == strlen (label))))
2260 t = GNUNET_malloc (sizeof (struct GNUNET_REGEX_Transition));
2261 t->label = GNUNET_strdup (label);
2263 t->from_state = start;
2264 GNUNET_CONTAINER_DLL_insert (*transitions_head, *transitions_tail, t);
2266 if (GNUNET_NO == cur->marked)
2268 dfa_compress_paths_helper (dfa, cur, cur, NULL, max_len, transitions_head,
2273 else if (cur != start)
2274 cur->contained = GNUNET_YES;
2276 if (GNUNET_YES == cur->marked && cur != start)
2279 cur->marked = GNUNET_YES;
2282 for (t = cur->transitions_head; NULL != t; t = t->next)
2285 GNUNET_asprintf (&new_label, "%s%s", label, t->label);
2287 new_label = GNUNET_strdup (t->label);
2289 if (t->to_state != cur)
2291 dfa_compress_paths_helper (dfa, start, t->to_state, new_label, max_len,
2292 transitions_head, transitions_tail);
2294 GNUNET_free (new_label);
2300 * Compress paths in the given 'dfa'. Linear paths like 0->1->2->3 will be
2301 * compressed to 0->3 by combining transitions.
2303 * @param regex_ctx context for adding new transitions.
2304 * @param dfa DFA representation, will directly modify the given DFA.
2305 * @param max_len maximal length of the compressed paths.
2308 dfa_compress_paths (struct GNUNET_REGEX_Context *regex_ctx,
2309 struct GNUNET_REGEX_Automaton *dfa, unsigned int max_len)
2311 struct GNUNET_REGEX_State *s;
2312 struct GNUNET_REGEX_State *s_next;
2313 struct GNUNET_REGEX_Transition *t;
2314 struct GNUNET_REGEX_Transition *t_next;
2315 struct GNUNET_REGEX_Transition *transitions_head = NULL;
2316 struct GNUNET_REGEX_Transition *transitions_tail = NULL;
2321 /* Count the incoming transitions on each state. */
2322 for (s = dfa->states_head; NULL != s; s = s->next)
2324 for (t = s->transitions_head; NULL != t; t = t->next)
2326 if (NULL != t->to_state)
2327 t->to_state->incoming_transition_count++;
2331 /* Unmark all states. */
2332 for (s = dfa->states_head; NULL != s; s = s->next)
2334 s->marked = GNUNET_NO;
2335 s->contained = GNUNET_NO;
2338 /* Add strides and mark states that can be deleted. */
2339 dfa_compress_paths_helper (dfa, dfa->start, dfa->start, NULL, max_len,
2340 &transitions_head, &transitions_tail);
2342 /* Add all the new transitions to the automaton. */
2343 for (t = transitions_head; NULL != t; t = t_next)
2346 state_add_transition (regex_ctx, t->from_state, t->label, t->to_state);
2347 GNUNET_CONTAINER_DLL_remove (transitions_head, transitions_tail, t);
2348 GNUNET_free_non_null (t->label);
2352 /* Remove marked states (including their incoming and outgoing transitions). */
2353 for (s = dfa->states_head; NULL != s; s = s_next)
2356 if (GNUNET_YES == s->contained)
2357 automaton_remove_state (dfa, s);
2363 * Creates a new NFA fragment. Needs to be cleared using
2364 * automaton_fragment_clear.
2366 * @param start starting state
2367 * @param end end state
2369 * @return new NFA fragment
2371 static struct GNUNET_REGEX_Automaton *
2372 nfa_fragment_create (struct GNUNET_REGEX_State *start,
2373 struct GNUNET_REGEX_State *end)
2375 struct GNUNET_REGEX_Automaton *n;
2377 n = GNUNET_malloc (sizeof (struct GNUNET_REGEX_Automaton));
2384 if (NULL == start || NULL == end)
2387 automaton_add_state (n, end);
2388 automaton_add_state (n, start);
2400 * Adds a list of states to the given automaton 'n'.
2402 * @param n automaton to which the states should be added
2403 * @param states_head head of the DLL of states
2404 * @param states_tail tail of the DLL of states
2407 nfa_add_states (struct GNUNET_REGEX_Automaton *n,
2408 struct GNUNET_REGEX_State *states_head,
2409 struct GNUNET_REGEX_State *states_tail)
2411 struct GNUNET_REGEX_State *s;
2413 if (NULL == n || NULL == states_head)
2415 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not add states\n");
2419 if (NULL == n->states_head)
2421 n->states_head = states_head;
2422 n->states_tail = states_tail;
2426 if (NULL != states_head)
2428 n->states_tail->next = states_head;
2429 n->states_tail = states_tail;
2432 for (s = states_head; NULL != s; s = s->next)
2438 * Creates a new NFA state. Needs to be freed using automaton_destroy_state.
2440 * @param ctx context
2441 * @param accepting is it an accepting state or not
2443 * @return new NFA state
2445 static struct GNUNET_REGEX_State *
2446 nfa_state_create (struct GNUNET_REGEX_Context *ctx, int accepting)
2448 struct GNUNET_REGEX_State *s;
2450 s = GNUNET_malloc (sizeof (struct GNUNET_REGEX_State));
2451 s->id = ctx->state_id++;
2452 s->accepting = accepting;
2453 s->marked = GNUNET_NO;
2459 GNUNET_asprintf (&s->name, "s%i", s->id);
2466 * Calculates the closure set for the given set of states.
2468 * @param ret set to sorted nfa closure on 'label' (epsilon closure if 'label' is NULL)
2469 * @param nfa the NFA containing 's'
2470 * @param states list of states on which to base the closure on
2471 * @param label transitioning label for which to base the closure on,
2472 * pass NULL for epsilon transition
2475 nfa_closure_set_create (struct GNUNET_REGEX_StateSet *ret,
2476 struct GNUNET_REGEX_Automaton *nfa,
2477 struct GNUNET_REGEX_StateSet *states, const char *label)
2479 struct GNUNET_REGEX_State *s;
2481 struct GNUNET_REGEX_StateSet_MDLL cls_stack;
2482 struct GNUNET_REGEX_State *clsstate;
2483 struct GNUNET_REGEX_State *currentstate;
2484 struct GNUNET_REGEX_Transition *ctran;
2486 memset (ret, 0, sizeof (struct GNUNET_REGEX_StateSet));
2490 for (i = 0; i < states->off; i++)
2492 s = states->states[i];
2494 /* Add start state to closure only for epsilon closure */
2496 state_set_append (ret, s);
2498 /* initialize work stack */
2499 cls_stack.head = NULL;
2500 cls_stack.tail = NULL;
2501 GNUNET_CONTAINER_MDLL_insert (ST, cls_stack.head, cls_stack.tail, s);
2504 while (NULL != (currentstate = cls_stack.tail))
2506 GNUNET_CONTAINER_MDLL_remove (ST, cls_stack.head, cls_stack.tail,
2509 for (ctran = currentstate->transitions_head; NULL != ctran;
2510 ctran = ctran->next)
2512 if (NULL == (clsstate = ctran->to_state))
2514 if (0 != clsstate->contained)
2516 if (0 != nullstrcmp (label, ctran->label))
2518 state_set_append (ret, clsstate);
2519 GNUNET_CONTAINER_MDLL_insert_tail (ST, cls_stack.head, cls_stack.tail,
2522 clsstate->contained = 1;
2526 for (i = 0; i < ret->off; i++)
2527 ret->states[i]->contained = 0;
2530 qsort (ret->states, ret->off, sizeof (struct GNUNET_REGEX_State *),
2536 * Pops two NFA fragments (a, b) from the stack and concatenates them (ab)
2538 * @param ctx context
2541 nfa_add_concatenation (struct GNUNET_REGEX_Context *ctx)
2543 struct GNUNET_REGEX_Automaton *a;
2544 struct GNUNET_REGEX_Automaton *b;
2545 struct GNUNET_REGEX_Automaton *new_nfa;
2547 b = ctx->stack_tail;
2548 GNUNET_assert (NULL != b);
2549 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b);
2550 a = ctx->stack_tail;
2551 GNUNET_assert (NULL != a);
2552 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
2554 state_add_transition (ctx, a->end, NULL, b->start);
2555 a->end->accepting = 0;
2556 b->end->accepting = 1;
2558 new_nfa = nfa_fragment_create (NULL, NULL);
2559 nfa_add_states (new_nfa, a->states_head, a->states_tail);
2560 nfa_add_states (new_nfa, b->states_head, b->states_tail);
2561 new_nfa->start = a->start;
2562 new_nfa->end = b->end;
2563 new_nfa->state_count += a->state_count + b->state_count;
2564 automaton_fragment_clear (a);
2565 automaton_fragment_clear (b);
2567 GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
2572 * Pops a NFA fragment from the stack (a) and adds a new fragment (a*)
2574 * @param ctx context
2577 nfa_add_star_op (struct GNUNET_REGEX_Context *ctx)
2579 struct GNUNET_REGEX_Automaton *a;
2580 struct GNUNET_REGEX_Automaton *new_nfa;
2581 struct GNUNET_REGEX_State *start;
2582 struct GNUNET_REGEX_State *end;
2584 a = ctx->stack_tail;
2588 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
2589 "nfa_add_star_op failed, because there was no element on the stack");
2593 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
2595 start = nfa_state_create (ctx, 0);
2596 end = nfa_state_create (ctx, 1);
2598 state_add_transition (ctx, start, NULL, a->start);
2599 state_add_transition (ctx, start, NULL, end);
2600 state_add_transition (ctx, a->end, NULL, a->start);
2601 state_add_transition (ctx, a->end, NULL, end);
2603 a->end->accepting = 0;
2606 new_nfa = nfa_fragment_create (start, end);
2607 nfa_add_states (new_nfa, a->states_head, a->states_tail);
2608 automaton_fragment_clear (a);
2610 GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
2615 * Pops an NFA fragment (a) from the stack and adds a new fragment (a+)
2617 * @param ctx context
2620 nfa_add_plus_op (struct GNUNET_REGEX_Context *ctx)
2622 struct GNUNET_REGEX_Automaton *a;
2624 a = ctx->stack_tail;
2628 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
2629 "nfa_add_plus_op failed, because there was no element on the stack");
2633 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
2635 state_add_transition (ctx, a->end, NULL, a->start);
2637 GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, a);
2642 * Pops an NFA fragment (a) from the stack and adds a new fragment (a?)
2644 * @param ctx context
2647 nfa_add_question_op (struct GNUNET_REGEX_Context *ctx)
2649 struct GNUNET_REGEX_Automaton *a;
2650 struct GNUNET_REGEX_Automaton *new_nfa;
2651 struct GNUNET_REGEX_State *start;
2652 struct GNUNET_REGEX_State *end;
2654 a = ctx->stack_tail;
2658 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
2659 "nfa_add_question_op failed, because there was no element on the stack");
2663 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
2665 start = nfa_state_create (ctx, 0);
2666 end = nfa_state_create (ctx, 1);
2668 state_add_transition (ctx, start, NULL, a->start);
2669 state_add_transition (ctx, start, NULL, end);
2670 state_add_transition (ctx, a->end, NULL, end);
2672 a->end->accepting = 0;
2674 new_nfa = nfa_fragment_create (start, end);
2675 nfa_add_states (new_nfa, a->states_head, a->states_tail);
2676 GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
2677 automaton_fragment_clear (a);
2682 * Pops two NFA fragments (a, b) from the stack and adds a new NFA fragment that
2683 * alternates between a and b (a|b)
2685 * @param ctx context
2688 nfa_add_alternation (struct GNUNET_REGEX_Context *ctx)
2690 struct GNUNET_REGEX_Automaton *a;
2691 struct GNUNET_REGEX_Automaton *b;
2692 struct GNUNET_REGEX_Automaton *new_nfa;
2693 struct GNUNET_REGEX_State *start;
2694 struct GNUNET_REGEX_State *end;
2696 b = ctx->stack_tail;
2697 GNUNET_assert (NULL != b);
2698 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b);
2699 a = ctx->stack_tail;
2700 GNUNET_assert (NULL != a);
2701 GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
2703 start = nfa_state_create (ctx, 0);
2704 end = nfa_state_create (ctx, 1);
2705 state_add_transition (ctx, start, NULL, a->start);
2706 state_add_transition (ctx, start, NULL, b->start);
2708 state_add_transition (ctx, a->end, NULL, end);
2709 state_add_transition (ctx, b->end, NULL, end);
2711 a->end->accepting = 0;
2712 b->end->accepting = 0;
2715 new_nfa = nfa_fragment_create (start, end);
2716 nfa_add_states (new_nfa, a->states_head, a->states_tail);
2717 nfa_add_states (new_nfa, b->states_head, b->states_tail);
2718 automaton_fragment_clear (a);
2719 automaton_fragment_clear (b);
2721 GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
2726 * Adds a new nfa fragment to the stack
2728 * @param ctx context
2729 * @param label label for nfa transition
2732 nfa_add_label (struct GNUNET_REGEX_Context *ctx, const char *label)
2734 struct GNUNET_REGEX_Automaton *n;
2735 struct GNUNET_REGEX_State *start;
2736 struct GNUNET_REGEX_State *end;
2738 GNUNET_assert (NULL != ctx);
2740 start = nfa_state_create (ctx, 0);
2741 end = nfa_state_create (ctx, 1);
2742 state_add_transition (ctx, start, label, end);
2743 n = nfa_fragment_create (start, end);
2744 GNUNET_assert (NULL != n);
2745 GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, n);
2750 * Initialize a new context
2752 * @param ctx context
2755 GNUNET_REGEX_context_init (struct GNUNET_REGEX_Context *ctx)
2759 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Context was NULL!");
2763 ctx->transition_id = 0;
2764 ctx->stack_head = NULL;
2765 ctx->stack_tail = NULL;
2770 * Construct an NFA by parsing the regex string of length 'len'.
2772 * @param regex regular expression string
2773 * @param len length of the string
2775 * @return NFA, needs to be freed using GNUNET_REGEX_destroy_automaton
2777 struct GNUNET_REGEX_Automaton *
2778 GNUNET_REGEX_construct_nfa (const char *regex, const size_t len)
2780 struct GNUNET_REGEX_Context ctx;
2781 struct GNUNET_REGEX_Automaton *nfa;
2786 unsigned int altcount;
2787 unsigned int atomcount;
2796 if (NULL == regex || 0 == strlen (regex) || 0 == len)
2798 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
2799 "Could not parse regex. Empty regex string provided.\n");
2803 GNUNET_REGEX_context_init (&ctx);
2814 for (count = 0; count < len && *regexp; count++, regexp++)
2822 nfa_add_concatenation (&ctx);
2825 GNUNET_array_grow (p, psize, psize * 2 + 4);
2826 p[poff].altcount = altcount;
2827 p[poff].atomcount = atomcount;
2835 error_msg = "Cannot append '|' to nothing";
2838 while (--atomcount > 0)
2839 nfa_add_concatenation (&ctx);
2845 error_msg = "Missing opening '('";
2850 /* Ignore this: "()" */
2852 altcount = p[poff].altcount;
2853 atomcount = p[poff].atomcount;
2856 while (--atomcount > 0)
2857 nfa_add_concatenation (&ctx);
2858 for (; altcount > 0; altcount--)
2859 nfa_add_alternation (&ctx);
2861 altcount = p[poff].altcount;
2862 atomcount = p[poff].atomcount;
2868 error_msg = "Cannot append '*' to nothing";
2871 nfa_add_star_op (&ctx);
2876 error_msg = "Cannot append '+' to nothing";
2879 nfa_add_plus_op (&ctx);
2884 error_msg = "Cannot append '?' to nothing";
2887 nfa_add_question_op (&ctx);
2893 nfa_add_concatenation (&ctx);
2895 curlabel[0] = *regexp;
2896 nfa_add_label (&ctx, curlabel);
2903 error_msg = "Unbalanced parenthesis";
2906 while (--atomcount > 0)
2907 nfa_add_concatenation (&ctx);
2908 for (; altcount > 0; altcount--)
2909 nfa_add_alternation (&ctx);
2911 GNUNET_array_grow (p, psize, 0);
2913 nfa = ctx.stack_tail;
2914 GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa);
2916 if (NULL != ctx.stack_head)
2918 error_msg = "Creating the NFA failed. NFA stack was not empty!";
2922 /* Remember the regex that was used to generate this NFA */
2923 nfa->regex = GNUNET_strdup (regex);
2925 /* create depth-first numbering of the states for pretty printing */
2926 GNUNET_REGEX_automaton_traverse (nfa, NULL, NULL, NULL, &number_states, NULL);
2928 /* No multistriding added so far */
2929 nfa->is_multistrided = GNUNET_NO;
2934 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not parse regex: `%s'\n", regex);
2935 if (NULL != error_msg)
2936 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "%s\n", error_msg);
2938 GNUNET_free_non_null (p);
2940 while (NULL != (nfa = ctx.stack_head))
2942 GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa);
2943 GNUNET_REGEX_automaton_destroy (nfa);
2951 * Create DFA states based on given 'nfa' and starting with 'dfa_state'.
2953 * @param ctx context.
2954 * @param nfa NFA automaton.
2955 * @param dfa DFA automaton.
2956 * @param dfa_state current dfa state, pass epsilon closure of first nfa state
2960 construct_dfa_states (struct GNUNET_REGEX_Context *ctx,
2961 struct GNUNET_REGEX_Automaton *nfa,
2962 struct GNUNET_REGEX_Automaton *dfa,
2963 struct GNUNET_REGEX_State *dfa_state)
2965 struct GNUNET_REGEX_Transition *ctran;
2966 struct GNUNET_REGEX_State *new_dfa_state;
2967 struct GNUNET_REGEX_State *state_contains;
2968 struct GNUNET_REGEX_State *state_iter;
2969 struct GNUNET_REGEX_StateSet tmp;
2970 struct GNUNET_REGEX_StateSet nfa_set;
2972 for (ctran = dfa_state->transitions_head; NULL != ctran; ctran = ctran->next)
2974 if (NULL == ctran->label || NULL != ctran->to_state)
2977 nfa_closure_set_create (&tmp, nfa, &dfa_state->nfa_set, ctran->label);
2978 nfa_closure_set_create (&nfa_set, nfa, &tmp, NULL);
2979 state_set_clear (&tmp);
2981 state_contains = NULL;
2982 for (state_iter = dfa->states_head; NULL != state_iter;
2983 state_iter = state_iter->next)
2985 if (0 == state_set_compare (&state_iter->nfa_set, &nfa_set))
2987 state_contains = state_iter;
2991 if (NULL == state_contains)
2993 new_dfa_state = dfa_state_create (ctx, &nfa_set);
2994 automaton_add_state (dfa, new_dfa_state);
2995 ctran->to_state = new_dfa_state;
2996 construct_dfa_states (ctx, nfa, dfa, new_dfa_state);
3000 ctran->to_state = state_contains;
3001 state_set_clear (&nfa_set);
3008 * Construct DFA for the given 'regex' of length 'len'.
3010 * Path compression means, that for example a DFA o -> a -> b -> c -> o will be
3011 * compressed to o -> abc -> o. Note that this parameter influences the
3012 * non-determinism of states of the resulting NFA in the DHT (number of outgoing
3013 * edges with the same label). For example for an application that stores IPv4
3014 * addresses as bitstrings it could make sense to limit the path compression to
3017 * @param regex regular expression string.
3018 * @param len length of the regular expression.
3019 * @param max_path_len limit the path compression length to the
3020 * given value. If set to 1, no path compression is applied. Set to 0 for
3021 * maximal possible path compression (generally not desireable).
3022 * @return DFA, needs to be freed using GNUNET_REGEX_automaton_destroy.
3024 struct GNUNET_REGEX_Automaton *
3025 GNUNET_REGEX_construct_dfa (const char *regex, const size_t len,
3026 unsigned int max_path_len)
3028 struct GNUNET_REGEX_Context ctx;
3029 struct GNUNET_REGEX_Automaton *dfa;
3030 struct GNUNET_REGEX_Automaton *nfa;
3031 struct GNUNET_REGEX_StateSet nfa_start_eps_cls;
3032 struct GNUNET_REGEX_StateSet singleton_set;
3034 GNUNET_REGEX_context_init (&ctx);
3037 // fprintf (stderr, "N");
3038 nfa = GNUNET_REGEX_construct_nfa (regex, len);
3042 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
3043 "Could not create DFA, because NFA creation failed\n");
3047 dfa = GNUNET_malloc (sizeof (struct GNUNET_REGEX_Automaton));
3049 dfa->regex = GNUNET_strdup (regex);
3051 /* Create DFA start state from epsilon closure */
3052 memset (&singleton_set, 0, sizeof (struct GNUNET_REGEX_StateSet));
3053 state_set_append (&singleton_set, nfa->start);
3054 nfa_closure_set_create (&nfa_start_eps_cls, nfa, &singleton_set, NULL);
3055 state_set_clear (&singleton_set);
3056 dfa->start = dfa_state_create (&ctx, &nfa_start_eps_cls);
3057 automaton_add_state (dfa, dfa->start);
3059 // fprintf (stderr, "D");
3060 construct_dfa_states (&ctx, nfa, dfa, dfa->start);
3061 GNUNET_REGEX_automaton_destroy (nfa);
3064 // fprintf (stderr, "M");
3065 if (GNUNET_OK != dfa_minimize (&ctx, dfa))
3067 GNUNET_REGEX_automaton_destroy (dfa);
3071 /* Create proofs and hashes for all states */
3072 if (GNUNET_OK != automaton_create_proofs (dfa))
3074 GNUNET_REGEX_automaton_destroy (dfa);
3078 /* Compress linear DFA paths */
3079 if (1 != max_path_len)
3080 dfa_compress_paths (&ctx, dfa, max_path_len);
3087 * Free the memory allocated by constructing the GNUNET_REGEX_Automaton data
3090 * @param a automaton to be destroyed
3093 GNUNET_REGEX_automaton_destroy (struct GNUNET_REGEX_Automaton *a)
3095 struct GNUNET_REGEX_State *s;
3096 struct GNUNET_REGEX_State *next_state;
3101 GNUNET_free_non_null (a->regex);
3102 GNUNET_free_non_null (a->canonical_regex);
3104 for (s = a->states_head; NULL != s; s = next_state)
3106 next_state = s->next;
3107 GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s);
3108 automaton_destroy_state (s);
3116 * Evaluates the given string using the given DFA automaton
3118 * @param a automaton, type must be DFA
3119 * @param string string that should be evaluated
3121 * @return 0 if string matches, non 0 otherwise
3124 evaluate_dfa (struct GNUNET_REGEX_Automaton *a, const char *string)
3127 struct GNUNET_REGEX_State *s;
3128 unsigned int step_len;
3132 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
3133 "Tried to evaluate DFA, but NFA automaton given");
3139 /* If the string is empty but the starting state is accepting, we accept. */
3140 if ((NULL == string || 0 == strlen (string)) && s->accepting)
3143 for (strp = string; NULL != strp && *strp; strp += step_len)
3145 step_len = dfa_move (&s, strp);
3151 if (NULL != s && s->accepting)
3159 * Evaluates the given string using the given NFA automaton
3161 * @param a automaton, type must be NFA
3162 * @param string string that should be evaluated
3164 * @return 0 if string matches, non 0 otherwise
3167 evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string)
3171 struct GNUNET_REGEX_State *s;
3172 struct GNUNET_REGEX_StateSet sset;
3173 struct GNUNET_REGEX_StateSet new_sset;
3174 struct GNUNET_REGEX_StateSet singleton_set;
3180 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
3181 "Tried to evaluate NFA, but DFA automaton given");
3185 /* If the string is empty but the starting state is accepting, we accept. */
3186 if ((NULL == string || 0 == strlen (string)) && a->start->accepting)
3190 memset (&singleton_set, 0, sizeof (struct GNUNET_REGEX_StateSet));
3191 state_set_append (&singleton_set, a->start);
3192 nfa_closure_set_create (&sset, a, &singleton_set, NULL);
3193 state_set_clear (&singleton_set);
3196 for (strp = string; NULL != strp && *strp; strp++)
3199 nfa_closure_set_create (&new_sset, a, &sset, str);
3200 state_set_clear (&sset);
3201 nfa_closure_set_create (&sset, a, &new_sset, 0);
3202 state_set_clear (&new_sset);
3205 for (i = 0; i < sset.off; i++)
3208 if ( (NULL != s) && (s->accepting) )
3215 state_set_clear (&sset);
3221 * Evaluates the given 'string' against the given compiled regex
3223 * @param a automaton
3224 * @param string string to check
3226 * @return 0 if string matches, non 0 otherwise
3229 GNUNET_REGEX_eval (struct GNUNET_REGEX_Automaton *a, const char *string)
3236 result = evaluate_dfa (a, string);
3239 result = evaluate_nfa (a, string);
3242 GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
3243 "Evaluating regex failed, automaton has no type!\n");
3244 result = GNUNET_SYSERR;
3253 * Get the canonical regex of the given automaton.
3254 * When constructing the automaton a proof is computed for each state,
3255 * consisting of the regular expression leading to this state. A complete
3256 * regex for the automaton can be computed by combining these proofs.
3257 * As of now this function is only useful for testing.
3259 * @param a automaton for which the canonical regex should be returned.
3264 GNUNET_REGEX_get_canonical_regex (struct GNUNET_REGEX_Automaton *a)
3269 return a->canonical_regex;
3274 * Get the number of transitions that are contained in the given automaton.
3276 * @param a automaton for which the number of transitions should be returned.
3278 * @return number of transitions in the given automaton.
3281 GNUNET_REGEX_get_transition_count (struct GNUNET_REGEX_Automaton *a)
3283 unsigned int t_count;
3284 struct GNUNET_REGEX_State *s;
3290 for (s = a->states_head; NULL != s; s = s->next)
3291 t_count += s->transition_count;
3298 * Get the first key for the given 'input_string'. This hashes the first x bits
3299 * of the 'input_string'.
3301 * @param input_string string.
3302 * @param string_len length of the 'input_string'.
3303 * @param key pointer to where to write the hash code.
3305 * @return number of bits of 'input_string' that have been consumed
3306 * to construct the key
3309 GNUNET_REGEX_get_first_key (const char *input_string, size_t string_len,
3310 struct GNUNET_HashCode * key)
3316 GNUNET_REGEX_INITIAL_BYTES ? string_len : GNUNET_REGEX_INITIAL_BYTES;
3318 if (NULL == input_string)
3320 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Given input string was NULL!\n");
3324 GNUNET_CRYPTO_hash (input_string, size, key);
3331 * Check if the given 'proof' matches the given 'key'.
3333 * @param proof partial regex of a state.
3334 * @param key hash of a state.
3336 * @return GNUNET_OK if the proof is valid for the given key.
3339 GNUNET_REGEX_check_proof (const char *proof, const struct GNUNET_HashCode *key)
3341 struct GNUNET_HashCode key_check;
3343 if (NULL == proof || NULL == key)
3345 GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Proof check failed, was NULL.\n");
3349 GNUNET_CRYPTO_hash (proof, strlen (proof), &key_check);
3351 GNUNET_CRYPTO_hash_cmp (key, &key_check)) ? GNUNET_OK : GNUNET_NO;
3356 * Recursive function that calls the iterator for each synthetic start state.
3358 * @param min_len minimum length of the path in the graph.
3359 * @param max_len maximum length of the path in the graph.
3360 * @param consumed_string string consumed by traversing the graph till this state.
3361 * @param state current state of the automaton.
3362 * @param iterator iterator function called for each edge.
3363 * @param iterator_cls closure for the iterator function.
3366 iterate_initial_edge (const unsigned int min_len, const unsigned int max_len,
3367 char *consumed_string, struct GNUNET_REGEX_State *state,
3368 GNUNET_REGEX_KeyIterator iterator, void *iterator_cls)
3372 struct GNUNET_REGEX_Transition *t;
3373 unsigned int num_edges = state->transition_count;
3374 struct GNUNET_REGEX_Edge edges[num_edges];
3375 struct GNUNET_REGEX_Edge edge[1];
3376 struct GNUNET_HashCode hash;
3377 struct GNUNET_HashCode hash_new;
3379 unsigned int cur_len;
3381 if (NULL != consumed_string)
3382 cur_len = strlen (consumed_string);
3386 if ((cur_len >= min_len || GNUNET_YES == state->accepting) && cur_len > 0 &&
3387 NULL != consumed_string)
3389 if (cur_len <= max_len)
3391 if (state->proof != NULL && 0 != strcmp (consumed_string, state->proof))
3393 for (i = 0, t = state->transitions_head; NULL != t && i < num_edges;
3396 edges[i].label = t->label;
3397 edges[i].destination = t->to_state->hash;
3399 GNUNET_CRYPTO_hash (consumed_string, strlen (consumed_string), &hash);
3400 iterator (iterator_cls, &hash, consumed_string, state->accepting,
3404 if (GNUNET_YES == state->accepting && cur_len > 1 &&
3405 state->transition_count < 1 && cur_len < max_len)
3407 /* Special case for regex consisting of just a string that is shorter than
3409 edge[0].label = &consumed_string[cur_len - 1];
3410 edge[0].destination = state->hash;
3411 temp = GNUNET_strdup (consumed_string);
3412 temp[cur_len - 1] = '\0';
3413 GNUNET_CRYPTO_hash (temp, cur_len - 1, &hash_new);
3414 iterator (iterator_cls, &hash_new, temp, GNUNET_NO, 1, edge);
3418 else if (max_len < cur_len)
3420 /* Case where the concatenated labels are longer than max_len, then split. */
3421 edge[0].label = &consumed_string[max_len];
3422 edge[0].destination = state->hash;
3423 temp = GNUNET_strdup (consumed_string);
3424 temp[max_len] = '\0';
3425 GNUNET_CRYPTO_hash (temp, max_len, &hash);
3426 iterator (iterator_cls, &hash, temp, GNUNET_NO, 1, edge);
3431 if (cur_len < max_len)
3433 for (t = state->transitions_head; NULL != t; t = t->next)
3435 if (NULL != consumed_string)
3436 GNUNET_asprintf (&temp, "%s%s", consumed_string, t->label);
3438 GNUNET_asprintf (&temp, "%s", t->label);
3440 iterate_initial_edge (min_len, max_len, temp, t->to_state, iterator,
3449 * Iterate over all edges starting from start state of automaton 'a'. Calling
3450 * iterator for each edge.
3452 * @param a automaton.
3453 * @param iterator iterator called for each edge.
3454 * @param iterator_cls closure.
3457 GNUNET_REGEX_iterate_all_edges (struct GNUNET_REGEX_Automaton *a,
3458 GNUNET_REGEX_KeyIterator iterator,
3461 struct GNUNET_REGEX_State *s;
3463 for (s = a->states_head; NULL != s; s = s->next)
3465 struct GNUNET_REGEX_Edge edges[s->transition_count];
3466 unsigned int num_edges;
3468 num_edges = state_get_edges (s, edges);
3470 if ((NULL != s->proof && 0 < strlen (s->proof)) || s->accepting)
3471 iterator (iterator_cls, &s->hash, s->proof, s->accepting, num_edges,
3474 s->marked = GNUNET_NO;
3477 iterate_initial_edge (GNUNET_REGEX_INITIAL_BYTES, GNUNET_REGEX_INITIAL_BYTES,
3478 NULL, a->start, iterator, iterator_cls);
3482 * Create a string with binary IP notation for the given 'addr' in 'str'.
3484 * @param af address family of the given 'addr'.
3485 * @param addr address that should be converted to a string.
3486 * struct in_addr * for IPv4 and struct in6_addr * for IPv6.
3487 * @param str string that will contain binary notation of 'addr'. Expected
3488 * to be at least 33 bytes long for IPv4 and 129 bytes long for IPv6.
3491 iptobinstr (const int af, const void *addr, char *str)
3499 uint32_t b = htonl (((struct in_addr *) addr)->s_addr);
3503 for (i = 31; i >= 0; i--)
3505 *str = (b & 1) + '0';
3513 struct in6_addr b = *(const struct in6_addr *) addr;
3517 for (i = 127; i >= 0; i--)
3519 *str = (b.s6_addr[i / 8] & 1) + '0';
3521 b.s6_addr[i / 8] >>= 1;
3530 * Get the ipv4 network prefix from the given 'netmask'.
3532 * @param netmask netmask for which to get the prefix len.
3534 * @return length of ipv4 prefix for 'netmask'.
3537 ipv4netmasktoprefixlen (const char *netmask)
3543 if (1 != inet_pton (AF_INET, netmask, &a))
3546 for (t = htonl (~a.s_addr); 0 != t; t >>= 1)
3553 * Create a regex in 'rxstr' from the given 'ip' and 'netmask'.
3555 * @param ip IPv4 representation.
3556 * @param netmask netmask for the ip.
3557 * @param rxstr generated regex, must be at least GNUNET_REGEX_IPV4_REGEXLEN
3561 GNUNET_REGEX_ipv4toregex (const struct in_addr *ip, const char *netmask,
3564 unsigned int pfxlen;
3566 pfxlen = ipv4netmasktoprefixlen (netmask);
3567 iptobinstr (AF_INET, ip, rxstr);
3568 rxstr[pfxlen] = '\0';
3570 strcat (rxstr, "(0|1)+");
3575 * Create a regex in 'rxstr' from the given 'ipv6' and 'prefixlen'.
3577 * @param ipv6 IPv6 representation.
3578 * @param prefixlen length of the ipv6 prefix.
3579 * @param rxstr generated regex, must be at least GNUNET_REGEX_IPV6_REGEXLEN
3583 GNUNET_REGEX_ipv6toregex (const struct in6_addr *ipv6, unsigned int prefixlen,
3586 iptobinstr (AF_INET6, ipv6, rxstr);
3587 rxstr[prefixlen] = '\0';
3588 if (prefixlen < 128)
3589 strcat (rxstr, "(0|1)+");
3593 /* end of regex.c */