From ac43047941ad44cb3cd408b557a9abdb73d34f06 Mon Sep 17 00:00:00 2001 From: Maximilian Szengel Date: Mon, 2 Apr 2012 15:43:23 +0000 Subject: [PATCH] - dfa minimization wip - static functions - optimizations wip --- src/regex/regex.c | 423 +++++++++++++++++++++++++++++----------------- 1 file changed, 270 insertions(+), 153 deletions(-) diff --git a/src/regex/regex.c b/src/regex/regex.c index 4ad063e3c..a524ace29 100644 --- a/src/regex/regex.c +++ b/src/regex/regex.c @@ -60,6 +60,7 @@ struct GNUNET_REGEX_Automaton struct State *start; struct State *end; + unsigned int state_count; struct State *states_head; struct State *states_tail; @@ -79,6 +80,7 @@ struct State int marked; char *name; + unsigned int transition_count; struct Transition *transitions_head; struct Transition *transitions_tail; @@ -116,7 +118,7 @@ struct StateSet * * @param ctx context */ -void +static void GNUNET_REGEX_context_init (struct GNUNET_REGEX_Context *ctx) { if (NULL == ctx) @@ -130,7 +132,7 @@ GNUNET_REGEX_context_init (struct GNUNET_REGEX_Context *ctx) ctx->stack_tail = NULL; } -void +static void debug_print_state (struct State *s) { GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, @@ -138,7 +140,7 @@ debug_print_state (struct State *s) s->marked, s->accepting); } -void +static void debug_print_states (struct StateSet *sset) { struct State *s; @@ -151,7 +153,7 @@ debug_print_states (struct StateSet *sset) } } -void +static void debug_print_transitions (struct State *s) { struct Transition *t; @@ -175,51 +177,43 @@ debug_print_transitions (struct State *s) } } +static int +state_compare (const void *a, const void *b) +{ + struct State **s1; + struct State **s2; + + s1 = (struct State **) a; + s2 = (struct State **) b; + + return (*s1)->id - (*s2)->id; +} + /** * Compare to state sets by comparing the id's of the states that are - * contained in each set. + * contained in each set. Both sets are expected to be sorted by id! * * @param sset1 first state set * @param sset2 second state set * * @return 0 if they are equal, non 0 otherwise */ -int +static int state_set_compare (struct StateSet *sset1, struct StateSet *sset2) { - struct State *s1; - struct State *s2; - int i1; - int i2; - int contains; - int rslt; - - if (sset1->len < 1 || sset2->len < 1) - return -1; + int i; - rslt = 0; + if (sset1->len != sset2->len) + return 1; - for (i1 = 0; i1 < sset1->len; i1++) + for (i = 0; i < sset1->len; i++) { - s1 = sset1->states[i1]; - contains = 0; - for (i2 = 0; i2 < sset2->len; i2++) + if (sset1->states[i]->id != sset2->states[i]->id) { - s2 = sset2->states[i2]; - if (s1->id == s2->id) - { - contains = 1; - break; - } - } - - if (0 == contains) - { - rslt = 1; - break; + return 1; } } - return rslt; + return 0; } /** @@ -230,7 +224,7 @@ state_set_compare (struct StateSet *sset1, struct StateSet *sset2) * * @return GNUNET_YES if 'set' contains 'elem, GNUNET_NO otherwise */ -int +static int state_set_contains (struct StateSet *set, struct State *elem) { struct State *s; @@ -250,7 +244,7 @@ state_set_contains (struct StateSet *set, struct State *elem) * * @param set set to be cleared */ -void +static void state_set_clear (struct StateSet *set) { if (NULL != set) @@ -269,7 +263,7 @@ state_set_clear (struct StateSet *set) * @param literal transition label * @param to_state state to where the transition should point to */ -void +static void add_transition (struct GNUNET_REGEX_Context *ctx, struct State *from_state, const char literal, struct State *to_state) { @@ -297,7 +291,7 @@ add_transition (struct GNUNET_REGEX_Context *ctx, struct State *from_state, * * @param a automaton to be cleared */ -void +static void automaton_fragment_clear (struct GNUNET_REGEX_Automaton *a) { a->start = NULL; @@ -312,7 +306,7 @@ automaton_fragment_clear (struct GNUNET_REGEX_Automaton *a) * * @param s state that should be destroyed */ -void +static void automaton_destroy_state (struct State *s) { struct Transition *t; @@ -343,7 +337,7 @@ automaton_destroy_state (struct State *s) * * @return new DFA state */ -struct State * +static struct State * dfa_state_create (struct GNUNET_REGEX_Context *ctx, struct StateSet *nfa_states) { struct State *s; @@ -362,7 +356,10 @@ dfa_state_create (struct GNUNET_REGEX_Context *ctx, struct StateSet *nfa_states) s->name = NULL; if (NULL == nfa_states) + { + GNUNET_asprintf (&s->name, "s%i", s->id); return s; + } s->nfa_set = nfa_states; @@ -419,7 +416,7 @@ dfa_state_create (struct GNUNET_REGEX_Context *ctx, struct StateSet *nfa_states) return s; } -struct State * +static struct State * dfa_move (struct State *s, const char literal) { struct Transition *t; @@ -442,6 +439,108 @@ dfa_move (struct State *s, const char literal) return new_s; } +static void +dfa_remove_unreachable_states (struct GNUNET_REGEX_Automaton *a) +{ + /* + * struct StateSet *unreachable; + * struct State *stack[a->size]; + * struct State *s; + * + * unreachable = GNUNET_malloc (sizeof (struct StateSet)); + * + * // 1. add all states to unreachable set + * for (s = a->states_head; NULL != s; s = s->next) + * { + * GNUNET_array_append (unreachable->states, unreachable->len; s); + * } + * + * // 2. traverse dfa from start state and remove every visited state from unreachable set + * s = a->start; + * // push a->start + * while (stack->len > 0) + * { + * s = stack->states[stack->len - 1]; + * GNUNET_array_grow (stack->states; stack->len; stack->len-1); + * GNUNET_array_ + * for (t = s->transitions_head; NULL != t; t = t->next) + * { + * + * } + * } + * // 3. delete all states that are still in the unreachable set + */ +} + +static void +dfa_remove_dead_states (struct GNUNET_REGEX_Automaton *a) +{ + struct State *s; + struct State *s_check; + struct Transition *t; + struct Transition *t_check; + int dead; + + GNUNET_assert (DFA == a->type); + + for (s = a->states_head; NULL != s; s = s->next) + { + if (s->accepting) + continue; + + dead = 1; + for (t = s->transitions_head; NULL != t; t = t->next) + { + if (NULL != t->state && t->state != s) + { + dead = 0; + break; + } + } + + if (0 == dead) + continue; + + // state s is dead, remove it + // 1. remove all transitions to this state + for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) + { + for (t_check = s_check->transitions_head; NULL != t_check; + t_check = t_check->next) + { + if (t_check->state == s) + { + GNUNET_CONTAINER_DLL_remove (s_check->transitions_head, + s_check->transitions_tail, t_check); + } + } + } + // 2. remove state + GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s); + } +} + +static void +dfa_merge_nondistinguishable_states (struct GNUNET_REGEX_Automaton *a) +{ + +} + +static void +dfa_minimize (struct GNUNET_REGEX_Automaton *a) +{ + GNUNET_assert (DFA == a->type); + + // 1. remove unreachable states + dfa_remove_unreachable_states (a); + + // 2. remove dead states + dfa_remove_dead_states (a); + + // 3. Merge nondistinguishable states + dfa_merge_nondistinguishable_states (a); +} + /** * Creates a new NFA fragment. Needs to be cleared using automaton_fragment_clear. * @@ -450,7 +549,7 @@ dfa_move (struct State *s, const char literal) * * @return new NFA fragment */ -struct GNUNET_REGEX_Automaton * +static struct GNUNET_REGEX_Automaton * nfa_fragment_create (struct State *start, struct State *end) { struct GNUNET_REGEX_Automaton *n; @@ -480,7 +579,7 @@ nfa_fragment_create (struct State *start, struct State *end) * @param states_head head of the DLL of states * @param states_tail tail of the DLL of states */ -void +static void nfa_add_states (struct GNUNET_REGEX_Automaton *n, struct State *states_head, struct State *states_tail) { @@ -512,7 +611,7 @@ nfa_add_states (struct GNUNET_REGEX_Automaton *n, struct State *states_head, * * @return new NFA state */ -struct State * +static struct State * nfa_state_create (struct GNUNET_REGEX_Context *ctx, int accepting) { struct State *s; @@ -527,12 +626,121 @@ nfa_state_create (struct GNUNET_REGEX_Context *ctx, int accepting) return s; } +/** + * Calculates the NFA closure set for the given state + * + * @param s starting point state + * @param literal transitioning literal on which to base the closure on, + * pass 0 for epsilon transition + * + * @return nfa closure on 'literal' (epsilon closure if 'literal' is 0) + */ +static struct StateSet * +nfa_closure_create (struct State *s, const char literal) +{ + struct StateSet *cls; + struct StateSet *cls_check; + struct State *clsstate; + struct State *currentstate; + struct Transition *ctran; + + if (NULL == s) + return NULL; + + cls = GNUNET_malloc (sizeof (struct StateSet)); + cls_check = GNUNET_malloc (sizeof (struct StateSet)); + + // Add start state to closure only for epsilon closure + if (0 == literal) + GNUNET_array_append (cls->states, cls->len, s); + + GNUNET_array_append (cls_check->states, cls_check->len, s); + while (cls_check->len > 0) + { + currentstate = cls_check->states[cls_check->len - 1]; + GNUNET_array_grow (cls_check->states, cls_check->len, cls_check->len - 1); + + for (ctran = currentstate->transitions_head; NULL != ctran; + ctran = ctran->next) + { + if (NULL != ctran->state && literal == ctran->literal) + { + clsstate = ctran->state; + + if (NULL != clsstate && + GNUNET_YES != state_set_contains (cls, clsstate)) + { + GNUNET_array_append (cls->states, cls->len, clsstate); + GNUNET_array_append (cls_check->states, cls_check->len, clsstate); + } + } + } + } + GNUNET_assert (0 == cls_check->len); + GNUNET_free (cls_check); + + if (cls->len > 1) + qsort (cls->states, cls->len, sizeof (struct State *), state_compare); + + return cls; +} + +/** + * Calculates the closure set for the given set of states. + * + * @param states list of states on which to base the closure on + * @param literal transitioning literal for which to base the closure on, + * pass 0 for epsilon transition + * + * @return nfa closure on 'literal' (epsilon closure if 'literal' is 0) + */ +static struct StateSet * +nfa_closure_set_create (struct StateSet *states, const char literal) +{ + struct State *s; + struct StateSet *sset; + struct StateSet *cls; + int i; + int j; + int k; + int contains; + + if (NULL == states) + return NULL; + + cls = GNUNET_malloc (sizeof (struct StateSet)); + + for (i = 0; i < states->len; i++) + { + s = states->states[i]; + sset = nfa_closure_create (s, literal); + + for (j = 0; j < sset->len; j++) + { + contains = 0; + for (k = 0; k < cls->len; k++) + { + if (sset->states[j]->id == cls->states[k]->id) + contains = 1; + } + if (!contains) + GNUNET_array_append (cls->states, cls->len, sset->states[j]); + } + state_set_clear (sset); + } + + if (cls->len > 1) + qsort (cls->states, cls->len, sizeof (struct State *), state_compare); + + return cls; +} + /** * Pops two NFA fragments (a, b) from the stack and concatenates them (ab) * * @param ctx context */ -void +static void nfa_add_concatenation (struct GNUNET_REGEX_Context *ctx) { struct GNUNET_REGEX_Automaton *a; @@ -564,7 +772,7 @@ nfa_add_concatenation (struct GNUNET_REGEX_Context *ctx) * * @param ctx context */ -void +static void nfa_add_star_op (struct GNUNET_REGEX_Context *ctx) { struct GNUNET_REGEX_Automaton *a; @@ -605,7 +813,7 @@ nfa_add_star_op (struct GNUNET_REGEX_Context *ctx) * * @param ctx context */ -void +static void nfa_add_plus_op (struct GNUNET_REGEX_Context *ctx) { struct GNUNET_REGEX_Automaton *a; @@ -624,7 +832,7 @@ nfa_add_plus_op (struct GNUNET_REGEX_Context *ctx) * * @param ctx context */ -void +static void nfa_add_alternation (struct GNUNET_REGEX_Context *ctx) { struct GNUNET_REGEX_Automaton *a; @@ -665,7 +873,7 @@ nfa_add_alternation (struct GNUNET_REGEX_Context *ctx) * @param ctx context * @param lit literal for nfa transition */ -void +static void nfa_add_literal (struct GNUNET_REGEX_Context *ctx, const char lit) { struct GNUNET_REGEX_Automaton *n; @@ -682,99 +890,6 @@ nfa_add_literal (struct GNUNET_REGEX_Context *ctx, const char lit) GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, n); } -/** - * Calculates the NFA closure set for the given state - * - * @param s starting point state - * @param literal transitioning literal on which to base the closure on, - * pass 0 for epsilon transition - * - * @return nfa closure on 'literal' (epsilon closure if 'literal' is 0) - */ -struct StateSet * -nfa_closure_create (struct State *s, const char literal) -{ - struct StateSet *cls; - struct StateSet *cls_check; - struct State *clsstate; - struct State *currentstate; - struct Transition *ctran; - - if (NULL == s) - return NULL; - - cls = GNUNET_malloc (sizeof (struct StateSet)); - cls_check = GNUNET_malloc (sizeof (struct StateSet)); - - // Add start state to closure only for epsilon closure - if (0 == literal) - GNUNET_array_append (cls->states, cls->len, s); - - GNUNET_array_append (cls_check->states, cls_check->len, s); - while (cls_check->len > 0) - { - currentstate = cls_check->states[cls_check->len - 1]; - GNUNET_array_grow (cls_check->states, cls_check->len, cls_check->len - 1); - - for (ctran = currentstate->transitions_head; NULL != ctran; - ctran = ctran->next) - { - if (NULL != ctran->state && literal == ctran->literal) - { - clsstate = ctran->state; - - if (NULL != clsstate && - GNUNET_YES != state_set_contains (cls, clsstate)) - { - GNUNET_array_append (cls->states, cls->len, clsstate); - GNUNET_array_append (cls_check->states, cls_check->len, clsstate); - } - } - } - } - GNUNET_assert (0 == cls_check->len); - GNUNET_free (cls_check); - - return cls; -} - -/** - * Calculates the closure set for the given set of states. - * - * @param states list of states on which to base the closure on - * @param literal transitioning literal for which to base the closure on, - * pass 0 for epsilon transition - * - * @return nfa closure on 'literal' (epsilon closure if 'literal' is 0) - */ -struct StateSet * -nfa_closure_set_create (struct StateSet *states, const char literal) -{ - struct State *s; - struct StateSet *sset; - struct StateSet *cls; - int i; - int j; - - if (NULL == states) - return NULL; - - cls = GNUNET_malloc (sizeof (struct StateSet)); - - for (i = 0; i < states->len; i++) - { - s = states->states[i]; - sset = nfa_closure_create (s, literal); - - for (j = 0; j < sset->len; j++) - GNUNET_array_append (cls->states, cls->len, sset->states[j]); - - state_set_clear (sset); - } - - return cls; -} - /** * Construct an NFA by parsing the regex string of length 'len'. * @@ -1036,6 +1151,8 @@ GNUNET_REGEX_construct_dfa (const char *regex, const size_t len) GNUNET_free (dfa_stack); GNUNET_REGEX_automaton_destroy (nfa); + dfa_minimize (dfa); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Created DFA with %i States\n", ctx.state_id); @@ -1134,7 +1251,7 @@ GNUNET_REGEX_automaton_save_graph (struct GNUNET_REGEX_Automaton *a, * * @return GNUNET_YES if string matches, GNUNET_NO if not, GNUNET_SYSERR otherwise */ -int +static int evaluate_dfa (struct GNUNET_REGEX_Automaton *a, const char *string) { const char *strp; @@ -1143,7 +1260,7 @@ evaluate_dfa (struct GNUNET_REGEX_Automaton *a, const char *string) if (DFA != a->type) { GNUNET_log (GNUNET_ERROR_TYPE_ERROR, - "Tried to evaluate NFA, but DFA automaton given"); + "Tried to evaluate DFA, but NFA automaton given"); return GNUNET_SYSERR; } @@ -1170,7 +1287,7 @@ evaluate_dfa (struct GNUNET_REGEX_Automaton *a, const char *string) * * @return GNUNET_YES if string matches, GNUNET_NO if not, GNUNET_SYSERR otherwise */ -int +static int evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) { const char *strp; @@ -1178,7 +1295,7 @@ evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) struct StateSet *sset; struct StateSet *new_sset; int i; - int eval; + int result; if (NFA != a->type) { @@ -1187,7 +1304,7 @@ evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) return GNUNET_SYSERR; } - eval = GNUNET_NO; + result = GNUNET_NO; strp = string; sset = GNUNET_malloc (sizeof (struct StateSet)); GNUNET_array_append (sset->states, sset->len, a->start); @@ -1205,13 +1322,13 @@ evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) s = sset->states[i]; if (NULL != s && s->accepting) { - eval = GNUNET_YES; + result = GNUNET_YES; break; } } state_set_clear (sset); - return eval; + return result; } @@ -1226,22 +1343,22 @@ evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) int GNUNET_REGEX_eval (struct GNUNET_REGEX_Automaton *a, const char *string) { - int eval; + int result; switch (a->type) { case DFA: - eval = evaluate_dfa (a, string); + result = evaluate_dfa (a, string); break; case NFA: - eval = evaluate_nfa (a, string); + result = evaluate_nfa (a, string); break; default: GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Evaluating regex failed, automaton has no type!\n"); - eval = GNUNET_SYSERR; + result = GNUNET_SYSERR; break; } - return eval; + return result; } -- 2.25.1