From: Maximilian Szengel Date: Tue, 10 Apr 2012 14:30:19 +0000 (+0000) Subject: dfa minimization wip X-Git-Tag: initial-import-from-subversion-38251~13961 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=24199bfb61f162a0ce520e0b4061560c20ba84e5;p=oweals%2Fgnunet.git dfa minimization wip --- diff --git a/src/regex/regex.c b/src/regex/regex.c index 250cca47c..0ebce7a89 100644 --- a/src/regex/regex.c +++ b/src/regex/regex.c @@ -177,6 +177,16 @@ debug_print_transitions (struct State *s) } } +/** + * Compare two states. Used for sorting. + * + * @param a first state + * @param b second state + * + * @return an integer less than, equal to, or greater than zero + * if the first argument is considered to be respectively + * less than, equal to, or greater than the second. + */ static int state_compare (const void *a, const void *b) { @@ -329,16 +339,110 @@ automaton_destroy_state (struct State *s) GNUNET_free (s); } +/** + * Remove a state from the given automaton 'a'. Always use this function + * when altering the states of an automaton. Will also remove all transitions + * leading to this state, before destroying it. + * + * @param a automaton + * @param s state to remove + */ static void automaton_remove_state (struct GNUNET_REGEX_Automaton *a, struct State *s) { struct State *ss; + struct State *s_check; + struct Transition *t_check; + + // remove state ss = s; GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s); a->state_count--; + + // remove all transitions leading to this state + for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) + { + for (t_check = s_check->transitions_head; NULL != t_check; + t_check = t_check->next) + { + if (t_check->state == ss) + { + GNUNET_CONTAINER_DLL_remove (s_check->transitions_head, + s_check->transitions_tail, t_check); + s_check->transition_count--; + } + } + } + automaton_destroy_state (ss); } +/** + * Merge two states into one. Will merge 's1' and 's2' into 's1' and destroy 's2'. + * + * @param ctx context + * @param a automaton + * @param s1 first state + * @param s2 second state, will be destroyed + */ +static void +automaton_merge_states (struct GNUNET_REGEX_Context *ctx, + struct GNUNET_REGEX_Automaton *a, struct State *s1, + struct State *s2) +{ + struct State *s_check; + struct Transition *t_check; + struct Transition *t; + char *new_name; + + GNUNET_assert (NULL != ctx && NULL != a && NULL != s1 && NULL != s2); + + // 1. Make all transitions pointing to s2 point to s1 + for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) + { + for (t_check = s_check->transitions_head; NULL != t_check; + t_check = t_check->next) + { + if (s_check != s2 && s2 == t_check->state) + t_check->state = s1; + } + } + + // 2. Add all transitions from s2 to sX to s1 + for (t_check = s2->transitions_head; NULL != t_check; t_check = t_check->next) + { + for (t = s1->transitions_head; NULL != t; t = t->next) + { + if (t_check->literal != t->literal && NULL != t_check->state && + t_check->state != t->state && t_check->state != s2) + { + add_transition (ctx, s1, t_check->literal, t_check->state); + } + } + } + + // 3. Rename s1 to {s1,s2} + new_name = GNUNET_malloc (strlen (s1->name) + strlen (s2->name) + 1); + strncat (new_name, s1->name, strlen (s1->name)); + strncat (new_name, s2->name, strlen (s2->name)); + if (NULL != s1->name) + GNUNET_free (s1->name); + s1->name = new_name; + + // remove state + s_check = s2; + GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s_check); + a->state_count--; + automaton_destroy_state (s_check); +} + +/** + * Add a state to the automaton 'a', always use this function to + * alter the states DLL of the automaton. + * + * @param a automaton to add the state to + * @param s state that should be added + */ static void automaton_add_state (struct GNUNET_REGEX_Automaton *a, struct State *s) { @@ -493,9 +597,9 @@ dfa_remove_unreachable_states (struct GNUNET_REGEX_Automaton *a) stack_len++; while (stack_len > 0) { - s = stack[stack_len-1]; + s = stack[stack_len - 1]; stack_len--; - s->marked = 1; // mark s as visited + s->marked = 1; // mark s as visited for (t = s->transitions_head; NULL != t; t = t->next) { if (NULL != t->state && 0 == t->state->marked) @@ -525,9 +629,7 @@ static void dfa_remove_dead_states (struct GNUNET_REGEX_Automaton *a) { struct State *s; - struct State *s_check; struct Transition *t; - struct Transition *t_check; int dead; GNUNET_assert (DFA == a->type); @@ -551,20 +653,6 @@ dfa_remove_dead_states (struct GNUNET_REGEX_Automaton *a) continue; // state s is dead, remove it - // 1. remove all transitions to this state - for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) - { - for (t_check = s_check->transitions_head; NULL != t_check; - t_check = t_check->next) - { - if (t_check->state == s) - { - GNUNET_CONTAINER_DLL_remove (s_check->transitions_head, - s_check->transitions_tail, t_check); - } - } - } - // 2. remove state automaton_remove_state (a, s); } } @@ -575,9 +663,80 @@ dfa_remove_dead_states (struct GNUNET_REGEX_Automaton *a) * @param a DFA automaton */ static void -dfa_merge_nondistinguishable_states (struct GNUNET_REGEX_Automaton *a) +dfa_merge_nondistinguishable_states (struct GNUNET_REGEX_Context *ctx, + struct GNUNET_REGEX_Automaton *a) { + int i; + int table[a->state_count][a->state_count]; + struct State *s1; + struct State *s2; + struct Transition *t1; + struct Transition *t2; + int change; + + change = 1; + for (i = 0, s1 = a->states_head; i < a->state_count && NULL != s1; + i++, s1 = s1->next) + s1->marked = i; + + // Mark all pairs of accepting/!accepting states + for (s1 = a->states_head; NULL != s1; s1 = s1->next) + { + for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2->next) + { + if ((s1->accepting && !s2->accepting) || + (!s1->accepting && s2->accepting)) + { + table[s1->marked][s2->marked] = 1; + } + else + table[s1->marked][s2->marked] = 0; + } + } + + while (0 != change) + { + change = 0; + for (s1 = a->states_head; NULL != s1; s1 = s1->next) + { + for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2->next) + { + if (0 != table[s1->marked][s2->marked]) + continue; + for (t1 = s1->transitions_head; NULL != t1; t1 = t1->next) + { + for (t2 = s2->transitions_head; NULL != t2; t2 = t2->next) + { + if (t1->literal == t2->literal && t1->state == t2->state && + (0 != table[t1->state->marked][t2->state->marked] || + 0 != table[t2->state->marked][t1->state->marked])) + { + table[s1->marked][s2->marked] = t1->literal; + change = 1; + } + else if (t1->literal != t2->literal && t1->state != t2->state) + { + table[s1->marked][s2->marked] = -1; + change = 1; + } + } + } + } + } + } + + struct State *s2_next; + + for (i = 0, s1 = a->states_head; NULL != s1; s1 = s1->next) + { + for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2_next) + { + s2_next = s2->next; + if (s1 != s2 && table[s1->marked][s2->marked] == 0) + automaton_merge_states (ctx, a, s1, s2); + } + } } /** @@ -587,7 +746,8 @@ dfa_merge_nondistinguishable_states (struct GNUNET_REGEX_Automaton *a) * @param a DFA automaton */ static void -dfa_minimize (struct GNUNET_REGEX_Automaton *a) +dfa_minimize (struct GNUNET_REGEX_Context *ctx, + struct GNUNET_REGEX_Automaton *a) { if (NULL == a) return; @@ -601,7 +761,7 @@ dfa_minimize (struct GNUNET_REGEX_Automaton *a) dfa_remove_dead_states (a); // 3. Merge nondistinguishable states - dfa_merge_nondistinguishable_states (a); + dfa_merge_nondistinguishable_states (ctx, a); } /** @@ -1169,7 +1329,7 @@ GNUNET_REGEX_construct_dfa (const char *regex, const size_t len) if (NULL == nfa) { - GNUNET_log (GNUNET_ERROR_TYPE_ERROR, + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not create DFA, because NFA creation failed\n"); return NULL; } @@ -1225,7 +1385,7 @@ GNUNET_REGEX_construct_dfa (const char *regex, const size_t len) GNUNET_free (dfa_stack); GNUNET_REGEX_automaton_destroy (nfa); - dfa_minimize (dfa); + /*dfa_minimize (&ctx, dfa);*/ return dfa; } @@ -1401,7 +1561,6 @@ evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) return result; } - /** * Evaluates the given 'string' against the given compiled regex * diff --git a/src/regex/test_regex.c b/src/regex/test_regex.c index 65c9e6c17..373e5365f 100644 --- a/src/regex/test_regex.c +++ b/src/regex/test_regex.c @@ -42,18 +42,17 @@ struct Regex_String_Pair }; static const char allowed_literals[] = - "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; + "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"; int -test_random (unsigned int rx_length, unsigned int max_str_len, unsigned int str_count) +test_random (unsigned int rx_length, unsigned int max_str_len, + unsigned int str_count) { int i; int j; int rx_exp; - char rand_rx[rx_length+1]; - char matching_str[str_count][max_str_len+1]; + char rand_rx[rx_length + 1]; + char matching_str[str_count][max_str_len + 1]; char *rand_rxp; char *matching_strp; int char_op_switch; @@ -79,41 +78,40 @@ test_random (unsigned int rx_length, unsigned int max_str_len, unsigned int str_ last_was_op = 1; // Generate random regex and a string that matches the regex - for (i=0; istring_count; i++) + for (i = 0; i < rxstr->string_count; i++) { eval = GNUNET_REGEX_eval (a, rxstr->strings[i]); eval_check = regexec (rx, rxstr->strings[i], 1, matchptr, 0); // We only want to match the whole string, because that's what our DFA does, too. - if (eval_check == 0 && (matchptr[0].rm_so != 0 || matchptr[0].rm_eo != strlen (rxstr->strings[i]))) + if (eval_check == 0 && + (matchptr[0].rm_so != 0 || + matchptr[0].rm_eo != strlen (rxstr->strings[i]))) eval_check = 1; - if ((rxstr->expected_results[i] == match - && (0 != eval || 0 != eval_check)) - || - (rxstr->expected_results[i] == nomatch - && (0 == eval || 0 == eval_check))) + if ((rxstr->expected_results[i] == match && (0 != eval || 0 != eval_check)) + || (rxstr->expected_results[i] == nomatch && + (0 == eval || 0 == eval_check))) { - result = 1; - regerror (eval_check, rx, error, sizeof error); - GNUNET_log (GNUNET_ERROR_TYPE_ERROR, - "Unexpected result:\nregex: %s\nstring: %s\nexpected result: %i\n" - "gnunet regex: %i\nglibc regex: %i\nglibc error: %s\nrm_so: %i\nrm_eo: %i\n\n", - rxstr->regex, rxstr->strings[i], rxstr->expected_results[i], - eval, eval_check, error, matchptr[0].rm_so, matchptr[0].rm_eo); + result = 1; + regerror (eval_check, rx, error, sizeof error); + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, + "Unexpected result:\nregex: %s\nstring: %s\nexpected result: %i\n" + "gnunet regex: %i\nglibc regex: %i\nglibc error: %s\nrm_so: %i\nrm_eo: %i\n\n", + rxstr->regex, rxstr->strings[i], rxstr->expected_results[i], + eval, eval_check, error, matchptr[0].rm_so, + matchptr[0].rm_eo); } } return result; @@ -239,23 +243,34 @@ main (int argc, char *argv[]) int check_nfa; int check_dfa; int check_rand; - struct Regex_String_Pair rxstr[2] = { - {"ab(c|d)+c*(a(b|c)d)+", 5, - {"abcdcdcdcdddddabd", "abcd", "abcddddddccccccccccccccccccccccccabdacdabd", "abccccca", "abcdcdcdccdabdabd"}, + + struct Regex_String_Pair rxstr[4] = { + {"ab(c|d)+c*(a(b|c)d)+", 5, + {"abcdcdcdcdddddabd", "abcd", "abcddddddccccccccccccccccccccccccabdacdabd", + "abccccca", "abcdcdcdccdabdabd"}, {match, nomatch, match, nomatch, match}}, - {"ab+c*(a(bx|c)d)+", 5, - {"abcdcdcdcdddddabd", "abcd", "abcddddddccccccccccccccccccccccccabdacdabd", "abccccca", "abcdcdcdccdabdabd"}, - {nomatch, nomatch, nomatch, nomatch, nomatch}}}; + {"ab+c*(a(bx|c)d)+", 5, + {"abcdcdcdcdddddabd", "abcd", "abcddddddccccccccccccccccccccccccabdacdabd", + "abccccca", "abcdcdcdccdabdabd"}, + {nomatch, nomatch, nomatch, nomatch, nomatch}}, + {"k|a+X*y+c|Q*e|p|R|Z*K*y*R+w|Y*6+n+h*k*w+V*F|W*B*e*g|N+V|t+L|P*j*3*9+X*h*J|J*6|b|E*i*f*R+S|Z|R|Y*Z|g*", 1, + {"kaXycQepRZKyRwY6nhkwVFWBegNVtLPj39XhJJ6bEifRSZRYZg"}, + {nomatch}}, + {"k|a+X*y+c|Q*e|p|R|Z*K*y*R+w|Y*6+n+h*k*w+V*F|W*B*e*g|N+V|t+L|P*j*3*9+X*h*J|J*6|b|E*i*f*R+S|Z|R|Y*Z|g*", 1, + {"kaXycQepRZKyRwY6nhkwVFWBegNVtLPj39XhJJ6bEifRSZRYZg"}, + {nomatch}} + }; check_nfa = 0; check_dfa = 0; check_rand = 0; - for (i=0; i<2; i++) + for (i = 0; i < 4; i++) { if (0 != regcomp (&rx, rxstr[i].regex, REG_EXTENDED)) { - GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not compile regex using regcomp()\n"); + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, + "Could not compile regex using regcomp()\n"); return 1; } @@ -272,8 +287,8 @@ main (int argc, char *argv[]) regfree (&rx); } - srand (time(NULL)); - for (i=0; i< 100; i++) + srand (time (NULL)); + for (i = 0; i < 100; i++) check_rand += test_random (100, 150, 10); return check_nfa + check_dfa + check_rand;