From 16cf819a0feb38c36b046c59febae5bc511a3d1b Mon Sep 17 00:00:00 2001 From: Maximilian Szengel Date: Mon, 11 Jun 2012 15:10:21 +0000 Subject: [PATCH] simplified regex/proof generation --- src/regex/regex.c | 276 +++++++++++++++++++++-------- src/regex/test_regex_eval_api.c | 25 ++- src/regex/test_regex_iterate_api.c | 15 +- 3 files changed, 235 insertions(+), 81 deletions(-) diff --git a/src/regex/regex.c b/src/regex/regex.c index 20c496dd4..d697aee89 100644 --- a/src/regex/regex.c +++ b/src/regex/regex.c @@ -456,6 +456,7 @@ state_add_transition (struct GNUNET_REGEX_Context *ctx, { int is_dup; struct Transition *t; + struct Transition *oth; if (NULL == from_state) { @@ -478,6 +479,13 @@ state_add_transition (struct GNUNET_REGEX_Context *ctx, if (is_dup) return; + // sort transitions by label + for (oth = from_state->transitions_head; NULL != oth; oth = oth->next) + { + if (oth->label > label) + break; + } + t = GNUNET_malloc (sizeof (struct Transition)); t->id = ctx->transition_id++; t->label = label; @@ -486,8 +494,8 @@ state_add_transition (struct GNUNET_REGEX_Context *ctx, // Add outgoing transition to 'from_state' from_state->transition_count++; - GNUNET_CONTAINER_DLL_insert (from_state->transitions_head, - from_state->transitions_tail, t); + GNUNET_CONTAINER_DLL_insert_before (from_state->transitions_head, + from_state->transitions_tail, oth, t); } /** @@ -831,11 +839,14 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) char *R_cur_r; int length_l; int length_r; + int s_cnt; + int l_cnt; char *complete_regex; k = 0; n = a->state_count; + // number the states for (i = 0, s = a->states_head; NULL != s; s = s->next, i++) { s->marked = i; @@ -891,6 +902,8 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) { for (j = 0; j < n; j++) { + //construct R_cur + // 0*R = R*0 = 0 // 0+R = R+0 = R if (NULL == R_last[i][k] || NULL == R_last[k][j]) @@ -898,6 +911,15 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) if (NULL != R_last[i][j]) R_cur[i][j] = GNUNET_strdup (R_last[i][j]); } + // a(ba)*b = (ab)+ + /*else if ((NULL == R_last[i][j] || 0 == strlen (R_last[i][j])) && */ + /*NULL != R_last[k][k] && 0 < strlen (R_last[k][k]) && */ + /*NULL != R_last[k][j] && 0 < strlen (R_last[k][j]) && */ + /*0 == strncmp (R_last[k][k], R_last[k][j], (strlen (R_last[k][k]) - 1)) && */ + /*R_last[i][k][0] == R_last[k][k][strlen (R_last[k][k]) - 1]) */ + /*{ */ + /*GNUNET_asprintf (&R_cur[i][j], "(%s%s)+", R_last[i][k], R_last[k][j]); */ + /*} */ else { // R(k)ij = R(k-1)ij + R(k-1)ik (R(k-1)kk)* R(k-1)kj @@ -911,27 +933,35 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) if (NULL != R_last[i][j]) strcat (R_cur_l, R_last[i][j]); - if (NULL != R_last[i][k]) + if (NULL != R_last[i][k] && 0 != strcmp (R_last[i][k], R_last[k][k])) strcat (R_cur_r, R_last[i][k]); if (NULL != R_last[k][k] && 0 != strcmp (R_last[k][k], "")) { - if (R_last[k][k][0] == '(' && R_last[k][k][strlen (R_last[k][k])-1] == ')') + if (R_last[k][k][0] == '(' && + R_last[k][k][strlen (R_last[k][k]) - 1] == ')') { strcat (R_cur_r, R_last[k][k]); - strcat (R_cur_r, "*"); } else { strcat (R_cur_r, "("); strcat (R_cur_r, R_last[k][k]); - strcat (R_cur_r, ")*"); + strcat (R_cur_r, ")"); } + + if (0 == strcmp (R_last[i][k], R_last[k][k]) || + 0 == strcmp (R_last[k][k], R_last[k][j])) + strcat (R_cur_r, "+"); + else + strcat (R_cur_r, "*"); } - if (NULL != R_last[k][j]) + if (NULL != R_last[k][j] && 0 != strcmp (R_last[k][k], R_last[k][j])) strcat (R_cur_r, R_last[k][j]); + // simplifications... + // | is idempotent: a | a = a for all a in A if (0 == strcmp (R_cur_l, R_cur_r) || 0 == strcmp (R_cur_l, "") || 0 == strcmp (R_cur_r, "")) @@ -941,32 +971,102 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) else GNUNET_asprintf (&R_cur[i][j], "%s", R_cur_l); } - // a | a a* a = a+ + // TODO: in theory only applicable if (e + a) | (e + a)(e + a)*(e+a) + // where e means epsilon... check if practical! + // a | a a* a = a* else if (R_last[i][j] == R_last[i][k] && R_last[i][k] == R_last[k][k] && R_last[k][k] == R_last[k][j]) { - GNUNET_asprintf (&R_cur[i][j], "%s+", R_last[i][j]); + if (1 >= strlen (R_last[k][k]) || + (R_last[k][k][0] == '(' && + R_last[k][k][strlen (R_last[k][k]) - 1] == ')')) + GNUNET_asprintf (&R_cur[i][j], "%s*", R_last[k][k]); + else + GNUNET_asprintf (&R_cur[i][j], "(%s)*", R_last[k][k]); } // a | a b* b => a | a b | a b b | ... => a b* else if (R_last[i][j] == R_last[i][k] && R_last[k][k] == R_last[k][j]) { - GNUNET_asprintf (&R_cur[i][j], "%s%s*", R_last[i][k], R_last[k][k]); + // if a is xb then a b* becomes x b b* = x b+ + + s_cnt = strlen (R_last[k][k]); + l_cnt = strlen (R_last[i][k]); + R_cur[i][j] = GNUNET_malloc (s_cnt + l_cnt + 4); + + if (l_cnt > 0 && s_cnt >= l_cnt) + for (; s_cnt > 0; s_cnt--, l_cnt--) + if (R_last[i][k][l_cnt] != R_last[k][k][s_cnt]) + break; + + if (strlen (R_last[i][k]) > 0 && 0 == s_cnt && 0 <= l_cnt) + strncat (R_cur[i][j], R_last[i][k], l_cnt); + else + strcat (R_cur[i][j], R_last[i][k]); + + if (1 >= strlen (R_last[k][k]) && + !(R_last[k][k][0] == '(' && + R_last[k][k][strlen (R_last[k][k]) - 1] == ')')) + { + strcat (R_cur[i][j], "("); + strcat (R_cur[i][j], R_last[k][k]); + strcat (R_cur[i][j], ")"); + } + else + strcat (R_cur[i][j], R_last[k][k]); + + if (0 == s_cnt && 0 <= l_cnt) + strcat (R_cur[i][j], "+"); + else + strcat (R_cur[i][j], "*"); } // a | b b* a => a | b a | b b a | ... => b* a else if (R_last[i][j] == R_last[k][j] && R_last[i][k] == R_last[k][k]) { - GNUNET_asprintf (&R_cur[i][j], "%s*%s", R_last[k][k], R_last[k][j]); + // if a is bx then b* a becomes b+ x + + temp = NULL; + s_cnt = strlen (R_last[k][k]); + l_cnt = strlen (R_last[k][j]); + R_cur[i][j] = GNUNET_malloc (s_cnt + l_cnt + 4); + + int bla; + + if (l_cnt > 0 && s_cnt >= l_cnt) + for (bla = 0; bla < s_cnt; bla++) + if (R_last[k][k][bla] != R_last[k][j][bla]) + break; + + if (1 >= strlen (R_last[k][k]) && + !(R_last[k][k][0] == '(' && + R_last[k][k][strlen (R_last[k][k]) - 1] == ')')) + { + strcat (R_cur[i][j], "("); + strcat (R_cur[i][j], R_last[k][k]); + strcat (R_cur[i][j], ")"); + } + else + strcat (R_cur[i][j], R_last[k][k]); + + if (bla == s_cnt) + strcat (R_cur[i][j], "+"); + else + strcat (R_cur[i][j], "*"); + + if (strlen (R_last[k][j]) > 0 && bla == s_cnt) + strcat (R_cur[i][j], &R_last[k][j][bla]); + else + strcat (R_cur[i][j], R_last[k][j]); } else GNUNET_asprintf (&R_cur[i][j], "(%s|%s)", R_cur_l, R_cur_r); GNUNET_free_non_null (R_cur_l); GNUNET_free_non_null (R_cur_r); - } } } + // set R_last = R_cur for (i = 0; i < n; i++) { for (j = 0; j < n; j++) @@ -991,7 +1091,7 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) &states[i]->hash); } - // complete regex for whole DFA + // complete regex for whole DFA: union of all pairs (start state/accepting state(s)). complete_regex = NULL; for (i = 0; i < n; i++) { @@ -1486,6 +1586,7 @@ nfa_closure_create (struct GNUNET_REGEX_Automaton *nfa, GNUNET_assert (0 == cls_check->len); GNUNET_free (cls_check); + // sort the states if (cls->len > 1) qsort (cls->states, cls->len, sizeof (struct GNUNET_REGEX_State *), state_compare); @@ -2047,6 +2148,7 @@ GNUNET_REGEX_automaton_destroy (struct GNUNET_REGEX_Automaton *a) return; GNUNET_free (a->regex); + GNUNET_free_non_null (a->computed_regex); for (s = a->states_head; NULL != s;) { @@ -2058,6 +2160,81 @@ GNUNET_REGEX_automaton_destroy (struct GNUNET_REGEX_Automaton *a) GNUNET_free (a); } +/** + * Save a state to an open file pointer. cls is expected to be a file pointer to + * an open file. Used only in conjunction with + * GNUNET_REGEX_automaton_save_graph. + * + * @param cls file pointer + * @param s state + */ +void +GNUNET_REGEX_automaton_save_graph_step (void *cls, struct GNUNET_REGEX_State *s) +{ + FILE *p; + struct Transition *ctran; + char *s_acc = NULL; + char *s_tran = NULL; + + p = cls; + + if (s->accepting) + { + GNUNET_asprintf (&s_acc, + "\"%s\" [shape=doublecircle, color=\"0.%i 0.8 0.95\"];\n", + s->name, s->scc_id); + } + else + { + GNUNET_asprintf (&s_acc, "\"%s\" [color=\"0.%i 0.8 0.95\"];\n", s->name, + s->scc_id); + } + + if (NULL == s_acc) + { + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not print state %s\n", s->name); + return; + } + fwrite (s_acc, strlen (s_acc), 1, p); + GNUNET_free (s_acc); + s_acc = NULL; + + for (ctran = s->transitions_head; NULL != ctran; ctran = ctran->next) + { + if (NULL == ctran->to_state) + { + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, + "Transition from State %i has has no state for transitioning\n", + s->id); + continue; + } + + if (ctran->label == 0) + { + GNUNET_asprintf (&s_tran, + "\"%s\" -> \"%s\" [label = \"epsilon\", color=\"0.%i 0.8 0.95\"];\n", + s->name, ctran->to_state->name, s->scc_id); + } + else + { + GNUNET_asprintf (&s_tran, + "\"%s\" -> \"%s\" [label = \"%c\", color=\"0.%i 0.8 0.95\"];\n", + s->name, ctran->to_state->name, ctran->label, s->scc_id); + } + + if (NULL == s_tran) + { + GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not print state %s\n", + s->name); + return; + } + + fwrite (s_tran, strlen (s_tran), 1, p); + GNUNET_free (s_tran); + s_tran = NULL; + } +} + /** * Save the given automaton as a GraphViz dot file * @@ -2068,10 +2245,6 @@ void GNUNET_REGEX_automaton_save_graph (struct GNUNET_REGEX_Automaton *a, const char *filename) { - struct GNUNET_REGEX_State *s; - struct Transition *ctran; - char *s_acc = NULL; - char *s_tran = NULL; char *start; char *end; FILE *p; @@ -2100,66 +2273,7 @@ GNUNET_REGEX_automaton_save_graph (struct GNUNET_REGEX_Automaton *a, start = "digraph G {\nrankdir=LR\n"; fwrite (start, strlen (start), 1, p); - for (s = a->states_head; NULL != s; s = s->next) - { - if (s->accepting) - { - GNUNET_asprintf (&s_acc, - "\"%s\" [shape=doublecircle, color=\"0.%i 0.8 0.95\"];\n", - s->name, s->scc_id); - } - else - { - GNUNET_asprintf (&s_acc, "\"%s\" [color=\"0.%i 0.8 0.95\"];\n", s->name, - s->scc_id); - } - - if (NULL == s_acc) - { - GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not print state %s\n", - s->name); - return; - } - fwrite (s_acc, strlen (s_acc), 1, p); - GNUNET_free (s_acc); - s_acc = NULL; - - for (ctran = s->transitions_head; NULL != ctran; ctran = ctran->next) - { - if (NULL == ctran->to_state) - { - GNUNET_log (GNUNET_ERROR_TYPE_ERROR, - "Transition from State %i has has no state for transitioning\n", - s->id); - continue; - } - - if (ctran->label == 0) - { - GNUNET_asprintf (&s_tran, - "\"%s\" -> \"%s\" [label = \"epsilon\", color=\"0.%i 0.8 0.95\"];\n", - s->name, ctran->to_state->name, s->scc_id); - } - else - { - GNUNET_asprintf (&s_tran, - "\"%s\" -> \"%s\" [label = \"%c\", color=\"0.%i 0.8 0.95\"];\n", - s->name, ctran->to_state->name, ctran->label, - s->scc_id); - } - - if (NULL == s_tran) - { - GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not print state %s\n", - s->name); - return; - } - - fwrite (s_tran, strlen (s_tran), 1, p); - GNUNET_free (s_tran); - s_tran = NULL; - } - } + automaton_traverse (p, a, GNUNET_REGEX_automaton_save_graph_step); end = "\n}\n"; fwrite (end, strlen (end), 1, p); @@ -2189,6 +2303,10 @@ evaluate_dfa (struct GNUNET_REGEX_Automaton *a, const char *string) s = a->start; + // If the string is empty but the starting state is accepting, we accept. + if ((NULL == string || 0 == strlen (string)) && s->accepting) + return 0; + for (strp = string; NULL != strp && *strp; strp++) { s = dfa_move (s, *strp); @@ -2227,6 +2345,10 @@ evaluate_nfa (struct GNUNET_REGEX_Automaton *a, const char *string) return -1; } + // If the string is empty but the starting state is accepting, we accept. + if ((NULL == string || 0 == strlen (string)) && a->start->accepting) + return 0; + result = 1; strp = string; sset = nfa_closure_create (a, a->start, 0); diff --git a/src/regex/test_regex_eval_api.c b/src/regex/test_regex_eval_api.c index 392fb3b36..89a757806 100644 --- a/src/regex/test_regex_eval_api.c +++ b/src/regex/test_regex_eval_api.c @@ -263,8 +263,9 @@ main (int argc, char *argv[]) int check_nfa; int check_dfa; int check_rand; + char *check_proof; - struct Regex_String_Pair rxstr[8] = { + struct Regex_String_Pair rxstr[12] = { {"ab?(abcd)?", 5, {"ababcd", "abab", "aabcd", "a", "abb"}, {match, nomatch, match, match, nomatch}}, @@ -288,6 +289,19 @@ main (int argc, char *argv[]) {"V|M*o?x*p*d+h+b|E*m?h?Y*E*O?W*W*P+o?Z+H*M|I*q+C*a+5?5*9|b?z|G*y*k?R|p+u|8*h?B+l*H|e|L*O|1|F?v*0?5|C+", 1, {"VMoxpdhbEmhYEOWWPoZHMIqCa559bzGykRpu8hBlHeLO1Fv05C"}, {nomatch}}, + {"(bla)*", 8, + {"", "bla", "blabla", "bl", "la", "b", "l", "a"}, + {match, match, match, nomatch, nomatch, nomatch, nomatch, nomatch}}, + {"ab(c|d)+c*(a(b|c)+d)+(bla)(bla)*", 8, + {"ab", "abcabdbla", "abdcccccccccccabcbccdblablabla", "bl", "la", "b", "l", + "a"}, + {nomatch, match, match, nomatch, nomatch, nomatch, nomatch, nomatch}}, + {"a|aa*a", 6, + {"", "a", "aa", "aaa", "aaaa", "aaaaa"}, + {nomatch, match, match, match, match, match}}, + {"ab(c|d)+c*(a(b|c)+d)+(bla)+", 1, + {"abcabdblaacdbla"}, + {nomatch}}, {"ab(c|d)+c*(a(b|c)d)+", 1, {"abacd"}, {nomatch}} @@ -297,7 +311,7 @@ main (int argc, char *argv[]) check_dfa = 0; check_rand = 0; - for (i = 0; i < 8; i++) + for (i = 0; i < 12; i++) { if (0 != regcomp (&rx, rxstr[i].regex, REG_EXTENDED)) { @@ -314,7 +328,14 @@ main (int argc, char *argv[]) // DFA test a = GNUNET_REGEX_construct_dfa (rxstr[i].regex, strlen (rxstr[i].regex)); check_dfa += test_automaton (a, &rx, &rxstr[i]); + check_proof = GNUNET_strdup (GNUNET_REGEX_get_computed_regex (a)); + GNUNET_REGEX_automaton_destroy (a); + a = GNUNET_REGEX_construct_dfa (check_proof, strlen (check_proof)); + check_dfa += test_automaton (a, &rx, &rxstr[i]); GNUNET_REGEX_automaton_destroy (a); + if (0 != check_dfa) + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "check_proof: %s\n", check_proof); + GNUNET_free_non_null (check_proof); regfree (&rx); } diff --git a/src/regex/test_regex_iterate_api.c b/src/regex/test_regex_iterate_api.c index 6c5b0e55b..90907baee 100644 --- a/src/regex/test_regex_iterate_api.c +++ b/src/regex/test_regex_iterate_api.c @@ -60,7 +60,10 @@ main (int argc, char *argv[]) struct GNUNET_REGEX_Automaton *dfa; error = 0; - /*regex = "ab(c|d)+c*(a(b|c)+d)+(bla)+"; */ + regex = "ab(c|d)+c*(a(b|c)+d)+(bla)+"; + /*regex = "(bla)+"; */ + /*regex = "b(lab)*la"; */ + /*regex = "(bla)*"; */ /*regex = "ab(c|d)+c*(a(b|c)+d)+(bla)(bla)*"; */ /*regex = "z(abc|def)?xyz"; */ /*regex = "1*0(0|1)*"; */ @@ -68,7 +71,15 @@ main (int argc, char *argv[]) /*regex = "abcd:(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)"; */ /*regex = "abc(1|0)*def"; */ /*regex = "ab|ac"; */ - regex = "(ab)(ab)*"; + /*regex = "(ab)(ab)*"; */ + /*regex = "ab|cd|ef|gh"; */ + /*regex = "a|b|c|d|e|f|g"; */ + /*regex = "(ab)|(ac)"; */ + /*regex = "a(b|c)"; */ + /*regex = "a*a"; */ + /*regex = "ab?(abcd)?"; */ + /*regex = "(ab)+"; */ + /*regex = "(abcsdfsdf)+"; */ dfa = GNUNET_REGEX_construct_dfa (regex, strlen (regex)); GNUNET_REGEX_automaton_save_graph (dfa, "dfa.dot"); GNUNET_REGEX_iterate_all_edges (dfa, key_iterator, NULL); -- 2.25.1