From: Maximilian Szengel Date: Mon, 2 Jul 2012 12:22:37 +0000 (+0000) Subject: regex bugfixes X-Git-Tag: initial-import-from-subversion-38251~12703 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=a4b40b20949b4ea21d98bed902906cdf56d41326;p=oweals%2Fgnunet.git regex bugfixes --- diff --git a/src/regex/regex.c b/src/regex/regex.c index 09cd910a0..18f6b60ea 100644 --- a/src/regex/regex.c +++ b/src/regex/regex.c @@ -503,7 +503,7 @@ state_add_transition (struct GNUNET_REGEX_Context *ctx, } } - if (is_dup) + if (GNUNET_YES == is_dup) return; // sort transitions by label @@ -734,21 +734,41 @@ automaton_merge_states (struct GNUNET_REGEX_Context *ctx, { struct GNUNET_REGEX_State *s_check; struct Transition *t_check; + struct Transition *t; + struct Transition *t_next; char *new_name; + int is_dup; GNUNET_assert (NULL != ctx && NULL != a && NULL != s1 && NULL != s2); if (s1 == s2) return; - // 1. Make all transitions pointing to s2 point to s1 + // 1. Make all transitions pointing to s2 point to s1, unless this transition + // does not already exists, if it already exists remove transition. for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) { - for (t_check = s_check->transitions_head; NULL != t_check; - t_check = t_check->next) + for (t_check = s_check->transitions_head; NULL != t_check; t_check = t_next) { + t_next = t_check->next; + if (s2 == t_check->to_state) - t_check->to_state = s1; + { + is_dup = GNUNET_NO; + for (t = t_check->from_state->transitions_head; NULL != t; t = t->next) + { + if (t->to_state == s1 && t_check->label == t->label) + is_dup = GNUNET_YES; + } + if (GNUNET_NO == is_dup) + t_check->to_state = s1; + else + { + GNUNET_CONTAINER_DLL_remove (t_check->from_state->transitions_head, + t_check->from_state->transitions_tail, + t_check); + } + } } } @@ -1015,22 +1035,23 @@ number_states (void *cls, unsigned int count, struct GNUNET_REGEX_State *s) states[count] = s; } - -/** - * create proofs for all states in the given automaton. Implementation of the - * algorithm descriped in chapter 3.2.1 of "Automata Theory, Languages, and - * Computation 3rd Edition" by Hopcroft, Motwani and Ullman. - * - * @param a automaton. +/** + * Construct the regular expression given the inductive step, + * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* + * R^{(k-1)}_{kj}, and simplify the resulting expression saved in R_cur_ij. + * + * @param R_last_ij value of $R^{(k-1)_{ij}. + * @param R_last_ik value of $R^{(k-1)_{ik}. + * @param R_last_kk value of $R^{(k-1)_{kk}. + * @param R_last_kj value of $R^{(k-1)_{kj}. + * @param R_cur_ij result for this inductive step is saved in R_cur_ij, R_cur_ij + * is expected to be NULL when called! */ static void -automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) +automaton_create_proofs_simplify (char *R_last_ij, char *R_last_ik, + char *R_last_kk, char *R_last_kj, + char **R_cur_ij) { - unsigned int n = a->state_count; - struct GNUNET_REGEX_State *states[n]; - char *R_last[n][n]; - char *R_cur[n][n]; - struct Transition *t; char *R_cur_l; char *R_cur_r; char *temp_a; @@ -1039,11 +1060,7 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) char *R_temp_ik; char *R_temp_kj; char *R_temp_kk; - char *complete_regex; - unsigned int i; - unsigned int j; - unsigned int k; - unsigned int cnt; + int eps_check; int ij_ik_cmp; int ij_kj_cmp; @@ -1053,10 +1070,382 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) int kk_kj_cmp; int clean_ik_kk_cmp; int clean_kk_kj_cmp; + unsigned int cnt; + size_t length; size_t length_l; size_t length_r; + GNUNET_assert (NULL == *R_cur_ij && NULL != R_cur_ij); + + // $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} + // R_last == R^{(k-1)}, R_cur == R^{(k)} + // R_cur_ij = R_cur_l | R_cur_r + // R_cur_l == R^{(k-1)}_{ij} + // R_cur_r == R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} + + if ((NULL == R_last_ij) && ((NULL == R_last_ik) || (NULL == R_last_kk) || /* technically cannot happen, but looks saner */ + (NULL == R_last_kj))) + { + /* R^{(k)}_{ij} = N | N */ + *R_cur_ij = NULL; + return; + } + + if ((NULL == R_last_ik) || (NULL == R_last_kk) || /* technically cannot happen, but looks saner */ + (NULL == R_last_kj)) + { + /* R^{(k)}_{ij} = R^{(k-1)}_{ij} | N */ + *R_cur_ij = GNUNET_strdup (R_last_ij); + return; + } + + // $R^{(k)}_{ij} = N | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} OR + // $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} + + /* *R_cur_ij = NULL; */ + R_cur_r = NULL; + R_cur_l = NULL; + + // cache results from strcmp, we might need these many times + ij_kj_cmp = nullstrcmp (R_last_ij, R_last_kj); + ij_ik_cmp = nullstrcmp (R_last_ij, R_last_ik); + ik_kk_cmp = nullstrcmp (R_last_ik, R_last_kk); + //ik_kj_cmp = nullstrcmp (R_last_ik, R_last_kj); + kk_kj_cmp = nullstrcmp (R_last_kk, R_last_kj); + + // Assign R_temp_(ik|kk|kj) to R_last[][] and remove epsilon as well + // as parentheses, so we can better compare the contents + R_temp_ik = remove_parentheses (remove_epsilon (R_last_ik)); + R_temp_kk = remove_parentheses (remove_epsilon (R_last_kk)); + R_temp_kj = remove_parentheses (remove_epsilon (R_last_kj)); + + clean_ik_kk_cmp = nullstrcmp (R_last_ik, R_temp_kk); + clean_kk_kj_cmp = nullstrcmp (R_temp_kk, R_last_kj); + + // construct R_cur_l (and, if necessary R_cur_r) + if (NULL != R_last_ij) + { + // Assign R_temp_ij to R_last_ij and remove epsilon as well + // as parentheses, so we can better compare the contents + R_temp_ij = remove_parentheses (remove_epsilon (R_last_ij)); + + if (0 == strcmp (R_temp_ij, R_temp_ik) && 0 == strcmp (R_temp_ik, R_temp_kk) + && 0 == strcmp (R_temp_kk, R_temp_kj)) + { + if (0 == strlen (R_temp_ij)) + { + R_cur_r = GNUNET_strdup (""); + } + else if ((0 == strncmp (R_last_ij, "(|", 2)) || + (0 == strncmp (R_last_ik, "(|", 2) && + 0 == strncmp (R_last_kj, "(|", 2))) + { + // a|(e|a)a*(e|a) = a* + // a|(e|a)(e|a)*(e|a) = a* + // (e|a)|aa*a = a* + // (e|a)|aa*(e|a) = a* + // (e|a)|(e|a)a*a = a* + // (e|a)|(e|a)a*(e|a) = a* + // (e|a)|(e|a)(e|a)*(e|a) = a* + if (GNUNET_YES == needs_parentheses (R_temp_ij)) + GNUNET_asprintf (&R_cur_r, "(%s)*", R_temp_ij); + else + GNUNET_asprintf (&R_cur_r, "%s*", R_temp_ij); + } + else + { + // a|aa*a = a+ + // a|(e|a)a*a = a+ + // a|aa*(e|a) = a+ + // a|(e|a)(e|a)*a = a+ + // a|a(e|a)*(e|a) = a+ + if (GNUNET_YES == needs_parentheses (R_temp_ij)) + GNUNET_asprintf (&R_cur_r, "(%s)+", R_temp_ij); + else + GNUNET_asprintf (&R_cur_r, "%s+", R_temp_ij); + } + } + else if (0 == ij_ik_cmp && 0 == clean_kk_kj_cmp && 0 != clean_ik_kk_cmp) + { + // a|ab*b = ab* + if (strlen (R_last_kk) < 1) + R_cur_r = GNUNET_strdup (R_last_ij); + else if (GNUNET_YES == needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "%s(%s)*", R_last_ij, R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "%s%s*", R_last_ij, R_last_kk); + + R_cur_l = NULL; + } + else if (0 == ij_kj_cmp && 0 == clean_ik_kk_cmp && 0 != clean_kk_kj_cmp) + { + // a|bb*a = b*a + if (strlen (R_last_kk) < 1) + R_cur_r = GNUNET_strdup (R_last_kj); + else if (GNUNET_YES == needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)*%s", R_temp_kk, R_last_kj); + else + GNUNET_asprintf (&R_cur_r, "%s*%s", R_temp_kk, R_last_kj); + + R_cur_l = NULL; + } + else if (0 == ij_ik_cmp && 0 == kk_kj_cmp && !has_epsilon (R_last_ij) && + has_epsilon (R_last_kk)) + { + // a|a(e|b)*(e|b) = a|ab* = a|a|ab|abb|abbb|... = ab* + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "%s(%s)*", R_last_ij, R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "%s%s*", R_last_ij, R_temp_kk); + + R_cur_l = NULL; + } + else if (0 == ij_kj_cmp && 0 == ik_kk_cmp && !has_epsilon (R_last_ij) && + has_epsilon (R_last_kk)) + { + // a|(e|b)(e|b)*a = a|b*a = a|a|ba|bba|bbba|... = b*a + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)*%s", R_temp_kk, R_last_ij); + else + GNUNET_asprintf (&R_cur_r, "%s*%s", R_temp_kk, R_last_ij); + + R_cur_l = NULL; + } + else + { + temp_a = (NULL == R_last_ij) ? NULL : GNUNET_strdup (R_last_ij); + temp_a = remove_parentheses (temp_a); + R_cur_l = temp_a; + } + + GNUNET_free_non_null (R_temp_ij); + } + else + { + // we have no left side + R_cur_l = NULL; + } + + // construct R_cur_r, if not already constructed + if (NULL == R_cur_r) + { + length = strlen (R_temp_kk) - strlen (R_last_ik); + + // a(ba)*bx = (ab)+x + if (length > 0 && NULL != R_last_kk && 0 < strlen (R_last_kk) && + NULL != R_last_kj && 0 < strlen (R_last_kj) && NULL != R_last_ik && + 0 < strlen (R_last_ik) && 0 == strkcmp (R_temp_kk, R_last_ik, length) && + 0 == strncmp (R_temp_kk, R_last_kj, length)) + { + temp_a = GNUNET_malloc (length + 1); + temp_b = GNUNET_malloc ((strlen (R_last_kj) - length) + 1); + + length_l = 0; + length_r = 0; + + for (cnt = 0; cnt < strlen (R_last_kj); cnt++) + { + if (cnt < length) + { + temp_a[length_l] = R_last_kj[cnt]; + length_l++; + } + else + { + temp_b[length_r] = R_last_kj[cnt]; + length_r++; + } + } + temp_a[length_l] = '\0'; + temp_b[length_r] = '\0'; + + // e|(ab)+ = (ab)* + if (NULL != R_cur_l && 0 == strlen (R_cur_l) && 0 == strlen (temp_b)) + { + GNUNET_asprintf (&R_cur_r, "(%s%s)*", R_last_ik, temp_a); + GNUNET_free (R_cur_l); + R_cur_l = NULL; + } + else + { + GNUNET_asprintf (&R_cur_r, "(%s%s)+%s", R_last_ik, temp_a, temp_b); + } + GNUNET_free (temp_a); + GNUNET_free (temp_b); + } + else if (0 == strcmp (R_temp_ik, R_temp_kk) && + 0 == strcmp (R_temp_kk, R_temp_kj)) + { + // (e|a)a*(e|a) = a* + // (e|a)(e|a)*(e|a) = a* + if (has_epsilon (R_last_ik) && has_epsilon (R_last_kj)) + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)*", R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "%s*", R_temp_kk); + } + // aa*a = a+a + else if (0 == clean_ik_kk_cmp && 0 == clean_kk_kj_cmp && + !has_epsilon (R_last_ik)) + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_temp_kk, R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_temp_kk, R_temp_kk); + } + // (e|a)a*a = a+ + // aa*(e|a) = a+ + // a(e|a)*(e|a) = a+ + // (e|a)a*a = a+ + else + { + eps_check = + (has_epsilon (R_last_ik) + has_epsilon (R_last_kk) + + has_epsilon (R_last_kj)); + + if (eps_check == 1) + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)+", R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "%s+", R_temp_kk); + } + } + } + // aa*b = a+b + // (e|a)(e|a)*b = a*b + else if (0 == strcmp (R_temp_ik, R_temp_kk)) + { + if (has_epsilon (R_last_ik)) + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)*%s", R_temp_kk, R_last_kj); + else + GNUNET_asprintf (&R_cur_r, "%s*%s", R_temp_kk, R_last_kj); + } + else + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_temp_kk, R_last_kj); + else + GNUNET_asprintf (&R_cur_r, "%s+%s", R_temp_kk, R_last_kj); + } + } + // ba*a = ba+ + // b(e|a)*(e|a) = ba* + else if (0 == strcmp (R_temp_kk, R_temp_kj)) + { + if (has_epsilon (R_last_kj)) + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "%s(%s)*", R_last_ik, R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "%s%s*", R_last_ik, R_temp_kk); + } + else + { + if (needs_parentheses (R_temp_kk)) + GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_last_ik, R_temp_kk); + else + GNUNET_asprintf (&R_cur_r, "%s+%s", R_last_ik, R_temp_kk); + } + } + else + { + if (strlen (R_temp_kk) > 0) + { + if (needs_parentheses (R_temp_kk)) + { + GNUNET_asprintf (&R_cur_r, "%s(%s)*%s", R_last_ik, R_temp_kk, + R_last_kj); + } + else + { + GNUNET_asprintf (&R_cur_r, "%s%s*%s", R_last_ik, R_temp_kk, + R_last_kj); + } + } + else + { + GNUNET_asprintf (&R_cur_r, "%s%s", R_last_ik, R_last_kj); + } + } + } + + /* GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "R_cur_l: %s\n", R_cur_l); */ + /* GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "R_cur_r: %s\n", R_cur_r); */ + + if (NULL == R_cur_l && NULL == R_cur_r) + { + *R_cur_ij = NULL; + return; + } + + if (NULL != R_cur_l && NULL == R_cur_r) + { + *R_cur_ij = GNUNET_strdup (R_cur_l); + return; + } + + if (NULL == R_cur_l && NULL != R_cur_r) + { + *R_cur_ij = GNUNET_strdup (R_cur_r); + return; + } + + if (R_cur_l[0] == 'C' || R_cur_r[0] == 'C') + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "R_cur_l: %s R_cur_r: %s\n", R_cur_l, + R_cur_r); + + + if (0 == nullstrcmp (R_cur_l, R_cur_r)) + { + *R_cur_ij = GNUNET_strdup (R_cur_l); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, ">>>>>>>>>>>>>> %s == %s\n", R_cur_l, + R_cur_r); + return; + } + + /* else */ + /* { */ + /* GNUNET_log (GNUNET_ERROR_TYPE_ERROR, ">>>>>>>>>>>>>> %s != %s\n", R_cur_l, R_cur_r); */ + /* } */ + + GNUNET_asprintf (R_cur_ij, "(%s|%s)", R_cur_l, R_cur_r); + + GNUNET_free_non_null (R_cur_l); + GNUNET_free_non_null (R_cur_r); + + GNUNET_free_non_null (R_temp_ik); + GNUNET_free_non_null (R_temp_kk); + GNUNET_free_non_null (R_temp_kj); + +} + +/** + * create proofs for all states in the given automaton. Implementation of the + * algorithm descriped in chapter 3.2.1 of "Automata Theory, Languages, and + * Computation 3rd Edition" by Hopcroft, Motwani and Ullman. + * + * @param a automaton. + */ +static void +automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) +{ + unsigned int n = a->state_count; + struct GNUNET_REGEX_State *states[n]; + char *R_last[n][n]; + char *R_cur[n][n]; + char *temp; + struct Transition *t; + char *complete_regex; + unsigned int i; + unsigned int j; + unsigned int k; + + /* create depth-first numbering of the states, initializes 'state' */ automaton_traverse (a, &number_states, states); @@ -1075,31 +1464,29 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) GNUNET_asprintf (&R_last[i][j], "%c", t->label); else { - temp_a = R_last[i][j]; + temp = R_last[i][j]; GNUNET_asprintf (&R_last[i][j], "%s|%c", R_last[i][j], t->label); - GNUNET_free (temp_a); + GNUNET_free (temp); } } if (NULL == R_last[i][i]) GNUNET_asprintf (&R_last[i][i], ""); else { - temp_a = R_last[i][i]; + temp = R_last[i][i]; GNUNET_asprintf (&R_last[i][i], "(|%s)", R_last[i][i]); - GNUNET_free (temp_a); + GNUNET_free (temp); } } for (i = 0; i < n; i++) for (j = 0; j < n; j++) if (needs_parentheses (R_last[i][j])) { - temp_a = R_last[i][j]; + temp = R_last[i][j]; GNUNET_asprintf (&R_last[i][j], "(%s)", R_last[i][j]); - GNUNET_free (temp_a); + GNUNET_free (temp); } - // TODO: clean up and fix the induction part - /* Compute regular expressions of length "k" between each pair of states per induction */ for (k = 0; k < n; k++) { @@ -1110,341 +1497,11 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) // Basis for the recursion: // $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} // R_last == R^{(k-1)}, R_cur == R^{(k)} - // With: R_cur[i][j] = R_cur_l | R_cur_r - // R_cur_l == R^{(k-1)}_{ij} - // R_cur_r == R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} - - if ((NULL == R_last[i][j]) && ((NULL == R_last[i][k]) || (NULL == R_last[k][k]) || /* technically cannot happen, but looks saner */ - (NULL == R_last[k][j]))) - { - /* R^{(k)}_{ij} = N | N */ - /* R_cur[i][j] is already NULL */ - continue; - } - if ((NULL == R_last[i][k]) || (NULL == R_last[k][k]) || /* technically cannot happen, but looks saner */ - (NULL == R_last[k][j])) - { - /* R^{(k)}_{ij} = R^{(k-1)}_{ij} | N */ - R_cur[i][j] = GNUNET_strdup (R_last[i][j]); - continue; - } - // $R^{(k)}_{ij} = N | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} OR - // $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} - - R_cur[i][j] = NULL; - R_cur_r = NULL; - R_cur_l = NULL; - - // cache results from strcmp, we might need these many times - ij_kj_cmp = nullstrcmp (R_last[i][j], R_last[k][j]); - ij_ik_cmp = nullstrcmp (R_last[i][j], R_last[i][k]); - ik_kk_cmp = nullstrcmp (R_last[i][k], R_last[k][k]); - //ik_kj_cmp = nullstrcmp (R_last[i][k], R_last[k][j]); - kk_kj_cmp = nullstrcmp (R_last[k][k], R_last[k][j]); - // Assign R_temp_(ik|kk|kj) to R_last[][] and remove epsilon as well - // as parentheses, so we can better compare the contents - R_temp_ik = remove_parentheses (remove_epsilon (R_last[i][k])); - R_temp_kk = remove_parentheses (remove_epsilon (R_last[k][k])); - R_temp_kj = remove_parentheses (remove_epsilon (R_last[k][j])); - - clean_ik_kk_cmp = nullstrcmp (R_last[i][k], R_temp_kk); - clean_kk_kj_cmp = nullstrcmp (R_temp_kk, R_last[k][j]); - - // construct R_cur_l (and, if necessary R_cur_r) - if (NULL != R_last[i][j]) - { - // Assign R_temp_ij to R_last[i][j] and remove epsilon as well - // as parentheses, so we can better compare the contents - R_temp_ij = remove_parentheses (remove_epsilon (R_last[i][j])); - - if (0 == strcmp (R_temp_ij, R_temp_ik) && - 0 == strcmp (R_temp_ik, R_temp_kk) && - 0 == strcmp (R_temp_kk, R_temp_kj)) - { - if (0 == strlen (R_temp_ij)) - { - R_cur_r = GNUNET_strdup (""); - } - else if ((0 == strncmp (R_last[i][j], "(|", 2)) || - (0 == strncmp (R_last[i][k], "(|", 2) && - 0 == strncmp (R_last[k][j], "(|", 2))) - { - // a|(e|a)a*(e|a) = a* - // a|(e|a)(e|a)*(e|a) = a* - // (e|a)|aa*a = a* - // (e|a)|aa*(e|a) = a* - // (e|a)|(e|a)a*a = a* - // (e|a)|(e|a)a*(e|a) = a* - // (e|a)|(e|a)(e|a)*(e|a) = a* - if (GNUNET_YES == needs_parentheses (R_temp_ij)) - GNUNET_asprintf (&R_cur_r, "(%s)*", R_temp_ij); - else - GNUNET_asprintf (&R_cur_r, "%s*", R_temp_ij); - } - else - { - // a|aa*a = a+ - // a|(e|a)a*a = a+ - // a|aa*(e|a) = a+ - // a|(e|a)(e|a)*a = a+ - // a|a(e|a)*(e|a) = a+ - if (GNUNET_YES == needs_parentheses (R_temp_ij)) - GNUNET_asprintf (&R_cur_r, "(%s)+", R_temp_ij); - else - GNUNET_asprintf (&R_cur_r, "%s+", R_temp_ij); - } - } - else if (0 == ij_ik_cmp && 0 == clean_kk_kj_cmp && - 0 != clean_ik_kk_cmp) - { - // a|ab*b = ab* - if (strlen (R_last[k][k]) < 1) - R_cur_r = GNUNET_strdup (R_last[i][j]); - else if (GNUNET_YES == needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "%s(%s)*", R_last[i][j], R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "%s%s*", R_last[i][j], R_last[k][k]); - - R_cur_l = NULL; - } - else if (0 == ij_kj_cmp && 0 == clean_ik_kk_cmp && - 0 != clean_kk_kj_cmp) - { - // a|bb*a = b*a - if (strlen (R_last[k][k]) < 1) - R_cur_r = GNUNET_strdup (R_last[k][j]); - else if (GNUNET_YES == needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)*%s", R_temp_kk, R_last[k][j]); - else - GNUNET_asprintf (&R_cur_r, "%s*%s", R_temp_kk, R_last[k][j]); - - R_cur_l = NULL; - } - else if (0 == ij_ik_cmp && 0 == kk_kj_cmp && - !has_epsilon (R_last[i][j]) && has_epsilon (R_last[k][k])) - { - // a|a(e|b)*(e|b) = a|ab* = a|a|ab|abb|abbb|... = ab* - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "%s(%s)*", R_last[i][j], R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "%s%s*", R_last[i][j], R_temp_kk); - - R_cur_l = NULL; - } - else if (0 == ij_kj_cmp && 0 == ik_kk_cmp && - !has_epsilon (R_last[i][j]) && has_epsilon (R_last[k][k])) - { - // a|(e|b)(e|b)*a = a|b*a = a|a|ba|bba|bbba|... = b*a - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)*%s", R_temp_kk, R_last[i][j]); - else - GNUNET_asprintf (&R_cur_r, "%s*%s", R_temp_kk, R_last[i][j]); - - R_cur_l = NULL; - } - else - { - temp_a = - (NULL == R_last[i][j]) ? NULL : GNUNET_strdup (R_last[i][j]); - temp_a = remove_parentheses (temp_a); - R_cur_l = temp_a; - } - - GNUNET_free_non_null (R_temp_ij); - } - else - { - // we have no left side - R_cur_l = NULL; - } - - // construct R_cur_r, if not already constructed - if (NULL == R_cur_r) - { - length = strlen (R_temp_kk) - strlen (R_last[i][k]); - - // a(ba)*bx = (ab)+x - if (length > 0 && NULL != R_last[k][k] && 0 < strlen (R_last[k][k]) && - NULL != R_last[k][j] && 0 < strlen (R_last[k][j]) && - NULL != R_last[i][k] && 0 < strlen (R_last[i][k]) && - 0 == strkcmp (R_temp_kk, R_last[i][k], length) && - 0 == strncmp (R_temp_kk, R_last[k][j], length)) - { - temp_a = GNUNET_malloc (length + 1); - temp_b = GNUNET_malloc ((strlen (R_last[k][j]) - length) + 1); - - length_l = 0; - length_r = 0; - - for (cnt = 0; cnt < strlen (R_last[k][j]); cnt++) - { - if (cnt < length) - { - temp_a[length_l] = R_last[k][j][cnt]; - length_l++; - } - else - { - temp_b[length_r] = R_last[k][j][cnt]; - length_r++; - } - } - temp_a[length_l] = '\0'; - temp_b[length_r] = '\0'; - - // e|(ab)+ = (ab)* - if (NULL != R_cur_l && 0 == strlen (R_cur_l) && - 0 == strlen (temp_b)) - { - GNUNET_asprintf (&R_cur_r, "(%s%s)*", R_last[i][k], temp_a); - GNUNET_free (R_cur_l); - R_cur_l = NULL; - } - else - { - GNUNET_asprintf (&R_cur_r, "(%s%s)+%s", R_last[i][k], temp_a, - temp_b); - } - GNUNET_free (temp_a); - GNUNET_free (temp_b); - } - else if (0 == strcmp (R_temp_ik, R_temp_kk) && - 0 == strcmp (R_temp_kk, R_temp_kj)) - { - // (e|a)a*(e|a) = a* - // (e|a)(e|a)*(e|a) = a* - if (has_epsilon (R_last[i][k]) && has_epsilon (R_last[k][j])) - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)*", R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "%s*", R_temp_kk); - } - // aa*a = a+a - else if (0 == clean_ik_kk_cmp && 0 == clean_kk_kj_cmp && - !has_epsilon (R_last[i][k])) - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_temp_kk, R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_temp_kk, R_temp_kk); - } - // (e|a)a*a = a+ - // aa*(e|a) = a+ - // a(e|a)*(e|a) = a+ - // (e|a)a*a = a+ - else - { - eps_check = - (has_epsilon (R_last[i][k]) + has_epsilon (R_last[k][k]) + - has_epsilon (R_last[k][j])); - - if (eps_check == 1) - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)+", R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "%s+", R_temp_kk); - } - } - } - // aa*b = a+b - // (e|a)(e|a)*b = a*b - else if (0 == strcmp (R_temp_ik, R_temp_kk)) - { - if (has_epsilon (R_last[i][k])) - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)*%s", R_temp_kk, R_last[k][j]); - else - GNUNET_asprintf (&R_cur_r, "%s*%s", R_temp_kk, R_last[k][j]); - } - else - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_temp_kk, R_last[k][j]); - else - GNUNET_asprintf (&R_cur_r, "%s+%s", R_temp_kk, R_last[k][j]); - } - } - // ba*a = ba+ - // b(e|a)*(e|a) = ba* - else if (0 == strcmp (R_temp_kk, R_temp_kj)) - { - if (has_epsilon (R_last[k][j])) - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "%s(%s)*", R_last[i][k], R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "%s%s*", R_last[i][k], R_temp_kk); - } - else - { - if (needs_parentheses (R_temp_kk)) - GNUNET_asprintf (&R_cur_r, "(%s)+%s", R_last[i][k], R_temp_kk); - else - GNUNET_asprintf (&R_cur_r, "%s+%s", R_last[i][k], R_temp_kk); - } - } - else - { - if (strlen (R_temp_kk) > 0) - { - if (needs_parentheses (R_temp_kk)) - { - GNUNET_asprintf (&R_cur_r, "%s(%s)*%s", R_last[i][k], R_temp_kk, - R_last[k][j]); - } - else - { - GNUNET_asprintf (&R_cur_r, "%s%s*%s", R_last[i][k], R_temp_kk, - R_last[k][j]); - } - } - else - { - GNUNET_asprintf (&R_cur_r, "%s%s", R_last[i][k], R_last[k][j]); - } - } - } - - /* GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "R_cur_l: %s\n", R_cur_l); */ - /* GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "R_cur_r: %s\n", R_cur_r); */ - - // putting it all together - if (NULL != R_cur_l && NULL != R_cur_r) - { - // a|a = a - if (0 == strcmp (R_cur_l, R_cur_r)) - { - R_cur[i][j] = GNUNET_strdup (R_cur_l); - } - // R_cur_l | R_cur_r - else - { - GNUNET_asprintf (&R_cur[i][j], "(%s|%s)", R_cur_l, R_cur_r); - } - } - else if (NULL != R_cur_l) - { - R_cur[i][j] = GNUNET_strdup (R_cur_l); - } - else if (NULL != R_cur_r) - { - R_cur[i][j] = GNUNET_strdup (R_cur_r); - } - else - { - R_cur[i][j] = NULL; - } - - GNUNET_free_non_null (R_cur_l); - GNUNET_free_non_null (R_cur_r); - - GNUNET_free_non_null (R_temp_ik); - GNUNET_free_non_null (R_temp_kk); - GNUNET_free_non_null (R_temp_kj); + // Create R_cur[i][j] and simplify the expression + automaton_create_proofs_simplify (R_last[i][j], R_last[i][k], + R_last[k][k], R_last[k][j], + &R_cur[i][j]); } } @@ -1478,14 +1535,16 @@ automaton_create_proofs (struct GNUNET_REGEX_Automaton *a) if (states[i]->accepting) { if (NULL == complete_regex && 0 < strlen (R_last[a->start->proof_id][i])) + { GNUNET_asprintf (&complete_regex, "%s", R_last[a->start->proof_id][i]); + } else if (NULL != R_last[a->start->proof_id][i] && 0 < strlen (R_last[a->start->proof_id][i])) { - temp_a = complete_regex; + temp = complete_regex; GNUNET_asprintf (&complete_regex, "%s|%s", complete_regex, R_last[a->start->proof_id][i]); - GNUNET_free (temp_a); + GNUNET_free (temp); } } } @@ -1524,8 +1583,6 @@ dfa_state_create (struct GNUNET_REGEX_Context *ctx, int len = 0; struct GNUNET_REGEX_State *cstate; struct Transition *ctran; - int insert = 1; - struct Transition *t; unsigned int i; s = GNUNET_malloc (sizeof (struct GNUNET_REGEX_State)); @@ -1573,21 +1630,7 @@ dfa_state_create (struct GNUNET_REGEX_Context *ctx, for (ctran = cstate->transitions_head; NULL != ctran; ctran = ctran->next) { if (0 != ctran->label) - { - insert = 1; - - for (t = s->transitions_head; NULL != t; t = t->next) - { - if (t->label == ctran->label) - { - insert = 0; - break; - } - } - - if (insert) - state_add_transition (ctx, s, ctran->label, NULL); - } + state_add_transition (ctx, s, ctran->label, NULL); } // If the nfa_states contain an accepting state, the new dfa state is also diff --git a/src/regex/test_regex_eval_api.c b/src/regex/test_regex_eval_api.c index 6d575a05c..5a6575c86 100644 --- a/src/regex/test_regex_eval_api.c +++ b/src/regex/test_regex_eval_api.c @@ -58,7 +58,7 @@ int test_random (unsigned int rx_length, unsigned int max_str_len, unsigned int str_count) { - int i; + unsigned int i; char *rand_rx; char *matching_str; int eval; @@ -114,6 +114,11 @@ test_random (unsigned int rx_length, unsigned int max_str_len, eval_check = regexec (&rx, matching_str, 1, matchptr, 0); regfree (&rx); + // We only want to match the whole string, because that's what our DFA does, too. + if (eval_check == 0 && + (matchptr[0].rm_so != 0 || matchptr[0].rm_eo != strlen (matching_str))) + eval_check = 1; + // Match canonical regex if (0 != regcomp (&rx, canonical_regex, REG_EXTENDED)) { @@ -127,11 +132,6 @@ test_random (unsigned int rx_length, unsigned int max_str_len, regfree (&rx); GNUNET_free (canonical_regex); - // We only want to match the whole string, because that's what our DFA does, too. - if (eval_check == 0 && - (matchptr[0].rm_so != 0 || matchptr[0].rm_eo != strlen (matching_str))) - eval_check = 1; - // compare result if (eval_check != eval) { @@ -198,11 +198,11 @@ test_automaton (struct GNUNET_REGEX_Automaton *a, regex_t * rx, result = 1; regerror (eval_check, rx, error, sizeof error); GNUNET_log (GNUNET_ERROR_TYPE_ERROR, - "Unexpected result:\nregex: %s\nstring: %s\nexpected result: %i\n" + "Unexpected result:\nregex: %s\ncanonical_regex: %s\nstring: %s\nexpected result: %i\n" "gnunet regex: %i\nglibc regex: %i\nglibc error: %s\nrm_so: %i\nrm_eo: %i\n\n", - rxstr->regex, rxstr->strings[i], rxstr->expected_results[i], - eval, eval_check, error, matchptr[0].rm_so, - matchptr[0].rm_eo); + rxstr->regex, GNUNET_REGEX_get_canonical_regex (a), + rxstr->strings[i], rxstr->expected_results[i], eval, + eval_check, error, matchptr[0].rm_so, matchptr[0].rm_eo); } } return result; @@ -298,6 +298,7 @@ main (int argc, char *argv[]) check_dfa += test_automaton (a, &rx, &rxstr[i]); check_proof = GNUNET_strdup (GNUNET_REGEX_get_canonical_regex (a)); GNUNET_REGEX_automaton_destroy (a); + a = GNUNET_REGEX_construct_dfa (check_proof, strlen (check_proof)); check_dfa += test_automaton (a, &rx, &rxstr[i]); GNUNET_REGEX_automaton_destroy (a); diff --git a/src/regex/test_regex_iterate_api.c b/src/regex/test_regex_iterate_api.c index e7052f5ee..9f4f2280e 100644 --- a/src/regex/test_regex_iterate_api.c +++ b/src/regex/test_regex_iterate_api.c @@ -32,7 +32,7 @@ key_iterator (void *cls, const struct GNUNET_HashCode *key, const char *proof, int accepting, unsigned int num_edges, const struct GNUNET_REGEX_Edge *edges) { - int i; + unsigned int i; GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Iterating... (accepting: %i)\n", accepting); diff --git a/src/regex/test_regex_proofs.c b/src/regex/test_regex_proofs.c index 85fc3079d..831f4dc48 100644 --- a/src/regex/test_regex_proofs.c +++ b/src/regex/test_regex_proofs.c @@ -106,39 +106,48 @@ test_proofs_static (void) unsigned int i; unsigned int error; - const char *regex[4] = { + const char *regex[6] = { "a|aa*a", "a+", "a*", - "a*a*" + "a*a*", + "(F*C|WfPf|y+F*C)", + "y*F*C|WfPf", + /* "2?jA?e?D*K", */ + /* "((j|2j)K|(j|2j)AK|(j|2j)(D|e|(j|2j)A(D|e))D*K)", */ + /* "((j|2j)K|(j|2j)(D|e|((j|2j)j|(j|2j)2j)A(D|e))D*K|(j|2j)AK)", */ }; - char *canonical_regex; - struct GNUNET_REGEX_Automaton *dfa; + const char *canon_rx1; + const char *canon_rx2; + struct GNUNET_REGEX_Automaton *dfa1; + struct GNUNET_REGEX_Automaton *dfa2; error = 0; - for (i = 0; i < 4; i += 2) + for (i = 0; i < 6; i += 2) { - dfa = GNUNET_REGEX_construct_dfa (regex[i], strlen (regex[i])); - canonical_regex = GNUNET_strdup (GNUNET_REGEX_get_canonical_regex (dfa)); - GNUNET_REGEX_automaton_destroy (dfa); + dfa1 = GNUNET_REGEX_construct_dfa (regex[i], strlen (regex[i])); + dfa2 = GNUNET_REGEX_construct_dfa (regex[i + 1], strlen (regex[i + 1])); + + canon_rx1 = GNUNET_REGEX_get_canonical_regex (dfa1); + canon_rx2 = GNUNET_REGEX_get_canonical_regex (dfa2); - dfa = GNUNET_REGEX_construct_dfa (regex[i + 1], strlen (regex[i + 1])); - error += - (0 == - strcmp (canonical_regex, - GNUNET_REGEX_get_canonical_regex (dfa))) ? 0 : 1; + error += (0 == strcmp (canon_rx1, canon_rx2)) ? 0 : 1; if (error > 0) { GNUNET_log (GNUNET_ERROR_TYPE_ERROR, - "Comparing canonical regex of %s with %s failed.\n", regex[i], - regex[i + 1]); + "Comparing canonical regex failed:\nrx1: %s\ncrx1: %s\nrx2: %s\ncrx2: %s\n", + regex[i], canon_rx1, regex[i + 1], canon_rx2); + + /* Save the graphs of the two conflicting DFAs */ + /* GNUNET_REGEX_automaton_save_graph (dfa1, "proofs_fail_dfa1.dot"); */ + /* GNUNET_REGEX_automaton_save_graph (dfa2, "proofs_fail_dfa2.dot"); */ } - GNUNET_free (canonical_regex); - GNUNET_REGEX_automaton_destroy (dfa); + GNUNET_REGEX_automaton_destroy (dfa1); + GNUNET_REGEX_automaton_destroy (dfa2); } return error; @@ -161,7 +170,7 @@ main (int argc, char *argv[]) error = 0; error += test_proofs_static (); -// error += test_proofs_random (100, 10); + /* error += test_proofs_random (10000, 10); */ return error; }