*/
struct GNUNET_REGEX_Automaton;
+
/**
* Edge representation.
*/
struct GNUNET_HashCode destination;
};
+
/**
* Construct an NFA by parsing the regex string of length 'len'.
*
struct GNUNET_REGEX_Automaton *
GNUNET_REGEX_construct_nfa (const char *regex, const size_t len);
+
/**
* Construct DFA for the given 'regex' of length 'len'.
*
struct GNUNET_REGEX_Automaton *
GNUNET_REGEX_construct_dfa (const char *regex, const size_t len);
+
/**
* Free the memory allocated by constructing the GNUNET_REGEX_Automaton.
* data structure.
void
GNUNET_REGEX_automaton_destroy (struct GNUNET_REGEX_Automaton *a);
+
/**
* Save the given automaton as a GraphViz dot file.
*
GNUNET_REGEX_eval (struct GNUNET_REGEX_Automaton *a,
const char *string);
-/**
- * Get the canonical regex of the given automaton.
- * When constructing the automaton a proof is computed for each state,
- * consisting of the regular expression leading to this state. A complete
- * regex for the automaton can be computed by combining these proofs.
- * As of now this function is only useful for testing.
- *
- * @param a automaton for which the canonical regex should be returned.
- *
- * @return
- */
-const char *
-GNUNET_REGEX_get_canonical_regex (struct GNUNET_REGEX_Automaton *a);
/**
* Get the first key for the given 'input_string'. This hashes
GNUNET_REGEX_get_first_key (const char *input_string, /* FIXME: size_t */ unsigned int string_len,
struct GNUNET_HashCode * key);
+
/**
* Check if the given 'proof' matches the given 'key'.
*
GNUNET_REGEX_check_proof (const char *proof,
const struct GNUNET_HashCode *key);
+
/**
* Iterator callback function.
*
unsigned int num_edges,
const struct GNUNET_REGEX_Edge *edges);
+
/**
* Iterate over all edges starting from start state of automaton 'a'. Calling
* iterator for each edge.
GNUNET_REGEX_KeyIterator iterator,
void *iterator_cls);
+
#if 0 /* keep Emacsens' auto-indent happy */
{
#endif
lib_LTLIBRARIES = libgnunetregex.la
libgnunetregex_la_SOURCES = \
- regex.c
+ regex_internal.h regex.c \
+ regex_random.c
libgnunetregex_la_LIBADD = -lm \
$(top_builddir)/src/util/libgnunetutil.la
libgnunetregex_la_LDFLAGS = \
#include "gnunet_container_lib.h"
#include "gnunet_crypto_lib.h"
#include "gnunet_regex_lib.h"
-#include "regex.h"
+#include "regex_internal.h"
/**
* Constant for how many bits the initial string regex should have.
GNUNET_asprintf (&R_last[i][j], "%s|%c", R_last[i][j], t->label);
GNUNET_free (temp_a);
}
- if (GNUNET_YES == needs_parentheses (R_last[i][j]))
- {
- temp_a = R_last[i][j];
- GNUNET_asprintf (&R_last[i][j], "(%s)", R_last[i][j]);
- GNUNET_free (temp_a);
- }
}
if (NULL == R_last[i][i])
GNUNET_asprintf (&R_last[i][i], "");
GNUNET_free (temp_a);
}
}
+ for (i = 0; i < n; i++)
+ for (j = 0; j < n; j++)
+ if (needs_parentheses (R_last[i][j]))
+ {
+ temp_a = R_last[i][j];
+ GNUNET_asprintf (&R_last[i][j], "(%s)", R_last[i][j]);
+ GNUNET_free (temp_a);
+ }
+ // TODO: clean up and fix the induction part
// INDUCTION
for (k = 0; k < n; k++)
--- /dev/null
+/*
+ This file is part of GNUnet
+ (C) 2012 Christian Grothoff (and other contributing authors)
+
+ GNUnet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GNUnet is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GNUnet; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+*/
+/**
+ * @file src/regex/regex_internal.h
+ * @brief common internal definitions for regex library
+ * @author Maximilian Szengel
+ */
+#ifndef REGEX_INTERNAL_H
+#define REGEX_INTERNAL_H
+
+#include "gnunet_regex_lib.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#if 0 /* keep Emacsens' auto-indent happy */
+}
+#endif
+#endif
+
+/**
+ * char array of literals that are allowed inside a regex (apart from the
+ * operators)
+ */
+#define ALLOWED_LITERALS "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+
+
+/**
+ * Get the canonical regex of the given automaton.
+ * When constructing the automaton a proof is computed for each state,
+ * consisting of the regular expression leading to this state. A complete
+ * regex for the automaton can be computed by combining these proofs.
+ * As of now this function is only useful for testing.
+ *
+ * @param a automaton for which the canonical regex should be returned.
+ *
+ * @return canonical regex string of automaton 'a'
+ */
+const char *
+GNUNET_REGEX_get_canonical_regex (struct GNUNET_REGEX_Automaton *a);
+
+
+/**
+ * Generate a (pseudo) random regular expression of length 'rx_length', as well
+ * as a (optional) string that will be matched by the generated regex. The
+ * returned regex needs to be freed.
+ *
+ * @param rx_length length of the random regex.
+ * @param matching_str (optional) pointer to a string that will contain a string
+ * that will be matched by the generated regex, if
+ * 'matching_str' pointer was not NULL. The caller must
+ * provide at least rx_length+1 bytes for this string.
+ *
+ * @return NULL if 'rx_length' is 0, a random regex of length 'rx_length', which
+ * needs to be freed, otherwise.
+ */
+char *
+GNUNET_REGEX_generate_random_regex (size_t rx_length, char *matching_str);
+
+
+/**
+ * Generate a random string of maximum length 'max_len' that only contains literals allowed
+ * in a regular expression. The string might be 0 chars long but is guaranteed
+ * to be shorter than or equal to 'max_len'.
+ *
+ * @param max_len maximum length of the string that should be generated.
+ *
+ * @return random string that needs to be freed.
+ */
+char *
+GNUNET_REGEX_generate_random_string (size_t max_len);
+
+#if 0 /* keep Emacsens' auto-indent happy */
+{
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ This file is part of GNUnet
+ (C) 2012 Christian Grothoff (and other contributing authors)
+
+ GNUnet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GNUnet is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GNUnet; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+*/
+/**
+ * @file src/regex/regex_random.c
+ * @brief functions for creating random regular expressions and strings
+ * @author Maximilian Szengel
+ */
+#include "platform.h"
+#include "gnunet_regex_lib.h"
+#include "gnunet_crypto_lib.h"
+#include "regex_internal.h"
+
+
+/**
+ * Get a (pseudo) random valid literal for building a regular expression.
+ *
+ * Picks one character of ALLOWED_LITERALS, using an index obtained from
+ * GNUNET_CRYPTO_random_u32 with GNUNET_CRYPTO_QUALITY_WEAK.
+ *
+ * File-local helper: it is not declared in regex_internal.h, so it is kept
+ * 'static' to avoid exporting a generic, unprefixed symbol from the library.
+ *
+ * @return random valid literal
+ */
+static char
+get_random_literal (void)
+{
+  /* ALLOWED_LITERALS is a string literal, so its length (without the
+     terminating NUL) is known at compile time; no strlen() needed. */
+  const uint32_t num_literals = (uint32_t) (sizeof (ALLOWED_LITERALS) - 1);
+  uint32_t ridx;
+
+  /* NOTE(review): assumes the second argument is an exclusive upper bound,
+     so that 'ridx' is always a valid index -- confirm against
+     gnunet_crypto_lib.h. */
+  ridx = GNUNET_CRYPTO_random_u32 (GNUNET_CRYPTO_QUALITY_WEAK, num_literals);
+
+  return ALLOWED_LITERALS[ridx];
+}
+
+
+/**
+ * Generate a (pseudo) random regular expression of length 'rx_length', as well
+ * as a (optional) string that will be matched by the generated regex. The
+ * returned regex needs to be freed.
+ *
+ * The regex consists of literals from ALLOWED_LITERALS and the operators
+ * '+', '*', '?' and '|'. An operator is only written directly after a
+ * literal, and '|' is never written as the last character.
+ *
+ * The matching string is made of the literals of the last '|'-separated
+ * alternative of the regex (the whole regex if it contains no '|').
+ *
+ * @param rx_length length of the random regex.
+ * @param matching_str (optional) pointer to a string that will contain a string
+ *                     that will be matched by the generated regex, if
+ *                     'matching_str' pointer was not NULL. Make sure you
+ *                     allocated at least rx_length+1 bytes for this string.
+ *
+ * @return NULL if 'rx_length' is 0, a random regex of length 'rx_length', which
+ *         needs to be freed, otherwise.
+ */
+char *
+GNUNET_REGEX_generate_random_regex (size_t rx_length, char *matching_str)
+{
+  char *rx;
+  char *matching_strp;
+  size_t i;
+  uint32_t char_op_switch;
+  int last_was_op;
+  char current_char;
+
+  if (0 == rx_length)
+    return NULL;
+
+  matching_strp = matching_str;
+  rx = GNUNET_malloc (rx_length + 1);
+  /* Pretend an operator was just written so the regex starts with a literal. */
+  last_was_op = 1;
+
+  for (i = 0; i < rx_length; i++)
+  {
+    char_op_switch = GNUNET_CRYPTO_random_u32 (GNUNET_CRYPTO_QUALITY_WEAK, 2);
+
+    if ((0 == char_op_switch) && (! last_was_op))
+    {
+      last_was_op = 1;
+
+      switch (GNUNET_CRYPTO_random_u32 (GNUNET_CRYPTO_QUALITY_WEAK, 4))
+      {
+      case 0:
+        current_char = '+';
+        break;
+      case 1:
+        current_char = '*';
+        break;
+      case 2:
+        current_char = '?';
+        break;
+      case 3:
+        if (i < rx_length - 1)  /* '|' cannot be at the end */
+          current_char = '|';
+        else
+          current_char = get_random_literal ();
+        break;
+      default:
+        /* Not expected for a bound of 4; fall back to a literal so that
+           'current_char' is never left stale from the previous round. */
+        current_char = get_random_literal ();
+        last_was_op = 0;
+        break;
+      }
+    }
+    else
+    {
+      current_char = get_random_literal ();
+      last_was_op = 0;
+    }
+
+    if (NULL != matching_strp)
+    {
+      if ('|' == current_char)
+      {
+        /* '|' separates whole alternatives, so the literals collected so far
+           concatenated with what follows would NOT match (e.g. "ab|cd" vs.
+           "abcd"). Start over and keep only the last alternative. */
+        matching_strp = matching_str;
+      }
+      else if (('+' != current_char) && ('*' != current_char) &&
+               ('?' != current_char))
+      {
+        *matching_strp = current_char;
+        matching_strp++;
+      }
+    }
+
+    rx[i] = current_char;
+  }
+  rx[rx_length] = '\0';
+  if (NULL != matching_strp)
+    *matching_strp = '\0';
+
+  return rx;
+}
+
+/**
+ * Generate a random string of maximum length 'max_len' that only contains literals allowed
+ * in a regular expression. The string might be 0 chars long but is guaranteed
+ * to be shorter than or equal to 'max_len'.
+ *
+ * @param max_len maximum length of the string that should be generated.
+ *
+ * @return random string that needs to be freed.
+ */
+char *
+GNUNET_REGEX_generate_random_string (size_t max_len)
+{
+  size_t i;
+  size_t len;
+  uint32_t bound;
+  char *str;
+
+  if (1 > max_len)
+    return GNUNET_strdup ("");
+
+  /* The random function works on 32-bit values; clamp explicitly so that a
+     larger 'max_len' is not silently truncated (possibly down to 0). */
+  bound = (max_len > UINT32_MAX) ? UINT32_MAX : (uint32_t) max_len;
+  len = (size_t) GNUNET_CRYPTO_random_u32 (GNUNET_CRYPTO_QUALITY_WEAK, bound);
+  str = GNUNET_malloc (len + 1);
+
+  for (i = 0; i < len; i++)
+    str[i] = get_random_literal ();
+  str[len] = '\0';
+
+  return str;
+}
#include <time.h>
#include "platform.h"
#include "gnunet_regex_lib.h"
+#include "regex_internal.h"
enum Match_Result
{
enum Match_Result expected_results[20];
};
-static const char allowed_literals[] =
- "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz";
/**
* Random regex test. Generate a random regex as well as 'str_count' strings to
unsigned int str_count)
{
int i;
- int j;
- int rx_exp;
- char rand_rx[rx_length + 1];
- char matching_str[str_count][max_str_len + 1];
- char *rand_rxp;
- char *matching_strp;
- int char_op_switch;
- int last_was_op;
- char current_char;
+ char *rand_rx;
+ char *matching_str;
int eval;
int eval_check;
int eval_canonical;
regmatch_t matchptr[1];
char error[200];
int result;
- unsigned int str_len;
+ size_t str_len;
char *canonical_regex;
// At least one string is needed for matching
// The string should be at least as long as the regex itself
GNUNET_assert (max_str_len >= rx_length);
- rand_rxp = rand_rx;
- matching_strp = matching_str[0];
- current_char = 0;
- last_was_op = 1;
-
// Generate random regex and a string that matches the regex
- for (i = 0; i < rx_length; i++)
- {
- char_op_switch = 0 + (int) (1.0 * rand () / (RAND_MAX + 1.0));
-
- if (0 == char_op_switch && !last_was_op)
- {
- last_was_op = 1;
- rx_exp = rand () % 4;
-
- switch (rx_exp)
- {
- case 0:
- current_char = '+';
- break;
- case 1:
- current_char = '*';
- break;
- case 2:
- current_char = '?';
- break;
- case 3:
- if (i < rx_length - 1) // '|' cannot be at the end
- current_char = '|';
- else
- current_char =
- allowed_literals[rand () % (sizeof (allowed_literals) - 1)];
- break;
- }
- }
- else
- {
- current_char =
- allowed_literals[rand () % (sizeof (allowed_literals) - 1)];
- last_was_op = 0;
- }
-
- if (current_char != '+' && current_char != '*' && current_char != '?' &&
- current_char != '|')
- {
- *matching_strp = current_char;
- matching_strp++;
- }
-
- *rand_rxp = current_char;
- rand_rxp++;
- }
- *rand_rxp = '\0';
- *matching_strp = '\0';
-
- // Generate some random strings for matching...
- // Start at 1, because the first string is generated above during regex generation
- for (i = 1; i < str_count; i++)
- {
- str_len = rand () % max_str_len;
- for (j = 0; j < str_len; j++)
- matching_str[i][j] =
- allowed_literals[rand () % (sizeof (allowed_literals) - 1)];
- matching_str[i][str_len] = '\0';
- }
+ matching_str = GNUNET_malloc (rx_length + 1);
+ rand_rx = GNUNET_REGEX_generate_random_regex (rx_length, matching_str);
// Now match
result = 0;
for (i = 0; i < str_count; i++)
{
+ if (0 < i)
+ {
+ matching_str = GNUNET_REGEX_generate_random_string (max_str_len);
+ str_len = strlen (matching_str);
+ }
+
// Match string using DFA
dfa = GNUNET_REGEX_construct_dfa (rand_rx, strlen (rand_rx));
if (NULL == dfa)
return -1;
}
- eval = GNUNET_REGEX_eval (dfa, matching_str[i]);
+ eval = GNUNET_REGEX_eval (dfa, matching_str);
canonical_regex = GNUNET_strdup (GNUNET_REGEX_get_canonical_regex (dfa));
GNUNET_REGEX_automaton_destroy (dfa);
return -1;
}
- eval_check = regexec (&rx, matching_str[i], 1, matchptr, 0);
+ eval_check = regexec (&rx, matching_str, 1, matchptr, 0);
regfree (&rx);
// Match canonical regex
return -1;
}
- eval_canonical = regexec (&rx, matching_str[i], 1, matchptr, 0);
+ eval_canonical = regexec (&rx, matching_str, 1, matchptr, 0);
regfree (&rx);
GNUNET_free (canonical_regex);
// We only want to match the whole string, because that's what our DFA does, too.
if (eval_check == 0 &&
- (matchptr[0].rm_so != 0 ||
- matchptr[0].rm_eo != strlen (matching_str[i])))
+ (matchptr[0].rm_so != 0 || matchptr[0].rm_eo != strlen (matching_str)))
eval_check = 1;
// compare result
rand_rx, matching_str, eval, eval_check, error);
result += 1;
}
+
+ GNUNET_free (matching_str);
}
+
+ GNUNET_free (rand_rx);
+
return result;
}
* @brief test for regex.c
* @author Maximilian Szengel
*/
-#include <regex.h>
-#include <time.h>
#include "platform.h"
#include "gnunet_regex_lib.h"
+#include "regex_internal.h"
-int
-main (int argc, char *argv[])
+
+/**
+ * Test if the given regex's canonical regex is the same as this canonical
+ * regex's canonical regex. In steps:
+ *  1. construct a dfa A from the given 'regex',
+ *  2. get the canonical regex of dfa A,
+ *  3. construct a dfa B from this canonical regex,
+ *  4. compare the canonical regex of dfa A with the canonical regex of dfa B.
+ *
+ * @param regex regular expression used for this test (see above).
+ *
+ * @return 0 on success, 1 on failure (including a DFA that could not be
+ *         constructed)
+ */
+unsigned int
+test_proof (const char *regex)
{
- GNUNET_log_setup ("test-regex",
-#if VERBOSE
- "DEBUG",
-#else
- "WARNING",
-#endif
- NULL);
+  unsigned int error;
+  struct GNUNET_REGEX_Automaton *dfa;
+  char *c_rx1;
+  const char *c_rx2;
- int error;
- int i;
-
- const char *regex[21] = {
- "ab(c|d)+c*(a(b|c)+d)+(bla)+",
- "(bla)*",
- "b(lab)*la",
- "(ab)*",
- "ab(c|d)+c*(a(b|c)+d)+(bla)(bla)*",
- "z(abc|def)?xyz",
- "1*0(0|1)*",
- "a+X*y+c|p|R|Z*K*y*R+w|Y*6+n+h*k*w+V*F|W*B*e*",
- "(cd|ab)*",
- "abcd:(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1):(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)(0|1)",
- "abc(1|0)*def",
- "ab|ac",
- "(ab)(ab)*",
- "ab|cd|ef|gh",
- "a|b|c|d|e|f|g",
- "(ab)|(ac)",
- "a(b|c)",
- "a*a",
- "ab?(abcd)?",
- "(ab|cs|df|sdf)*",
- "a|aa*a"
+  dfa = GNUNET_REGEX_construct_dfa (regex, strlen (regex));
+  if (NULL == dfa)
+  {
+    GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
+                "Constructing DFA failed for regex\n%s\n", regex);
+    return 1;
+  }
+  /* Copy the canonical regex, because 'dfa' is destroyed right below. */
+  c_rx1 = GNUNET_strdup (GNUNET_REGEX_get_canonical_regex (dfa));
+  GNUNET_REGEX_automaton_destroy (dfa);
+
+  dfa = GNUNET_REGEX_construct_dfa (c_rx1, strlen (c_rx1));
+  if (NULL == dfa)
+  {
+    GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
+                "Constructing DFA failed for canonical regex\n%s\nof\n%s\n",
+                c_rx1, regex);
+    GNUNET_free (c_rx1);
+    return 1;
+  }
+  /* Not copied: only used while the second 'dfa' is still alive. */
+  c_rx2 = GNUNET_REGEX_get_canonical_regex (dfa);
+
+  error = (0 == strcmp (c_rx1, c_rx2)) ? 0 : 1;
+
+  if (error > 0)
+  {
+    GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
+                "Comparing canonical regex of\n%s\nfailed:\n%s\nvs.\n%s\n",
+                regex, c_rx1, c_rx2);
+  }
+
+  GNUNET_free (c_rx1);
+  GNUNET_REGEX_automaton_destroy (dfa);
+
+  return error;
+}
+
+/**
+ * Use 'test_proof' function to randomly test the canonical regexes of 'count'
+ * random expressions of length 'rx_length'.
+ *
+ * @param count number of random regular expressions to test.
+ * @param rx_length length of the random regular expressions.
+ *
+ * @return 0 on success, number of failures otherwise. A regex that could not
+ *         be generated counts as a failure.
+ */
+unsigned int
+test_proofs_random (unsigned int count, size_t rx_length)
+{
+  unsigned int i;
+  unsigned int failures;
+  char *rand_rx;
+
+  failures = 0;
+
+  for (i = 0; i < count; i++)
+  {
+    rand_rx = GNUNET_REGEX_generate_random_regex (rx_length, NULL);
+    if (NULL == rand_rx)
+    {
+      /* The generator returns NULL for 'rx_length' == 0; passing that on
+         would make test_proof call strlen() on NULL. */
+      failures++;
+      continue;
+    }
+    failures += test_proof (rand_rx);
+    GNUNET_free (rand_rx);
+  }
+
+  return failures;
+}
+
+/**
+ * Test a number of known examples of regexes for proper canonicalization.
+ *
+ * The 'regex' array is read as consecutive pairs: the canonical regex of
+ * regex[i] is compared with the canonical regex of regex[i + 1].
+ *
+ * @return 0 on success, number of failures otherwise.
+ */
+unsigned int
+test_proofs_static (void)
+{
+ unsigned int i;
+ unsigned int error;
+ /* failure count before the current pair, to detect a failure of this pair */
+ unsigned int errors_before;
+
+ const char *regex[4] = {
+ "a|aa*a",
+ "a+",
+ "a*",
+ "a*a*"
};
+
char *canonical_regex;
struct GNUNET_REGEX_Automaton *dfa;
error = 0;
- for (i = 0; i < 21; i++)
+ for (i = 0; i < 4; i += 2)
{
dfa = GNUNET_REGEX_construct_dfa (regex[i], strlen (regex[i]));
canonical_regex = GNUNET_strdup (GNUNET_REGEX_get_canonical_regex (dfa));
GNUNET_REGEX_automaton_destroy (dfa);
- dfa =
- GNUNET_REGEX_construct_dfa (canonical_regex, strlen (canonical_regex));
+ dfa = GNUNET_REGEX_construct_dfa (regex[i + 1], strlen (regex[i + 1]));
+ errors_before = error;
error +=
(0 ==
strcmp (canonical_regex,
GNUNET_REGEX_get_canonical_regex (dfa))) ? 0 : 1;
+
+ /* Only report the pair that actually failed; 'error' is cumulative, so
+    testing it against 0 would also blame every later pair. */
+ if (error > errors_before)
+ {
+ GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
+ "Comparing canonical regex of %s with %s failed.\n", regex[i],
+ regex[i + 1]);
+ }
+
GNUNET_free (canonical_regex);
GNUNET_REGEX_automaton_destroy (dfa);
}
return error;
}
+
+
+/**
+ * Entry point of the test: set up logging under the name "test-regex" and
+ * run the static canonicalization tests.
+ *
+ * @param argc unused
+ * @param argv unused
+ *
+ * @return accumulated number of test failures (0 on success)
+ */
+int
+main (int argc, char *argv[])
+{
+  int failures = 0;
+
+#if VERBOSE
+  GNUNET_log_setup ("test-regex", "DEBUG", NULL);
+#else
+  GNUNET_log_setup ("test-regex", "WARNING", NULL);
+#endif
+
+  failures += test_proofs_static ();
+  /* NOTE(review): the randomized check is left disabled here; the reason is
+   * not visible in this file:
+   *   failures += test_proofs_random (100, 10);
+   */
+
+  return failures;
+}