#include "platform.h"
#include "gnunet_fs_service.h"
#include "gnunet_signatures.h"
-#include "fs.h"
+#include "fs_api.h"
+#include <unitypes.h>
+#include <unicase.h>
+#include <uniconv.h>
+#include <unistr.h>
+#include <unistdio.h>
+
/**
{
if (out[rpos] == '%')
{
- if (1 != sscanf (&out[rpos + 1], "%2X", &hx))
+ if (1 != SSCANF (&out[rpos + 1], "%2X", &hx))
{
GNUNET_free (out);
*emsg = GNUNET_strdup (_("`%' must be followed by HEX number"));
}
-/**
- * Canonicalize a keyword.
- *
- * @param in input string (the keyword)
- * @return canonicalized keyword
- */
-static char *
-canonicalize_keyword (const char *in)
-{
- char *ret;
- char *wpos;
- const char *rpos;
-
- ret = GNUNET_strdup (in);
- wpos = ret;
- rpos = in;
- while ('\0' != *rpos)
- {
- switch (tolower ((unsigned char) *rpos))
- {
- case 'a':
- case 'e':
- case 'i':
- case 'o':
- case 'u':
- case ' ':
- case '\t':
- case '\n':
- case '\r':
- /* skip characters listed above */
- break;
- case 'b':
- case 'c':
- case 'd':
- case 'f':
- case 'g':
- case 'h':
- case 'j':
- case 'k':
- case 'l':
- case 'm':
- case 'n':
- case 'p':
- case 'r':
- case 's':
- case 't':
- case 'v':
- case 'w':
- case 'x':
- case 'y':
- case 'z':
- /* convert characters listed above to lower case */
- *wpos = tolower ((unsigned char) *rpos);
- wpos++;
- break;
- case '!':
- case '.':
- case '?':
- case '-':
- /* keep characters listed above without changes */
- *wpos = *rpos;
- wpos++;
- break;
- default:
- if (isspace ((unsigned char) *rpos) ||
- isdigit ((unsigned char) *rpos) )
- break;
- /* replace characters listed above with '_' */
- *wpos = '_';
- wpos++;
- break;
- }
- rpos++;
- }
- *wpos = '\0';
- return ret;
-}
-
-
-/**
- * Canonicalize keyword URI. Performs operations such
- * as decapitalization and removal of certain characters.
- * (useful for search).
- *
- * @param uri the URI to canonicalize
- * @return canonicalized version of the URI, NULL on error
- */
-struct GNUNET_FS_Uri *
-GNUNET_FS_uri_ksk_canonicalize (const struct GNUNET_FS_Uri *uri)
-{
- struct GNUNET_FS_Uri *ret;
- unsigned int kc;
- unsigned int i;
- const char *in;
- char *sb;
- char *cc;
- const char *tok;
-
- ret = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri));
- ret->type = ksk;
- kc = uri->data.ksk.keywordCount;
- for (i = 0; i < kc; i++)
- {
- in = uri->data.ksk.keywords[i];
- GNUNET_FS_uri_ksk_add_keyword (ret,
- &in[1],
- (in[0] == '+') ? GNUNET_YES : GNUNET_NO);
- sb = GNUNET_strdup (&in[1]);
-#define DELIMS " \\|\"'`/&@-_,.;!?+-*^$#~=[]{}()<>"
- for (tok = strtok (sb, DELIMS); NULL != tok; tok = strtok (NULL, DELIMS))
-#undef DELIMS
- {
- if (strlen(tok) < 3)
- continue;
- GNUNET_FS_uri_ksk_add_keyword (ret,
- tok,
- GNUNET_NO);
- cc = canonicalize_keyword (tok);
- if (strlen (cc) > 2)
- GNUNET_FS_uri_ksk_add_keyword (ret,
- cc,
- GNUNET_NO);
- }
- GNUNET_free (sb);
- }
- return ret;
-}
-
-
/**
* Merge the sets of keywords from two KSK URIs.
* (useful for merging the canonicalized keywords with
{
char *ret;
char *name;
+ char *unique_name;
if (uri->type != sks)
return NULL;
- name = GNUNET_PSEUDONYM_id_to_name (cfg, &uri->data.sks.namespace);
- if (name == NULL)
- return GNUNET_FS_uri_to_string (uri);
- GNUNET_asprintf (&ret, "%s: %s", name, uri->data.sks.identifier);
+ (void) GNUNET_PSEUDONYM_get_info (cfg, &uri->data.sks.namespace,
+ NULL, NULL, &name, NULL);
+ unique_name = GNUNET_PSEUDONYM_name_uniquify (cfg, &uri->data.sks.namespace, name, NULL);
GNUNET_free (name);
+ GNUNET_asprintf (&ret, "%s: %s", unique_name, uri->data.sks.identifier);
+ GNUNET_free (unique_name);
return ret;
}
if (uri->type == ksk)
{
- for (i = uri->data.ksk.keywordCount - 1; i >= 0; i--)
+ for (i=0;i < uri->data.ksk.keywordCount; i++)
GNUNET_assert (uri->data.ksk.keywords[i] != NULL);
}
#endif
}
+/**
+ * Store keyword 's' as a non-mandatory keyword in slot 'index' of
+ * the given keyword list.  The stored string is 's' with a single
+ * ' ' put in front of it.  The caller has to make sure that the
+ * array is long enough for slot 'index'.
+ *
+ * @param s keyword to add
+ * @param array array to add the keyword to
+ * @param index offset where to add the keyword
+ */
+static void
+insert_non_mandatory_keyword (const char *s, char **array, int index)
+{
+  /* the leading space marks the keyword as 'non mandatory' */
+  GNUNET_asprintf (&array[index], " %s", s);
+}
+
+
+/**
+ * Check whether keyword 's' already occurs in the given array.
+ * The first character of every array entry (the ' ' or '+' prefix)
+ * is skipped before comparing.
+ *
+ * @param s keyword to test
+ * @param array keywords to test against, with ' ' or '+' prefix to ignore
+ * @param array_length length of the array
+ * @return GNUNET_YES if the keyword exists, GNUNET_NO if not
+ */
+static int
+find_duplicate (const char *s, const char **array, int array_length)
+{
+  int pos;
+
+  for (pos = 0; pos < array_length; pos++)
+  {
+    const char *candidate = array[pos] + 1;   /* skip the prefix character */
+
+    if (0 == strcmp (candidate, s))
+      return GNUNET_YES;
+  }
+  return GNUNET_NO;
+}
+
+
+/**
+ * Create a lower-cased, NFD-normalized UTF-8 copy of a metadata string.
+ *
+ * Input declared as UTF-8 is first validated with u8_check(); input
+ * that fails validation is treated like a C string.  C strings are
+ * converted from the locale's character set to UTF-8 and the result
+ * of that conversion is what gets normalized.
+ *
+ * @param format format the caller claims 'data' has
+ * @param data 0-terminated string to normalize, may be NULL
+ * @param data_len number of bytes of 'data' to validate when 'format'
+ *        is EXTRACTOR_METAFORMAT_UTF8
+ * @return 0-terminated normalized string allocated with GNUNET_malloc
+ *         (to be released with GNUNET_free), or NULL if 'data' is
+ *         NULL or if conversion or normalization failed
+ */
+static char *
+normalize_metadata (enum EXTRACTOR_MetaFormat format, const char *data,
+                    size_t data_len)
+{
+  uint8_t *converted = NULL;
+  const uint8_t *str_to_normalize = (const uint8_t *) data;
+  uint8_t *lowered;
+  char *result;
+  size_t r_len;
+
+  if (NULL == data)
+    return NULL;
+  /* Don't trust libextractor: a non-NULL result from u8_check() means
+   * the data is not well-formed UTF-8, so fall back to the C string path */
+  if ( (EXTRACTOR_METAFORMAT_UTF8 == format) &&
+       (NULL != u8_check ((const uint8_t *) data, data_len)) )
+    format = EXTRACTOR_METAFORMAT_C_STRING;
+  if (EXTRACTOR_METAFORMAT_C_STRING == format)
+  {
+    converted = u8_strconv_from_encoding (data, locale_charset (), iconveh_escape_sequence);
+    if (NULL == converted)
+      return NULL;
+    /* normalize the converted text, not the raw input bytes */
+    str_to_normalize = converted;
+  }
+
+  lowered = u8_tolower (str_to_normalize, strlen ((const char *) str_to_normalize), NULL, UNINORM_NFD, NULL, &r_len);
+  /* 'converted' is allocated by libunistring internally, use free() */
+  free (converted);
+  if (NULL == lowered)
+    return NULL;
+  /* u8_tolower allocates a non-NULL-terminated string, so copy it into
+   * a 0-terminated buffer that callers can release with GNUNET_free */
+  result = GNUNET_malloc (r_len + 1);
+  memcpy (result, lowered, r_len);
+  result[r_len] = '\0';
+  free (lowered);
+  return result;
+}
+
+/**
+ * Count the UTF-8 characters (not bytes) of a 0-terminated string by
+ * stepping through it with u8_next().
+ *
+ * @param s string to measure, must not be NULL
+ * @return number of u8_next() steps that yielded a non-NULL position;
+ *         0 for the empty string
+ */
+static size_t
+u8_strcount (const uint8_t *s)
+{
+  size_t steps = 0;
+  ucs4_t ch;
+
+  GNUNET_assert (NULL != s);
+  if (0 == s[0])
+    return 0;
+  /* every step that yields a new position is one character; the
+   * final call, which yields NULL, is not counted */
+  while (NULL != (s = u8_next (&ch, s)))
+    steps++;
+  return steps;
+}
+
+
+/**
+ * Break the filename up by matching [], () and {} pairs to make
+ * keywords. In case of nesting parentheses only the inner pair counts.
+ * You can't escape parentheses to scan something like "[blah\{foo]" to
+ * make a "blah{foo" keyword, this function is only a heuristic!
+ *
+ * Enclosed tokens with fewer than 3 characters are ignored.  When
+ * extracting, each token may add up to two entries: the token itself
+ * and its normalized form (each only if not already present).
+ *
+ * @param s string to break down.
+ * @param array array to fill with enclosed tokens. If NULL, then tokens
+ *        are only counted.
+ * @param index index at which to start filling the array (entries prior
+ *        to it are used to check for duplicates). ignored if array == NULL.
+ * @return number of tokens counted (including duplicates), or number of
+ *         tokens extracted (excluding duplicates). 0 if there are no
+ *         matching parens in the string (when counting), or when all tokens
+ *         were duplicates (when extracting).
+ */
+static int
+get_keywords_from_parens (const char *s, char **array, int index)
+{
+  int count = 0;
+  char *ss;
+  char *cursor;
+  char *open_paren;
+
+  if (NULL == s)
+    return 0;
+  ss = GNUNET_strdup (s);
+  /* 'cursor' always points into 'ss'; no pointer before the start of
+   * the buffer is ever formed */
+  for (cursor = ss;
+       NULL != (open_paren = strpbrk (cursor, "[{("));
+       cursor = open_paren + 1)
+  {
+    char *inner = open_paren + 1;
+    char *close_paren;
+    char expected;
+    char saved;
+
+    close_paren = strpbrk (inner, "]})");
+    if (NULL == close_paren)
+      break;                    /* no closer left for this or any later opener */
+    switch (open_paren[0])
+    {
+    case '[':
+      expected = ']';
+      break;
+    case '{':
+      expected = '}';
+      break;
+    case '(':
+      expected = ')';
+      break;
+    default:
+      expected = '\0';          /* never equals a closer found by strpbrk */
+      break;
+    }
+    /* skip mismatched pairs and empty pairs like "()" */
+    if ( (expected != close_paren[0]) || (close_paren == inner) )
+      continue;
+    /* temporarily terminate the string at the closer so that 'inner'
+     * is exactly the enclosed token */
+    saved = close_paren[0];
+    close_paren[0] = '\0';
+    /* Keywords must be at least 3 characters long */
+    if (u8_strcount ((const uint8_t *) inner) > 2)
+    {
+      if (NULL == array)
+      {
+        count++;
+      }
+      else
+      {
+        char *normalized;
+
+        if (GNUNET_NO == find_duplicate ((const char *) inner,
+                                         (const char **) array, index + count))
+        {
+          insert_non_mandatory_keyword ((const char *) inner, array,
+                                        index + count);
+          count++;
+        }
+        normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8,
+                                         inner, (size_t) (close_paren - inner));
+        if (NULL != normalized)
+        {
+          if (GNUNET_NO == find_duplicate ((const char *) normalized,
+                                           (const char **) array, index + count))
+          {
+            insert_non_mandatory_keyword ((const char *) normalized, array,
+                                          index + count);
+            count++;
+          }
+          GNUNET_free (normalized);
+        }
+      }
+    }
+    /* put the closer back so that later iterations see the original text */
+    close_paren[0] = saved;
+  }
+  GNUNET_free (ss);
+  return count;
+}
+
+
+/**
+ * Where to break up keywords
+ */
+#define TOKENS "_. /-!?#&+@\"\'\\;:,"
+
+/**
+ * Break the filename up by TOKENS to make
+ * keywords.
+ *
+ * Tokens with fewer than 3 characters are ignored.  When extracting,
+ * each token may add up to two entries: the token itself and its
+ * normalized form (each only if not already present).
+ *
+ * @param s string to break down, NULL yields 0.
+ * @param array array to fill with tokens. If NULL, then tokens are only
+ *        counted.
+ * @param index index at which to start filling the array (entries prior
+ *        to it are used to check for duplicates). ignored if array == NULL.
+ * @return number of tokens (>1) counted (including duplicates), or number of
+ *         tokens extracted (excluding duplicates). 0 if there are no
+ *         separators in the string (when counting), or when all tokens were
+ *         duplicates (when extracting).
+ */
+static int
+get_keywords_from_tokens (const char *s, char **array, int index)
+{
+  char *p;
+  char *ss;
+  int seps = 0;
+
+  /* same guard as get_keywords_from_parens: nothing to tokenize */
+  if (NULL == s)
+    return 0;
+  /* strtok modifies its argument, so work on a private copy */
+  ss = GNUNET_strdup (s);
+  for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS))
+  {
+    /* Keywords must be at least 3 characters long */
+    if (u8_strcount ((const uint8_t *) p) <= 2)
+      continue;
+    if (NULL != array)
+    {
+      char *normalized;
+
+      if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps))
+      {
+        insert_non_mandatory_keyword (p, array,
+                                      index + seps);
+        seps++;
+      }
+      normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8,
+                                       p, strlen (p));
+      if (normalized != NULL)
+      {
+        if (GNUNET_NO == find_duplicate ((const char *) normalized,
+                                         (const char **) array, index + seps))
+        {
+          insert_non_mandatory_keyword ((const char *) normalized, array,
+                                        index + seps);
+          seps++;
+        }
+        GNUNET_free (normalized);
+      }
+    }
+    else
+      seps++;
+  }
+  GNUNET_free (ss);
+  return seps;
+}
+#undef TOKENS
+
/**
* Function called on each value in the meta data.
* Adds it to the URI.
const char *data_mime_type, const char *data, size_t data_len)
{
struct GNUNET_FS_Uri *uri = cls;
- char *nkword;
- int j;
+ char *normalized_data;
if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
(format != EXTRACTOR_METAFORMAT_C_STRING))
return 0;
- for (j = uri->data.ksk.keywordCount - 1; j >= 0; j--)
- if (0 == strcmp (&uri->data.ksk.keywords[j][1], data))
- return GNUNET_OK;
- GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */
- data);
- uri->data.ksk.keywords[uri->data.ksk.keywordCount++] = nkword;
+ /* Keywords must be at least 3 characters long
+ * If given non-utf8 string it will, most likely, find it to be invalid,
+ * and will return the length of its valid part, skipping the keyword.
+ * If it does - fix the extractor, not this check!
+ */
+ if (u8_strcount ((const uint8_t *) data) <= 2)
+ {
+ return 0;
+ }
+ normalized_data = normalize_metadata (format, data, data_len);
+ if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+ {
+ insert_non_mandatory_keyword (data,
+ uri->data.ksk.keywords, uri->data.ksk.keywordCount);
+ uri->data.ksk.keywordCount++;
+ }
+ if (normalized_data != NULL)
+ {
+ if (!find_duplicate (normalized_data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+ {
+ insert_non_mandatory_keyword (normalized_data,
+ uri->data.ksk.keywords, uri->data.ksk.keywordCount);
+ uri->data.ksk.keywordCount++;
+ }
+ GNUNET_free (normalized_data);
+ }
return 0;
}
*md)
{
struct GNUNET_FS_Uri *ret;
+ char *filename;
+ char *full_name = NULL;
+ char *ss;
int ent;
+ int tok_keywords = 0;
+ int paren_keywords = 0;
if (md == NULL)
return NULL;
ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL);
if (ent > 0)
{
- ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * ent);
+ full_name = GNUNET_CONTAINER_meta_data_get_first_by_types (md,
+ EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME, -1);
+ if (NULL != full_name)
+ {
+ filename = full_name;
+ while (NULL != (ss = strstr (filename, DIR_SEPARATOR_STR)))
+ filename = ss + 1;
+ tok_keywords = get_keywords_from_tokens (filename, NULL, 0);
+ paren_keywords = get_keywords_from_parens (filename, NULL, 0);
+ }
+ /* x2 because there might be a normalized variant of every keyword */
+ ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent
+ + tok_keywords + paren_keywords) * 2);
GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret);
}
+ if (tok_keywords > 0)
+ ret->data.ksk.keywordCount += get_keywords_from_tokens (filename,
+ ret->data.ksk.keywords,
+ ret->data.ksk.keywordCount);
+ if (paren_keywords > 0)
+ ret->data.ksk.keywordCount += get_keywords_from_parens (filename,
+ ret->data.ksk.keywords,
+ ret->data.ksk.keywordCount);
+ if (ent > 0)
+ GNUNET_free_non_null (full_name);
return ret;
}