X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=src%2Ffs%2Ffs_uri.c;h=0c2d64caca85186b4c5247e039e62ac935c994e1;hb=04630c5e40fc4de16393894d0b5ff2ca9055f4e2;hp=206a8adca82f5e22de6aa196beff094969fa6c9b;hpb=16a6919a9f98ee9fa1fee9dd262906c321004a19;p=oweals%2Fgnunet.git diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c index 206a8adca..0c2d64cac 100644 --- a/src/fs/fs_uri.c +++ b/src/fs/fs_uri.c @@ -81,7 +81,13 @@ #include "platform.h" #include "gnunet_fs_service.h" #include "gnunet_signatures.h" -#include "fs.h" +#include "fs_api.h" +#include +#include +#include +#include +#include + /** @@ -125,7 +131,7 @@ GNUNET_FS_uri_to_key (const struct GNUNET_FS_Uri *uri, GNUNET_HashCode * key) * Convert keyword URI to a human readable format * (i.e. the search query that was used in the first place) * - * @param uri ksk uri to convert to a string + * @param uri ksk uri to convert to a string * @return string with the keywords */ char * @@ -187,7 +193,7 @@ GNUNET_FS_uri_ksk_to_string_fancy (const struct GNUNET_FS_Uri *uri) * spaces), return a copy of the keyword without %-encoding and * without double-quotes (%22). Also, add a space at the beginning * if there is not a '+'. - * + * * @param in string with %-encoding * @param emsg where to store the parser error message (if any) * @return decodded string with leading space (or preserved plus) @@ -208,7 +214,7 @@ percent_decode_keyword (const char *in, char **emsg) { if (out[rpos] == '%') { - if (1 != sscanf (&out[rpos + 1], "%2X", &hx)) + if (1 != SSCANF (&out[rpos + 1], "%2X", &hx)) { GNUNET_free (out); *emsg = GNUNET_strdup (_("`%' must be followed by HEX number")); @@ -969,109 +975,6 @@ GNUNET_FS_uri_sks_create_from_nsid (GNUNET_HashCode * nsid, const char *id) } -/** - * Canonicalize a keyword. 
- * - * @param in input string (the keyword) - * @return canonicalized keyword - */ -static char * -canonicalize_keyword (const char *in) -{ - char *ret; - char *wpos; - const char *rpos; - - ret = GNUNET_strdup (in); - wpos = ret; - rpos = in; - while ('\0' != *rpos) - { - switch (tolower ((unsigned char) *rpos)) - { - case 'a': - case 'e': - case 'i': - case 'o': - case 'u': - case ' ': - case '\t': - case '\n': - case '\r': - /* skip characters listed above */ - break; - case 'b': - case 'c': - case 'd': - case 'f': - case 'g': - case 'h': - case 'j': - case 'k': - case 'l': - case 'm': - case 'n': - case 'p': - case 'r': - case 's': - case 't': - case 'v': - case 'w': - case 'x': - case 'y': - case 'z': - /* convert characters listed above to lower case */ - *wpos = tolower ((unsigned char) *rpos); - wpos++; - break; - case '!': - case '.': - case '?': - case '-': - /* keep characters listed above without changes */ - *wpos = *rpos; - wpos++; - break; - default: - /* replace characters listed above with '_' */ - *wpos = '_'; - wpos++; - break; - } - rpos++; - } - return ret; -} - - -/** - * Canonicalize keyword URI. Performs operations such - * as decapitalization and removal of certain characters. - * (useful for search). - * - * @param uri the URI to canonicalize - * @return canonicalized version of the URI, NULL on error - */ -struct GNUNET_FS_Uri * -GNUNET_FS_uri_ksk_canonicalize (const struct GNUNET_FS_Uri *uri) -{ - struct GNUNET_FS_Uri *ret; - unsigned int kc; - unsigned int i; - char **kl; - - kc = uri->data.ksk.keywordCount; - kl = GNUNET_malloc (kc * sizeof (char *)); - for (i = 0; i < kc; i++) - kl[i] = canonicalize_keyword (uri->data.ksk.keywords[i]); - ret = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri)); - ret->type = ksk; - ret->data.ksk.keywordCount = kc; - ret->data.ksk.keywords = kl; - return ret; -} - - /** * Merge the sets of keywords from two KSK URIs. 
* (useful for merging the canonicalized keywords with @@ -1475,14 +1378,16 @@ GNUNET_FS_uri_sks_to_string_fancy (struct GNUNET_CONFIGURATION_Handle *cfg, { char *ret; char *name; + char *unique_name; if (uri->type != sks) return NULL; - name = GNUNET_PSEUDONYM_id_to_name (cfg, &uri->data.sks.namespace); - if (name == NULL) - return GNUNET_FS_uri_to_string (uri); - GNUNET_asprintf (&ret, "%s: %s", name, uri->data.sks.identifier); + (void) GNUNET_PSEUDONYM_get_info (cfg, &uri->data.sks.namespace, + NULL, NULL, &name, NULL); + unique_name = GNUNET_PSEUDONYM_name_uniquify (cfg, &uri->data.sks.namespace, name, NULL); GNUNET_free (name); + GNUNET_asprintf (&ret, "%s: %s", unique_name, uri->data.sks.identifier); + GNUNET_free (unique_name); return ret; } @@ -1501,7 +1406,7 @@ GNUNET_FS_uri_test_ksk (const struct GNUNET_FS_Uri *uri) if (uri->type == ksk) { - for (i = uri->data.ksk.keywordCount - 1; i >= 0; i--) + for (i=0;i < uri->data.ksk.keywordCount; i++) GNUNET_assert (uri->data.ksk.keywords[i] != NULL); } #endif @@ -1558,6 +1463,268 @@ GNUNET_FS_uri_test_loc (const struct GNUNET_FS_Uri *uri) } +/** + * Add a keyword as non-mandatory (with ' '-prefix) to the + * given keyword list at offset 'index'. The array is + * guaranteed to be long enough. The slot is overwritten with + * a string newly built by GNUNET_asprintf (" " followed by 's'); + * whatever was stored at that offset before is not freed here. + * + * @param s keyword to add (only read, the array gets its own prefixed copy) + * @param array array to add the keyword to + * @param index offset where to add the keyword + */ +static void +insert_non_mandatory_keyword (const char *s, char **array, int index) +{ + char *nkword; + GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */ s); + array[index] = nkword; +} + + +/** + * Test if the given keyword 's' is already present in the + * given array, ignoring the '+'-mandatory prefix in the array. 
+ * + * @param s keyword to test + * @param array keywords to test against, with ' ' or '+' prefix to ignore + * @param array_length length of the array + * @return GNUNET_YES if the keyword exists, GNUNET_NO if not + */ +static int +find_duplicate (const char *s, const char **array, int array_length) +{ + int j; + + for (j = array_length - 1; j >= 0; j--) + if (0 == strcmp (&array[j][1], s)) + return GNUNET_YES; + return GNUNET_NO; +} + + +/** + * FIXME: comment + */ +static char * +normalize_metadata (enum EXTRACTOR_MetaFormat format, const char *data, + size_t data_len) +{ + uint8_t *free_str = NULL; + uint8_t *str_to_normalize = (uint8_t *) data; + uint8_t *normalized; + size_t r_len; + if (str_to_normalize == NULL) + return NULL; + /* Don't trust libextractor */ + if (format == EXTRACTOR_METAFORMAT_UTF8) + { + free_str = (uint8_t *) u8_check ((const uint8_t *) data, data_len); + if (free_str == NULL) + free_str = NULL; + else + format = EXTRACTOR_METAFORMAT_C_STRING; + } + if (format == EXTRACTOR_METAFORMAT_C_STRING) + { + free_str = u8_strconv_from_encoding (data, locale_charset (), iconveh_escape_sequence); + if (free_str == NULL) + return NULL; + } + + normalized = u8_tolower (str_to_normalize, strlen ((char *) str_to_normalize), NULL, UNINORM_NFD, NULL, &r_len); + /* free_str is allocated by libunistring internally, use free() */ + if (free_str != NULL) + free (free_str); + if (normalized != NULL) + { + /* u8_tolower allocates a non-NULL-terminated string! */ + free_str = GNUNET_malloc (r_len + 1); + memcpy (free_str, normalized, r_len); + free_str[r_len] = '\0'; + free (normalized); + normalized = free_str; + } + return (char *) normalized; +} + +/** + * Counts the number of UTF-8 characters (not bytes) in the string, + * returns that count. 
+ */
static size_t
u8_strcount (const uint8_t *s)
{
  size_t chars = 0;
  ucs4_t unused;

  GNUNET_assert (s != NULL);
  /* each non-NULL result of u8_next() means one more character was
   * consumed; a NULL result ends the walk (for the empty string this
   * happens on the very first call, giving 0) */
  while (NULL != (s = u8_next (&unused, s)))
    chars++;
  return chars;
}


/**
 * Break the filename up by matching [], () and {} pairs to make
 * keywords. In case of nesting parentheses only the inner pair counts.
 * You can't escape parentheses to scan something like "[blah\{foo]" to
 * make a "blah{foo" keyword, this function is only a heuristic!
 *
 * @param s string to break down.
 * @param array array to fill with enclosed tokens. If NULL, then tokens
 *        are only counted.
 * @param index index at which to start filling the array (entries prior
 *        to it are used to check for duplicates). ignored if array == NULL.
 * @return number of tokens counted (including duplicates), or number of
 *         tokens extracted (excluding duplicates). 0 if there are no
 *         matching parens in the string (when counting), or when all tokens
 *         were duplicates (when extracting).
+ */ +static int +get_keywords_from_parens (const char *s, char **array, int index) +{ + int count = 0; + char *open_paren; + char *close_paren; + char *ss; + char tmp; + + if (NULL == s) + return 0; + ss = GNUNET_strdup (s); + open_paren = ss - 1; + while (NULL != (open_paren = strpbrk (open_paren + 1, "[{("))) + { + int match = 0; + + close_paren = strpbrk (open_paren + 1, "]})"); + if (NULL == close_paren) + continue; + switch (open_paren[0]) + { + case '[': + if (']' == close_paren[0]) + match = 1; + break; + case '{': + if ('}' == close_paren[0]) + match = 1; + break; + case '(': + if (')' == close_paren[0]) + match = 1; + break; + default: + break; + } + if (match && (close_paren - open_paren > 1)) + { + tmp = close_paren[0]; + close_paren[0] = '\0'; + /* Keywords must be at least 3 characters long */ + if (u8_strcount ((const uint8_t *) &open_paren[1]) <= 2) + { + close_paren[0] = tmp; + continue; + } + if (NULL != array) + { + char *normalized; + if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], + (const char **) array, index + count)) + { + insert_non_mandatory_keyword ((const char *) &open_paren[1], array, + index + count); + count++; + } + normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8, + &open_paren[1], close_paren - &open_paren[1]); + if (normalized != NULL) + { + if (GNUNET_NO == find_duplicate ((const char *) normalized, + (const char **) array, index + count)) + { + insert_non_mandatory_keyword ((const char *) normalized, array, + index + count); + count++; + } + GNUNET_free (normalized); + } + } + else + count++; + close_paren[0] = tmp; + } + } + GNUNET_free (ss); + return count; +} + + +/** + * Where to break up keywords + */ +#define TOKENS "_. /-!?#&+@\"\'\\;:," + +/** + * Break the filename up by TOKENS to make + * keywords. + * + * @param s string to break down. + * @param array array to fill with tokens. If NULL, then tokens are only + * counted. 
+ * @param index index at which to start filling the array (entries prior + * to it are used to check for duplicates). ignored if array == NULL. + * @return number of tokens (>1) counted (including duplicates), or number of + * tokens extracted (excluding duplicates). 0 if there are no + * separators in the string (when counting), or when all tokens were + * duplicates (when extracting). + */ +static int +get_keywords_from_tokens (const char *s, char **array, int index) +{ + char *p; + char *ss; + int seps = 0; + + ss = GNUNET_strdup (s); + for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS)) + { + /* Keywords must be at least 3 characters long */ + if (u8_strcount ((const uint8_t *) p) <= 2) + continue; + if (NULL != array) + { + char *normalized; + if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps)) + { + insert_non_mandatory_keyword (p, array, + index + seps); + seps++; + } + normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8, + p, strlen (p)); + if (normalized != NULL) + { + if (GNUNET_NO == find_duplicate ((const char *) normalized, + (const char **) array, index + seps)) + { + insert_non_mandatory_keyword ((const char *) normalized, array, + index + seps); + seps++; + } + GNUNET_free (normalized); + } + } + else + seps++; + } + GNUNET_free (ss); + return seps; +} +#undef TOKENS + /** * Function called on each value in the meta data. * Adds it to the URI. @@ -1568,7 +1735,7 @@ GNUNET_FS_uri_test_loc (const struct GNUNET_FS_Uri *uri) * used in the main libextractor library and yielding * meta data). 
* @param type libextractor-type describing the meta data - * @param format basic format information about data + * @param format basic format information about data * @param data_mime_type mime-type of data (not of the original file); * can be NULL (if mime-type is not known) * @param data actual meta-data found @@ -1581,18 +1748,37 @@ gather_uri_data (void *cls, const char *plugin_name, const char *data_mime_type, const char *data, size_t data_len) { struct GNUNET_FS_Uri *uri = cls; - char *nkword; - int j; + char *normalized_data; if ((format != EXTRACTOR_METAFORMAT_UTF8) && (format != EXTRACTOR_METAFORMAT_C_STRING)) return 0; - for (j = uri->data.ksk.keywordCount - 1; j >= 0; j--) - if (0 == strcmp (&uri->data.ksk.keywords[j][1], data)) - return GNUNET_OK; - GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */ - data); - uri->data.ksk.keywords[uri->data.ksk.keywordCount++] = nkword; + /* Keywords must be at least 3 characters long + * If given non-utf8 string it will, most likely, find it to be invalid, + * and will return the length of its valid part, skipping the keyword. + * If it does - fix the extractor, not this check! 
+ */ + if (u8_strcount ((const uint8_t *) data) <= 2) + { + return 0; + } + normalized_data = normalize_metadata (format, data, data_len); + if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) + { + insert_non_mandatory_keyword (data, + uri->data.ksk.keywords, uri->data.ksk.keywordCount); + uri->data.ksk.keywordCount++; + } + if (normalized_data != NULL) + { + if (!find_duplicate (normalized_data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) + { + insert_non_mandatory_keyword (normalized_data, + uri->data.ksk.keywords, uri->data.ksk.keywordCount); + uri->data.ksk.keywordCount++; + } + GNUNET_free (normalized_data); + } return 0; } @@ -1610,7 +1796,12 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData *md) { struct GNUNET_FS_Uri *ret; + char *filename; + char *full_name = NULL; + char *ss; int ent; + int tok_keywords = 0; + int paren_keywords = 0; if (md == NULL) return NULL; @@ -1619,9 +1810,31 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL); if (ent > 0) { - ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * ent); + full_name = GNUNET_CONTAINER_meta_data_get_first_by_types (md, + EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME, -1); + if (NULL != full_name) + { + filename = full_name; + while (NULL != (ss = strstr (filename, DIR_SEPARATOR_STR))) + filename = ss + 1; + tok_keywords = get_keywords_from_tokens (filename, NULL, 0); + paren_keywords = get_keywords_from_parens (filename, NULL, 0); + } + /* x2 because there might be a normalized variant of every keyword */ + ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent + + tok_keywords + paren_keywords) * 2); GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret); } + if (tok_keywords > 0) + ret->data.ksk.keywordCount += get_keywords_from_tokens (filename, + ret->data.ksk.keywords, + 
ret->data.ksk.keywordCount); + if (paren_keywords > 0) + ret->data.ksk.keywordCount += get_keywords_from_parens (filename, + ret->data.ksk.keywords, + ret->data.ksk.keywordCount); + if (ent > 0) + GNUNET_free_non_null (full_name); return ret; }