--- /dev/null
+/*\r
+ This file is part of GNUnet\r
+ (C) 2005-2012 Christian Grothoff (and other contributing authors)\r
+\r
+ GNUnet is free software; you can redistribute it and/or modify\r
+ it under the terms of the GNU General Public License as published\r
+ by the Free Software Foundation; either version 2, or (at your\r
+ option) any later version.\r
+\r
+ GNUnet is distributed in the hope that it will be useful, but\r
+ WITHOUT ANY WARRANTY; without even the implied warranty of\r
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
+ General Public License for more details.\r
+\r
+ You should have received a copy of the GNU General Public License\r
+ along with GNUnet; see the file COPYING. If not, write to the\r
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,\r
+ Boston, MA 02111-1307, USA.\r
+*/\r
+\r
+#include "platform.h"\r
+#include "gnunet_fs_service.h"\r
+#include "gnunet_scheduler_lib.h"\r
+\r
+/**\r
+ * Entry for each unique keyword to track how often\r
+ * it occurred. Contains the keyword and the counter.\r
+ * Entries are allocated and linked by add_to_keyword_counter().\r
+ */\r
+struct KeywordCounter\r
+{\r
+\r
+ /**\r
+ * Keyword that was found. add_to_keyword_counter() points this at\r
+ * a copy of the string stored directly behind the struct, so it is\r
+ * released together with the struct.\r
+ */\r
+ const char *value;\r
+\r
+ /**\r
+ * How many files have this keyword?\r
+ * Incremented once per add_to_keyword_counter() call for this keyword.\r
+ */\r
+ unsigned int count;\r
+\r
+ /**\r
+ * This is a doubly-linked list (previous entry, NULL if none).\r
+ */\r
+ struct KeywordCounter *prev;\r
+\r
+ /**\r
+ * This is a doubly-linked list (next entry, NULL if none).\r
+ */\r
+ struct KeywordCounter *next;\r
+};\r
+\r
+/**\r
+ * Aggregate information we keep for meta data in each directory.\r
+ * Entries are allocated and filled in by add_to_meta_counter().\r
+ */\r
+struct MetaCounter\r
+{\r
+ /**\r
+ * The actual meta data. Stored by reference: add_to_meta_counter()\r
+ * keeps the pointer it was given and does not copy the bytes.\r
+ */\r
+ const char *data;\r
+\r
+ /**\r
+ * Number of bytes in 'data'.\r
+ */\r
+ size_t data_size;\r
+\r
+ /**\r
+ * Name of the plugin that provided that piece of metadata\r
+ * (stored by reference, not copied).\r
+ */\r
+ const char *plugin_name;\r
+\r
+ /**\r
+ * Type of the data\r
+ */\r
+ enum EXTRACTOR_MetaType type;\r
+\r
+ /**\r
+ * Format of the data\r
+ */\r
+ enum EXTRACTOR_MetaFormat format;\r
+\r
+ /**\r
+ * MIME-type of the metadata itself (stored by reference, may be NULL).\r
+ */\r
+ const char *data_mime_type;\r
+\r
+ /**\r
+ * How many files have meta entries matching this value?\r
+ * (type and format do not have to match).\r
+ */\r
+ unsigned int count;\r
+\r
+ /**\r
+ * This is a doubly-linked list (previous entry, NULL if none).\r
+ */\r
+ struct MetaCounter *prev;\r
+\r
+ /**\r
+ * This is a doubly-linked list (next entry, NULL if none).\r
+ */\r
+ struct MetaCounter *next;\r
+};\r
+\r
+/**\r
+ * Execution context for the directory scan ('scan_directory').\r
+ * Owned by the initiator thread.\r
+ */\r
+struct AddDirContext\r
+{\r
+ /**\r
+ * Parent directory (used to access keyword and metadata counters,\r
+ * and the like).\r
+ * After the scan is finished, it will contain a pointer to the\r
+ * top-level directory entry in the directory tree built by the\r
+ * scanner.\r
+ */\r
+ struct ShareTreeItem *parent;\r
+\r
+ /**\r
+ * Expanded filename (as given by the scan initiator).\r
+ * The scanner thread frees it when it finishes.\r
+ */\r
+ char *filename_expanded;\r
+\r
+ /**\r
+ * A synchronization primitive. Whenever its state is altered,\r
+ * it means that the initiator wants the scanner to wrap up.\r
+ * It is owned by the initiator thread; the scanner only polls it\r
+ * (see should_stop).\r
+ */\r
+#if WINDOWS\r
+ HANDLE stop;\r
+#else\r
+ sem_t *stop;\r
+#endif\r
+\r
+ /**\r
+ * 1 if the scanner should stop, 0 otherwise. Set in response\r
+ * to communication errors or when the initiator wants the scanning\r
+ * process to stop.\r
+ */\r
+ char do_stop;\r
+\r
+ /**\r
+ * Handle of the pipe end into which the progress messages are written.\r
+ * The pipe is owned by the initiator thread, and there's no way to\r
+ * close this end without having access to the pipe, so it won't\r
+ * be closed by the scanner thread.\r
+ * The initiator MUST keep it alive until the scanner thread is finished.\r
+ */\r
+ const struct GNUNET_DISK_FileHandle *progress_write;\r
+\r
+\r
+ /**\r
+ * List of libextractor plugins to use for extracting.\r
+ * Initialized when the scan starts, removed when it finishes.\r
+ * NULL if extraction is disabled.\r
+ */\r
+ struct EXTRACTOR_PluginList *plugins;\r
+};\r
+\r
+/**\r
+ * An opaque structure a pointer to which is returned to the\r
+ * caller to be used to control the scanner.\r
+ */\r
+struct GNUNET_FS_DirScanner\r
+{\r
+  /**\r
+   * A synchronization primitive that is used to signal the scanner to stop.\r
+   * Owned by the initiator thread.\r
+   * On POSIX the semaphore object itself lives here: it is initialized,\r
+   * posted and destroyed through '&ds->stop', and the scanner context\r
+   * only keeps a pointer to it.\r
+   */\r
+#if WINDOWS\r
+  HANDLE stop;\r
+#else\r
+  sem_t stop;\r
+#endif\r
+\r
+  /**\r
+   * A thread object for the scanner thread.\r
+   * Owned by the initiator thread.\r
+   */\r
+#if WINDOWS\r
+  HANDLE thread;\r
+#else\r
+  pthread_t thread;\r
+#endif\r
+\r
+  /**\r
+   * A task for reading progress messages from the scanner.\r
+   * GNUNET_SCHEDULER_NO_TASK while no read is scheduled.\r
+   */\r
+  GNUNET_SCHEDULER_TaskIdentifier progress_read_task;\r
+\r
+  /**\r
+   * The end of the pipe that is used to read progress messages.\r
+   */\r
+  const struct GNUNET_DISK_FileHandle *progress_read;\r
+\r
+  /**\r
+   * The pipe that is used to read progress messages.\r
+   * Owned (along with both of its ends) by the initiator thread.\r
+   * Only closed after the scanner thread is finished.\r
+   */\r
+  struct GNUNET_DISK_PipeHandle *progress_pipe;\r
+\r
+  /**\r
+   * The function that will be called every time there's a progress\r
+   * message.\r
+   */\r
+  GNUNET_FS_DirScannerProgressCallback progress_callback;\r
+\r
+  /**\r
+   * A closure for progress_callback.\r
+   */\r
+  void *cls;\r
+\r
+  /**\r
+   * A pointer to the context of the scanner.\r
+   * Owned by the initiator thread.\r
+   * Initiator thread shouldn't touch it until the scanner thread\r
+   * is finished.\r
+   */\r
+  struct AddDirContext *adc;\r
+};\r
+\r
+/**\r
+ * A structure that forms a singly-linked list that serves as a stack\r
+ * for the metadata-processing function.\r
+ */\r
+struct ProcessMetadataStackItem\r
+{\r
+ /**\r
+ * A pointer to metadata-processing context.\r
+ * The same in every stack item.\r
+ */\r
+ struct ProcessMetadataContext *ctx;\r
+\r
+ /**\r
+ * This is a singly-linked list. A pointer to its end is kept, and\r
+ * this pointer is used to walk it backwards.\r
+ */\r
+ struct ProcessMetadataStackItem *parent;\r
+\r
+ /**\r
+ * Map from the hash over the keyword to a 'struct KeywordCounter *'\r
+ * counter that says how often this keyword was\r
+ * encountered in the current directory.\r
+ */\r
+ struct GNUNET_CONTAINER_MultiHashMap *keywordcounter;\r
+\r
+ /**\r
+ * Map from the hash over the metadata to a 'struct MetaCounter *'\r
+ * counter that says how often this metadata was\r
+ * encountered in the current directory.\r
+ */\r
+ struct GNUNET_CONTAINER_MultiHashMap *metacounter;\r
+\r
+ /**\r
+ * Number of files in the current directory.\r
+ */\r
+ unsigned int dir_entry_count;\r
+\r
+ /**\r
+ * Keywords to exclude from using for KSK since they'll be associated\r
+ * with the parent as well. NULL for nothing blocked.\r
+ */\r
+ struct GNUNET_FS_Uri *exclude_ksk;\r
+\r
+ /**\r
+ * A share tree item that is being processed.\r
+ */\r
+ struct ShareTreeItem *item;\r
+\r
+ /**\r
+ * Set to GNUNET_YES to indicate that the directory pointed to by 'item'\r
+ * was processed, and we should move on to the next.\r
+ * Otherwise the directory will be recursed into.\r
+ */\r
+ int end_directory;\r
+\r
+};\r
+\r
+/**\r
+ * The structure to keep the state of metadata processing.\r
+ * Every 'struct ProcessMetadataStackItem' points back at it via 'ctx'.\r
+ */\r
+struct ProcessMetadataContext\r
+{\r
+ /**\r
+ * The top of the stack.\r
+ */\r
+ struct ProcessMetadataStackItem *stack;\r
+\r
+ /**\r
+ * Callback to invoke when processing is finished.\r
+ */\r
+ GNUNET_SCHEDULER_Task cb;\r
+\r
+ /**\r
+ * Closure for 'cb'.\r
+ */\r
+ void *cls;\r
+\r
+ /**\r
+ * Toplevel directory item of the tree to process.\r
+ */\r
+ struct ShareTreeItem *toplevel;\r
+};\r
+\r
+/**\r
+ * Called every now and then by the scanner.\r
+ * Polls the synchronization primitive without blocking and, if it\r
+ * has been signalled, latches that fact in 'adc->do_stop'.\r
+ *\r
+ * @param adc scanner context whose 'stop' primitive is polled\r
+ * @return 1 if the scanner should stop, 0 otherwise\r
+ */\r
+static int\r
+should_stop (struct AddDirContext *adc)\r
+{\r
+#if WINDOWS\r
+  if (WAIT_TIMEOUT == WaitForSingleObject (adc->stop, 0))\r
+    return 0;\r
+  adc->do_stop = 1;\r
+  return 1;\r
+#else\r
+  int value = 0;\r
+\r
+  /* If sem_getvalue() fails, 'value' may never have been written;\r
+   * treat that as "no stop requested" instead of reading garbage. */\r
+  if (0 != sem_getvalue (adc->stop, &value))\r
+    return 0;\r
+  if (value <= 0)\r
+    return 0;\r
+  adc->do_stop = 1;\r
+  return 1;\r
+#endif\r
+}\r
+\r
+/**\r
+ * Write progress message.\r
+ * Format is:\r
+ * <reason><filename length><filename><directory flag>\r
+ * If filename is NULL, filename is not written, and its length\r
+ * is written as 0, and nothing else is written. It signals the initiator\r
+ * thread that the scanner is finished, and that it can now join its thread.\r
+ *\r
+ * Also checks if the initiator thread wants the scanner to stop,\r
+ * Returns 1 to stop scanning (if the signal was received, or\r
+ * if the pipe was broken somehow), 0 otherwise.\r
+ */\r
+static int\r
+write_progress (struct AddDirContext *adc, const char *filename,\r
+ char is_directory, enum GNUNET_DirScannerProgressUpdateReason reason)\r
+{\r
+ size_t filename_len;\r
+ size_t wr;\r
+ size_t total_write;\r
+ if ((adc->do_stop || should_stop (adc)) && reason != GNUNET_DIR_SCANNER_ASKED_TO_STOP\r
+ && reason != GNUNET_DIR_SCANNER_FINISHED)\r
+ return 1;\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &reason, sizeof (reason));\r
+ while (wr > 0 && total_write < sizeof (reason))\r
+ {\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &((char *)&reason)[total_write], sizeof (reason) - total_write);\r
+ if (wr > 0)\r
+ total_write += wr;\r
+ }\r
+ if (sizeof (reason) != wr)\r
+ return 1;\r
+ if (filename)\r
+ filename_len = strlen (filename) + 1;\r
+ else\r
+ filename_len = 0;\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &filename_len, sizeof (size_t));\r
+ while (wr > 0 && total_write < sizeof (size_t))\r
+ {\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &((char *)&filename_len)[total_write], sizeof (size_t) - total_write);\r
+ if (wr > 0)\r
+ total_write += wr;\r
+ }\r
+ if (sizeof (size_t) != wr)\r
+ return 1;\r
+ if (filename)\r
+ {\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ filename, filename_len);\r
+ while (wr > 0 && total_write < filename_len)\r
+ {\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &((char *)filename)[total_write], filename_len - total_write);\r
+ if (wr > 0)\r
+ total_write += wr;\r
+ }\r
+ if (filename_len != wr)\r
+ return 1;\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &is_directory, sizeof (char));\r
+ while (wr > 0 && total_write < sizeof (char))\r
+ {\r
+ total_write = wr = GNUNET_DISK_file_write (adc->progress_write,\r
+ &((char *)&is_directory)[total_write], sizeof (char) - total_write);\r
+ if (wr > 0)\r
+ total_write += wr;\r
+ }\r
+ if (sizeof (char) != wr)\r
+ return 1;\r
+ }\r
+ return 0;\r
+}\r
+\r
+/**\r
+ * Add the given keyword to the\r
+ * keyword statistics tracker.\r
+ *\r
+ * @param cls the 'struct GNUNET_CONTAINER_MultiHashMap *' holding the counters\r
+ * @param keyword the keyword to count\r
+ * @param is_mandatory ignored\r
+ * @return always GNUNET_OK\r
+ */\r
+static int\r
+add_to_keyword_counter (void *cls, const char *keyword, int is_mandatory)\r
+{\r
+  struct GNUNET_CONTAINER_MultiHashMap *mcm = cls;\r
+  struct KeywordCounter *cnt;\r
+  struct KeywordCounter *first_cnt;\r
+  GNUNET_HashCode hc;\r
+  size_t klen;\r
+\r
+  klen = strlen (keyword) + 1;\r
+  GNUNET_CRYPTO_hash (keyword, klen - 1, &hc);\r
+  /* Since the map might contain multiple values per keyword, we only\r
+   * store one value, and attach all other to it, forming a linked list.\r
+   * Somewhat easier than retrieving multiple items via callback.\r
+   */\r
+  first_cnt = GNUNET_CONTAINER_multihashmap_get (mcm, &hc);\r
+  for (cnt = first_cnt; NULL != cnt; cnt = cnt->next)\r
+    if (0 == strcmp (cnt->value, keyword))\r
+      break;\r
+  if (NULL == cnt)\r
+  {\r
+    /* the keyword is stored right behind the struct, one allocation */\r
+    cnt = GNUNET_malloc (sizeof (struct KeywordCounter) + klen);\r
+    cnt->value = (const char *) &cnt[1];\r
+    memcpy (&cnt[1], keyword, klen);\r
+    if (NULL == first_cnt)\r
+    {\r
+      GNUNET_CONTAINER_multihashmap_put (mcm, &hc, cnt,\r
+          GNUNET_CONTAINER_MULTIHASHMAPOPTION_MULTIPLE);\r
+    }\r
+    else\r
+    {\r
+      /* Link the new counter directly AFTER the entry stored in the\r
+       * map: the lookup above walks the chain through 'next', so a\r
+       * counter linked in front of 'first_cnt' would never be found. */\r
+      cnt->prev = first_cnt;\r
+      cnt->next = first_cnt->next;\r
+      if (NULL != first_cnt->next)\r
+        first_cnt->next->prev = cnt;\r
+      first_cnt->next = cnt;\r
+    }\r
+  }\r
+  cnt->count++;\r
+  return GNUNET_OK;\r
+}\r
+\r
+/**\r
+ * Type of a function that libextractor calls for each\r
+ * meta data item found. Counts how often each distinct\r
+ * meta data value is seen.\r
+ *\r
+ * The counter keeps the 'data', 'plugin_name' and 'data_mime_type'\r
+ * pointers as given (nothing is copied).\r
+ * NOTE(review): this presumably relies on the caller keeping those\r
+ * buffers alive for as long as the counter exists - confirm.\r
+ *\r
+ * @param cls the container multihashmap to update\r
+ * @param plugin_name name of the plugin that produced this value;\r
+ * special values can be used (i.e. '<zlib>' for zlib being\r
+ * used in the main libextractor library and yielding\r
+ * meta data).\r
+ * @param type libextractor-type describing the meta data\r
+ * @param format basic format information about data\r
+ * @param data_mime_type mime-type of data (not of the original file);\r
+ * can be NULL (if mime-type is not known)\r
+ * @param data actual meta-data found\r
+ * @param data_len number of bytes in data\r
+ * @return always 0\r
+ */\r
+static int\r
+add_to_meta_counter (void *cls, const char *plugin_name,\r
+    enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format,\r
+    const char *data_mime_type, const char *data, size_t data_len)\r
+{\r
+  struct GNUNET_CONTAINER_MultiHashMap *map = cls;\r
+  GNUNET_HashCode key;\r
+  struct MetaCounter *cnt;\r
+  struct MetaCounter *first_cnt;\r
+\r
+  GNUNET_CRYPTO_hash (data, data_len, &key);\r
+  first_cnt = GNUNET_CONTAINER_multihashmap_get (map, &key);\r
+  /* An entry matches only if BOTH the length and the bytes agree;\r
+   * comparing the length first also keeps memcmp() inside 'data'. */\r
+  for (cnt = first_cnt; NULL != cnt; cnt = cnt->next)\r
+    if ( (cnt->data_size == data_len) &&\r
+         (0 == memcmp (cnt->data, data, data_len)) )\r
+      break;\r
+  if (NULL == cnt)\r
+  {\r
+    cnt = GNUNET_malloc (sizeof (struct MetaCounter));\r
+    cnt->data = data;\r
+    cnt->data_size = data_len;\r
+    cnt->plugin_name = plugin_name;\r
+    cnt->type = type;\r
+    cnt->format = format;\r
+    cnt->data_mime_type = data_mime_type;\r
+    if (NULL == first_cnt)\r
+    {\r
+      GNUNET_CONTAINER_multihashmap_put (map, &key, cnt,\r
+          GNUNET_CONTAINER_MULTIHASHMAPOPTION_MULTIPLE);\r
+    }\r
+    else\r
+    {\r
+      /* Link the new counter directly AFTER the entry stored in the\r
+       * map: the lookup above walks the chain through 'next', so a\r
+       * counter linked in front of 'first_cnt' would never be found. */\r
+      cnt->prev = first_cnt;\r
+      cnt->next = first_cnt->next;\r
+      if (NULL != first_cnt->next)\r
+        first_cnt->next->prev = cnt;\r
+      first_cnt->next = cnt;\r
+    }\r
+  }\r
+  cnt->count++;\r
+  return 0;\r
+}\r
+\r
+/**\r
+ * Allocate a fresh 'struct ShareTreeItem' and, unless 'parent' is NULL,\r
+ * insert it into the parent's list of children.\r
+ *\r
+ * @param parent directory item the new item belongs to, or NULL\r
+ * @return the newly allocated item\r
+ */\r
+static struct ShareTreeItem *\r
+make_item (struct ShareTreeItem *parent)\r
+{\r
+  struct ShareTreeItem *node = GNUNET_malloc (sizeof (struct ShareTreeItem));\r
+\r
+  node->parent = parent;\r
+  if (NULL == parent)\r
+    return node;\r
+  GNUNET_CONTAINER_DLL_insert (parent->children_head, parent->children_tail,\r
+      node);\r
+  return node;\r
+}\r
+\r
+/**\r
+ * Create a share tree item for a regular file below 'adc->parent',\r
+ * record its size and names, and fill in its meta data using the\r
+ * extractor plugins from 'adc'. The file name meta entry produced by\r
+ * extraction is replaced by one carrying the short file name.\r
+ *\r
+ * @param adc context to modify\r
+ * @param filename name of the file to process\r
+ */\r
+static void\r
+extract_file (struct AddDirContext *adc, const char *filename)\r
+{\r
+  const char *base_name = GNUNET_STRINGS_get_short_name (filename);\r
+  struct ShareTreeItem *entry = make_item (adc->parent);\r
+\r
+  /* plain bookkeeping first */\r
+  GNUNET_DISK_file_size (filename, &entry->file_size, GNUNET_YES);\r
+  entry->is_directory = GNUNET_NO;\r
+  entry->filename = GNUNET_strdup (filename);\r
+  entry->short_filename = GNUNET_strdup (base_name);\r
+\r
+  /* then the meta data: extract, drop any extracted file name entries,\r
+   * and add our own entry with the short name */\r
+  entry->meta = GNUNET_CONTAINER_meta_data_create ();\r
+  GNUNET_FS_meta_data_extract_from_file (entry->meta, filename,\r
+      adc->plugins);\r
+  GNUNET_CONTAINER_meta_data_delete (entry->meta,\r
+      EXTRACTOR_METATYPE_FILENAME, NULL, 0);\r
+  GNUNET_CONTAINER_meta_data_insert (entry->meta, "<libgnunetfs>",\r
+      EXTRACTOR_METATYPE_FILENAME,\r
+      EXTRACTOR_METAFORMAT_UTF8, "text/plain",\r
+      base_name, strlen (base_name) + 1);\r
+}\r
+\r
+/**\r
+ * Keyword iterator callback: drop 'keyword' from the ksk URI that\r
+ * is passed as the closure.\r
+ *\r
+ * @param cls the ksk uri ('struct GNUNET_FS_Uri *') to remove from\r
+ * @param keyword the word to remove\r
+ * @param is_mandatory ignored\r
+ * @return always GNUNET_OK\r
+ */\r
+static int\r
+remove_keyword (void *cls, const char *keyword, int is_mandatory)\r
+{\r
+  GNUNET_FS_uri_ksk_remove_keyword ((struct GNUNET_FS_Uri *) cls, keyword);\r
+  return GNUNET_OK;\r
+}\r
+\r
+/**\r
+ * Remove keywords from a directory's children, if they are\r
+ * in the excluded keywords list of the given stack item.\r
+ *\r
+ * @param stack stack item whose 'exclude_ksk' lists the keywords to\r
+ *        remove (nothing is done if it is NULL)\r
+ * @param dir directory whose children are stripped of those keywords\r
+ * @return always GNUNET_OK\r
+ */\r
+static int\r
+remove_keywords (struct ProcessMetadataStackItem *stack, struct ShareTreeItem *dir)\r
+{\r
+  struct ShareTreeItem *child = dir->children_head;\r
+\r
+  if (NULL == stack->exclude_ksk)\r
+    return GNUNET_OK;\r
+  while (NULL != child)\r
+  {\r
+    GNUNET_FS_uri_ksk_get_keywords (stack->exclude_ksk, &remove_keyword,\r
+        child->ksk_uri);\r
+    child = child->next;\r
+  }\r
+  return GNUNET_OK;\r
+}\r
+\r
+/**\r
+ * Context passed to 'migrate_and_drop'.\r
+ */\r
+struct KeywordProcessContext\r
+{\r
+ /**\r
+ * All the keywords we migrated to the parent.\r
+ */\r
+ struct GNUNET_FS_Uri *ksk;\r
+\r
+ /**\r
+ * How often does a keyword have to occur to be\r
+ * migrated to the parent?\r
+ */\r
+ unsigned int threshold;\r
+};\r
+\r
+/**\r
+ * Context passed to 'migrate_and_drop_metadata'.\r
+ */\r
+struct MetaProcessContext\r
+{\r
+ /**\r
+ * All the metadata we copy to the parent.\r
+ */\r
+ struct GNUNET_CONTAINER_MetaData *meta;\r
+\r
+ /**\r
+ * How often does a metadata have to occur to be\r
+ * migrated to the parent?\r
+ */\r
+ unsigned int threshold;\r
+};\r
+\r
+\r
+/**\r
+ * Move "frequent" keywords over to the\r
+ * target ksk uri, free the counters.\r
+ * A keyword is migrated if it was counted more than once and at\r
+ * least 'threshold' times.\r
+ *\r
+ * @param cls the 'struct KeywordProcessContext *'\r
+ * @param key hash of the keyword (unused)\r
+ * @param value the 'struct KeywordCounter *' stored in the map; any\r
+ *        counters chained to it via 'prev'/'next' are handled as well\r
+ * @return always GNUNET_YES (continue iterating)\r
+ */\r
+static int\r
+migrate_and_drop (void *cls, const GNUNET_HashCode * key, void *value)\r
+{\r
+  struct KeywordProcessContext *kpc = cls;\r
+  struct KeywordCounter *counter = value;\r
+  struct KeywordCounter *next;\r
+\r
+  /* Only one counter per hash is stored in the map; others sharing the\r
+   * hash hang off it. Rewind to the head of that chain so that every\r
+   * counter is evaluated and freed, not just the stored one. */\r
+  while (NULL != counter->prev)\r
+    counter = counter->prev;\r
+  for (; NULL != counter; counter = next)\r
+  {\r
+    next = counter->next;\r
+    if ( (counter->count >= kpc->threshold) && (counter->count > 1) )\r
+      GNUNET_FS_uri_ksk_add_keyword (kpc->ksk, counter->value, GNUNET_NO);\r
+    GNUNET_free (counter);\r
+  }\r
+  return GNUNET_YES;\r
+}\r
+/**\r
+ * Copy "frequent" metadata items over to the\r
+ * target metadata container, free the counters.\r
+ * An item is copied if it was counted more than once and at\r
+ * least 'threshold' times.\r
+ *\r
+ * @param cls the 'struct MetaProcessContext *'\r
+ * @param key hash of the metadata (unused)\r
+ * @param value the 'struct MetaCounter *' stored in the map; any\r
+ *        counters chained to it via 'prev'/'next' are handled as well\r
+ * @return always GNUNET_YES (continue iterating)\r
+ */\r
+static int\r
+migrate_and_drop_metadata (void *cls, const GNUNET_HashCode * key, void *value)\r
+{\r
+  struct MetaProcessContext *mpc = cls;\r
+  struct MetaCounter *counter = value;\r
+  struct MetaCounter *next;\r
+\r
+  /* Only one counter per hash is stored in the map; others sharing the\r
+   * hash hang off it. Rewind to the head of that chain so that every\r
+   * counter is evaluated and freed, not just the stored one. */\r
+  while (NULL != counter->prev)\r
+    counter = counter->prev;\r
+  for (; NULL != counter; counter = next)\r
+  {\r
+    next = counter->next;\r
+    if ( (counter->count >= mpc->threshold) && (counter->count > 1) )\r
+      GNUNET_CONTAINER_meta_data_insert (mpc->meta,\r
+          counter->plugin_name,\r
+          counter->type,\r
+          counter->format,\r
+          counter->data_mime_type, counter->data,\r
+          counter->data_size);\r
+    GNUNET_free (counter);\r
+  }\r
+  return GNUNET_YES;\r
+}\r
+\r
+/**\r
+ * Evaluate the keyword and meta data counters collected for one\r
+ * directory: everything that occurs in at least half of the entries\r
+ * (and more than once) is gathered into a fresh ksk URI and a fresh\r
+ * meta data container, which are handed back to the caller.\r
+ * Destroys 'keywordcounter' and 'metacounter' of 'stack'.\r
+ *\r
+ * @param stack stack item holding the counters and the entry count\r
+ * @param exclude_ksk pointer to where moveable keywords will be stored\r
+ * @param copy_meta pointer to where copyable metadata will be stored\r
+ */\r
+static void\r
+process_keywords_and_metadata (struct ProcessMetadataStackItem *stack,\r
+    struct GNUNET_FS_Uri **exclude_ksk,\r
+    struct GNUNET_CONTAINER_MetaData **copy_meta)\r
+{\r
+  struct GNUNET_CONTAINER_MetaData *empty_meta;\r
+  struct KeywordProcessContext kw_ctx;\r
+  struct MetaProcessContext md_ctx;\r
+  unsigned int majority;\r
+\r
+  majority = (stack->dir_entry_count + 1) / 2; /* 50% */\r
+\r
+  /* A ksk URI with zero keywords cannot be created directly, so\r
+   * build it from an empty meta data set instead. */\r
+  empty_meta = GNUNET_CONTAINER_meta_data_create ();\r
+  kw_ctx.ksk = GNUNET_FS_uri_ksk_create_from_meta_data (empty_meta);\r
+  kw_ctx.threshold = majority;\r
+  GNUNET_CONTAINER_meta_data_destroy (empty_meta);\r
+\r
+  md_ctx.meta = GNUNET_CONTAINER_meta_data_create ();\r
+  md_ctx.threshold = majority;\r
+\r
+  /* keywords: migrate frequent ones, then drop the map */\r
+  GNUNET_CONTAINER_multihashmap_iterate (stack->keywordcounter,\r
+      &migrate_and_drop, &kw_ctx);\r
+  GNUNET_CONTAINER_multihashmap_destroy (stack->keywordcounter);\r
+\r
+  /* meta data: copy frequent items, then drop the map */\r
+  GNUNET_CONTAINER_multihashmap_iterate (stack->metacounter,\r
+      &migrate_and_drop_metadata, &md_ctx);\r
+  GNUNET_CONTAINER_multihashmap_destroy (stack->metacounter);\r
+\r
+  *exclude_ksk = kw_ctx.ksk;\r
+  *copy_meta = md_ctx.meta;\r
+}\r
+\r
+/**\r
+ * Function called by the directory iterator to\r
+ * (recursively) add all of the files in the\r
+ * directory to the tree.\r
+ * Called by the directory scanner to initiate the\r
+ * scan.\r
+ * TODO: find a way to make it non-recursive.\r
+ *\r
+ * @param cls the 'struct AddDirContext*' we're in\r
+ * @param filename file or directory to scan\r
+ * @return GNUNET_OK to continue with the next entry, GNUNET_SYSERR\r
+ *         if the scan should stop\r
+ */\r
+static int\r
+scan_directory (void *cls, const char *filename)\r
+{\r
+  struct AddDirContext *adc = cls;\r
+  struct AddDirContext recurse_adc;\r
+  struct stat sbuf;\r
+  struct ShareTreeItem *item;\r
+  const char *short_fn;\r
+  char is_dir;\r
+\r
+  /* Wrap up fast */\r
+  if (adc->do_stop)\r
+    return GNUNET_SYSERR;\r
+\r
+  /* If the file doesn't exist (or is not statable for any other reason),\r
+   * skip it, and report it. 'sbuf' holds no valid data in this case, so\r
+   * the directory flag is reported as 0 instead of being derived from it.\r
+   */\r
+  if (0 != STAT (filename, &sbuf))\r
+  {\r
+    (void) write_progress (adc, filename, 0,\r
+        GNUNET_DIR_SCANNER_DOES_NOT_EXIST);\r
+    return GNUNET_OK;\r
+  }\r
+  is_dir = S_ISDIR (sbuf.st_mode) ? 1 : 0;\r
+\r
+  /* Report the progress */\r
+  if (write_progress (adc, filename, is_dir, GNUNET_DIR_SCANNER_NEW_FILE))\r
+  {\r
+    /* We were asked to stop, acknowledge that and return */\r
+    (void) write_progress (adc, filename, is_dir,\r
+        GNUNET_DIR_SCANNER_ASKED_TO_STOP);\r
+    return GNUNET_SYSERR;\r
+  }\r
+\r
+  if (!is_dir)\r
+  {\r
+    extract_file (adc, filename);\r
+    return GNUNET_OK;\r
+  }\r
+\r
+  item = make_item (adc->parent);\r
+  item->meta = GNUNET_CONTAINER_meta_data_create ();\r
+  item->is_directory = GNUNET_YES;\r
+\r
+  /* copy fields from adc */\r
+  recurse_adc = *adc;\r
+  /* replace recurse_adc contents with the ones for this directory */\r
+  recurse_adc.parent = item;\r
+\r
+  /* recurse into directory */\r
+  GNUNET_DISK_directory_scan (filename, &scan_directory, &recurse_adc);\r
+\r
+  short_fn = GNUNET_STRINGS_get_short_name (filename);\r
+  item->filename = GNUNET_strdup (filename);\r
+  item->short_filename = GNUNET_strdup (short_fn);\r
+\r
+  if (NULL == adc->parent)\r
+  {\r
+    /* we're finished with the scan, make sure caller gets the top-level\r
+     * directory pointer\r
+     */\r
+    adc->parent = item;\r
+  }\r
+  return GNUNET_OK;\r
+}\r
+\r
+/**\r
+ * Ask the scanner to finish the scan as fast as possible by\r
+ * signalling its stop primitive. Does not block.\r
+ * On request, also cancels the pending progress-read task and closes\r
+ * the read end of the progress pipe; that is only used by the\r
+ * internal call to this function during cleanup. The client\r
+ * must understand the consequences of closing the pipe too early.\r
+ *\r
+ * @param ds directory scanner structure\r
+ * @param close_pipe GNUNET_YES to close\r
+ */\r
+void\r
+GNUNET_FS_directory_scan_finish (struct GNUNET_FS_DirScanner *ds,\r
+    int close_pipe)\r
+{\r
+#if WINDOWS\r
+  SetEvent (ds->stop);\r
+#else\r
+  sem_post (&ds->stop);\r
+#endif\r
+  if (!close_pipe)\r
+    return;\r
+\r
+  if (GNUNET_SCHEDULER_NO_TASK != ds->progress_read_task)\r
+  {\r
+    GNUNET_SCHEDULER_cancel (ds->progress_read_task);\r
+    ds->progress_read_task = GNUNET_SCHEDULER_NO_TASK;\r
+  }\r
+  GNUNET_DISK_pipe_close_end (ds->progress_pipe, GNUNET_DISK_PIPE_END_READ);\r
+  ds->progress_read = NULL;\r
+}\r
+\r
+/**\r
+ * Signals the scanner thread to finish (in case it isn't finishing\r
+ * already) and joins the scanner thread. Closes the pipes, frees the\r
+ * scanner contexts (both of them), returns the results of the scan.\r
+ * Results are valid (and have to be freed) even if the scanner had\r
+ * an error or was rushed to finish prematurely.\r
+ * Blocks until the scanner is finished.\r
+ * 'ds' must not be used after this call.\r
+ *\r
+ * @param ds directory scanner structure\r
+ * @return the results of the scan (a directory tree)\r
+ */\r
+struct ShareTreeItem *\r
+GNUNET_FS_directory_scan_cleanup (struct GNUNET_FS_DirScanner *ds)\r
+{\r
+  struct ShareTreeItem *result;\r
+\r
+  GNUNET_FS_directory_scan_finish (ds, GNUNET_YES);\r
+#if WINDOWS\r
+  WaitForSingleObject (ds->thread, INFINITE);\r
+  CloseHandle (ds->stop);\r
+  CloseHandle (ds->thread);\r
+#else\r
+  /* Joining already releases the thread's resources; the thread must\r
+   * not be detached afterwards, since its ID is no longer valid. */\r
+  pthread_join (ds->thread, NULL);\r
+  sem_destroy (&ds->stop);\r
+#endif\r
+\r
+  GNUNET_DISK_pipe_close (ds->progress_pipe);\r
+  result = ds->adc->parent;\r
+  GNUNET_free (ds->adc);\r
+  GNUNET_free (ds);\r
+  return result;\r
+}\r
+\r
+/**\r
+ * The function from which the scanner thread starts.\r
+ * Runs the scan, releases the resources owned by the scanner thread\r
+ * (expanded filename, extractor plugins) and finally sends the\r
+ * GNUNET_DIR_SCANNER_FINISHED message.\r
+ * The signature matches what the platform's thread-creation function\r
+ * expects for a start routine.\r
+ *\r
+ * @param cls the 'struct AddDirContext *' of the scan\r
+ * @return always 0 / NULL\r
+ */\r
+#if WINDOWS\r
+static DWORD WINAPI\r
+#else\r
+static void *\r
+#endif\r
+run_directory_scan_thread (void *cls)\r
+{\r
+  struct AddDirContext *adc = cls;\r
+\r
+  (void) scan_directory (adc, adc->filename_expanded);\r
+  GNUNET_free (adc->filename_expanded);\r
+  adc->filename_expanded = NULL;\r
+  if (NULL != adc->plugins)\r
+  {\r
+    EXTRACTOR_plugin_remove_all (adc->plugins);\r
+    adc->plugins = NULL;\r
+  }\r
+  /* Tell the initiator that we're finished, it can now join the thread */\r
+  (void) write_progress (adc, NULL, 0, GNUNET_DIR_SCANNER_FINISHED);\r
+#if WINDOWS\r
+  return 0;\r
+#else\r
+  return NULL;\r
+#endif\r
+}\r
+\r
+/**\r
+ * Called every time there is data to read from the scanner.\r
+ * Calls the scanner progress handler.\r
+ *\r
+ * @param cls the closure (directory scanner object)\r
+ * @param tc task context in which the task is running\r
+ */\r
+static void\r
+read_progress_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)\r
+{\r
+ struct GNUNET_FS_DirScanner *ds;\r
+ int end_it = 0;\r
+ enum GNUNET_DirScannerProgressUpdateReason reason;\r
+ ssize_t rd;\r
+ ssize_t total_read;\r
+\r
+ size_t filename_len;\r
+ char is_directory;\r
+ char *filename;\r
+\r
+ ds = cls;\r
+\r
+ ds->progress_read_task = GNUNET_SCHEDULER_NO_TASK;\r
+\r
+ if (!(tc->reason & GNUNET_SCHEDULER_REASON_READ_READY))\r
+ {\r
+ ds->progress_callback (ds->cls, ds, NULL, 0, GNUNET_DIR_SCANNER_SHUTDOWN);\r
+ return;\r
+ }\r
+\r
+ /* Read one message. If message is malformed or can't be read, end the scanner */\r
+ total_read = rd = GNUNET_DISK_file_read (ds->progress_read, &reason, sizeof (reason));\r
+ while (rd > 0 && total_read < sizeof (reason))\r
+ {\r
+ rd = GNUNET_DISK_file_read (ds->progress_read,\r
+ &((char *) &reason)[total_read],\r
+ sizeof (reason) - total_read);\r
+ if (rd > 0)\r
+ total_read += rd;\r
+ }\r
+ if (total_read != sizeof (reason)\r
+ || reason <= GNUNET_DIR_SCANNER_FIRST\r
+ || reason >= GNUNET_DIR_SCANNER_LAST)\r
+ {\r
+ end_it = 1;\r
+ reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;\r
+ }\r
+\r
+ if (!end_it)\r
+ {\r
+ total_read = rd = GNUNET_DISK_file_read (ds->progress_read, &filename_len,\r
+ sizeof (size_t));\r
+ while (rd > 0 && total_read < sizeof (size_t))\r
+ {\r
+ rd = GNUNET_DISK_file_read (ds->progress_read,\r
+ &((char *) &filename_len)[total_read],\r
+ sizeof (size_t) - total_read);\r
+ if (rd > 0)\r
+ total_read += rd;\r
+ }\r
+ if (rd != sizeof (size_t))\r
+ {\r
+ end_it = 1;\r
+ reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;\r
+ }\r
+ }\r
+ if (!end_it)\r
+ {\r
+ if (filename_len == 0)\r
+ end_it = 1;\r
+ else if (filename_len > MAX_PATH)\r
+ {\r
+ end_it = 1;\r
+ reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;\r
+ }\r
+ }\r
+ if (!end_it)\r
+ {\r
+ filename = GNUNET_malloc (filename_len);\r
+ total_read = rd = GNUNET_DISK_file_read (ds->progress_read, filename,\r
+ filename_len);\r
+ while (rd > 0 && total_read < filename_len)\r
+ {\r
+ rd = GNUNET_DISK_file_read (ds->progress_read, &filename[total_read],\r
+ filename_len - total_read);\r
+ if (rd > 0)\r
+ total_read += rd;\r
+ }\r
+ if (rd != filename_len)\r
+ {\r
+ GNUNET_free (filename);\r
+ reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;\r
+ end_it = 1;\r
+ }\r
+ }\r
+ if (!end_it && filename_len > 0)\r
+ {\r
+ total_read = rd = GNUNET_DISK_file_read (ds->progress_read, &is_directory,\r
+ sizeof (char));\r
+ while (rd > 0 && total_read < sizeof (char))\r
+ {\r
+ rd = GNUNET_DISK_file_read (ds->progress_read, &(&is_directory)[total_read],\r
+ sizeof (char) - total_read);\r
+ if (rd > 0)\r
+ total_read += rd;\r
+ }\r
+ if (rd != sizeof (char))\r
+ {\r
+ GNUNET_free (filename);\r
+ reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;\r
+ end_it = 1;\r
+ }\r
+ }\r
+ if (!end_it)\r
+ {\r
+ end_it = ds->progress_callback (ds->cls, ds, (const char *) filename, is_directory, reason);\r
+ GNUNET_free (filename);\r
+ if (!end_it)\r
+ {\r
+ ds->progress_read_task = GNUNET_SCHEDULER_add_read_file (\r
+ GNUNET_TIME_UNIT_FOREVER_REL, ds->progress_read, &read_progress_task,\r
+ cls);\r
+ }\r
+ }\r
+ else\r
+ {\r
+ ds->progress_callback (ds->cls, ds, NULL, 0, reason);\r
+ }\r
+}\r
+\r
+\r
+/**\r
+ * Start a directory scanner thread.\r
+ *\r
+ * @param filename name of the directory to scan\r
+ * @param GNUNET_YES to not to run libextractor on files (only build a tree)\r
+ * @param ex if not NULL, must be a list of extra plugins for extractor\r
+ * @param cb the callback to call when there are scanning progress messages\r
+ * @param cls closure for 'cb'\r
+ * @return directory scanner object to be used for controlling the scanner\r
+ */\r
+struct GNUNET_FS_DirScanner *\r
+GNUNET_FS_directory_scan_start (const char *filename,\r
+ int disable_extractor, const char *ex,\r
+ GNUNET_FS_DirScannerProgressCallback cb, void *cls)\r
+{\r
+ struct stat sbuf;\r
+ struct AddDirContext *adc;\r
+ char *filename_expanded;\r
+ struct GNUNET_FS_DirScanner *ds;\r
+ struct GNUNET_DISK_PipeHandle *progress_pipe;\r
+ int ok;\r
+\r
+ if (0 != STAT (filename, &sbuf))\r
+ return NULL;\r
+ /* TODO: consider generalizing this for files too! */\r
+ if (!S_ISDIR (sbuf.st_mode))\r
+ {\r
+ GNUNET_break (0);\r
+ return NULL;\r
+ }\r
+ /* scan_directory() is guaranteed to be given expanded filenames,\r
+ * so expand we will!\r
+ */\r
+ filename_expanded = GNUNET_STRINGS_filename_expand (filename);\r
+ if (filename_expanded == NULL)\r
+ return NULL;\r
+\r
+ progress_pipe = GNUNET_DISK_pipe (GNUNET_NO, GNUNET_NO, GNUNET_NO);\r
+ if (progress_pipe == NULL)\r
+ {\r
+ GNUNET_free (filename_expanded);\r
+ return NULL;\r
+ }\r
+\r
+ adc = GNUNET_malloc (sizeof (struct AddDirContext));\r
+\r
+ ds = GNUNET_malloc (sizeof (struct GNUNET_FS_DirScanner));\r
+\r
+ ds->adc = adc;\r
+\r
+#if WINDOWS\r
+ ds->stop = CreateEvent (NULL, TRUE, FALSE, NULL);\r
+ adc->stop = ds->stop;\r
+ ok = ds->stop != INVALID_HANDLE_VALUE;\r
+#else\r
+ ok = !sem_init (&ds->stop, 0, 0);\r
+ adc = &ds->stop;\r
+#endif\r
+ if (!ok)\r
+ {\r
+ GNUNET_free (adc);\r
+ GNUNET_free (ds);\r
+ GNUNET_free (filename_expanded);\r
+ GNUNET_DISK_pipe_close (progress_pipe);\r
+ return NULL;\r
+ }\r
+\r
+ adc->plugins = NULL;\r
+ if (!disable_extractor)\r
+ {\r
+ adc->plugins = EXTRACTOR_plugin_add_defaults (\r
+ EXTRACTOR_OPTION_DEFAULT_POLICY);\r
+ if (ex && strlen (ex) > 0)\r
+ adc->plugins = EXTRACTOR_plugin_add_config (adc->plugins, ex,\r
+ EXTRACTOR_OPTION_DEFAULT_POLICY);\r
+ }\r
+\r
+ adc->filename_expanded = filename_expanded;\r
+ adc->progress_write = GNUNET_DISK_pipe_handle (progress_pipe,\r
+ GNUNET_DISK_PIPE_END_WRITE);\r
+\r
+\r
+ ds->progress_read = GNUNET_DISK_pipe_handle (progress_pipe,\r
+ GNUNET_DISK_PIPE_END_READ);\r
+\r
+#if WINDOWS\r
+ ds->thread = CreateThread (NULL, 0,\r
+ (LPTHREAD_START_ROUTINE) &run_directory_scan_thread, (LPVOID) adc,\r
+ 0, NULL);\r
+ ok = ds->thread != NULL;\r
+#else\r
+ ok = !pthread_create (&ds->thread, NULL, &run_directory_scan_thread,\r
+ (void *) adc);\r
+#endif\r
+ if (!ok)\r
+ {\r
+ GNUNET_free (adc);\r
+ GNUNET_free (filename_expanded);\r
+ GNUNET_DISK_pipe_close (progress_pipe);\r
+ GNUNET_free (ds);\r
+ return NULL;\r
+ }\r
+\r
+ ds->progress_callback = cb;\r
+ ds->cls = cls;\r
+ ds->adc = adc;\r
+ ds->progress_pipe = progress_pipe;\r
+\r
+ ds->progress_read_task = GNUNET_SCHEDULER_add_read_file (\r
+ GNUNET_TIME_UNIT_FOREVER_REL, ds->progress_read, &read_progress_task,\r
+ ds);\r
+\r
+ return ds;\r
+}\r
+\r
+/**\r
+ * Task that post-processes the share item tree.\r
+ * This processing has to be done in the main thread, because\r
+ * it requires access to libgcrypt's hashing functions, and\r
+ * libgcrypt is not thread-safe without some special magic.\r
+ *\r
+ * Every invocation performs one step of a depth-first walk and then\r
+ * re-schedules itself with the stack item to continue from:\r
+ *  - descend into a directory whose children were not visited yet,\r
+ *  - account for one entry (a file, or a directory whose children are\r
+ *    done) and advance to its sibling,\r
+ *  - or, once the siblings are exhausted, finish the enclosing\r
+ *    directory and pop one level.\r
+ * When the toplevel entry is done, the callback stored in the context\r
+ * is scheduled and the context and the remaining stack items are freed.\r
+ *\r
+ * @param cls top of the stack (struct ProcessMetadataStackItem *)\r
+ * @param tc task context (not examined yet, see FIXME below)\r
+ */\r
+static void\r
+trim_share_tree_task (void *cls,\r
+                      const struct GNUNET_SCHEDULER_TaskContext *tc)\r
+{\r
+  struct ProcessMetadataStackItem *stack = cls;\r
+  struct ProcessMetadataStackItem *next = stack;\r
+  /* FIXME: figure out what to do when tc says we're shutting down */\r
+\r
+  /* item == NULL means that we've just finished going over the children of\r
+   * current directory.\r
+   */\r
+  if (stack->item == NULL)\r
+  {\r
+    if (stack->parent->item != NULL)\r
+    {\r
+      /* end of a directory */\r
+      struct GNUNET_FS_Uri *ksk;\r
+\r
+      /* use keyword and metadata counters to create lists of keywords to move\r
+       * and metadata to copy.\r
+       */\r
+      process_keywords_and_metadata (stack, &stack->parent->exclude_ksk,\r
+                                     &stack->parent->item->meta);\r
+\r
+      /* create keywords from the directory's metadata and merge them with\r
+       * the ones moved from the children.  Keywords derived from the\r
+       * directory name are added later, when the directory itself is\r
+       * processed as an entry of its parent.\r
+       */\r
+      ksk = GNUNET_FS_uri_ksk_create_from_meta_data (stack->parent->item->meta);\r
+      stack->parent->item->ksk_uri =\r
+          GNUNET_FS_uri_ksk_merge (ksk, stack->parent->exclude_ksk);\r
+      GNUNET_FS_uri_destroy (ksk);\r
+\r
+      /* remove moved keywords from children (complete the move) */\r
+      remove_keywords (stack->parent, stack->parent->item);\r
+      GNUNET_FS_uri_destroy (stack->parent->exclude_ksk);\r
+      /* the parent stack item is reused for the following siblings and its\r
+       * address is handed to process_keywords_and_metadata again, so do not\r
+       * leave a pointer to the URI that was just destroyed.\r
+       */\r
+      stack->parent->exclude_ksk = NULL;\r
+\r
+      /* go up the stack */\r
+      next = stack->parent;\r
+      GNUNET_free (stack);\r
+      next->end_directory = GNUNET_YES;\r
+    }\r
+    else\r
+    {\r
+      /* we've just finished processing the toplevel directory */\r
+      struct ProcessMetadataContext *ctx = stack->ctx;\r
+\r
+      next = NULL;\r
+      GNUNET_SCHEDULER_add_continuation (ctx->cb, ctx->cls,\r
+                                         GNUNET_SCHEDULER_REASON_PREREQ_DONE);\r
+      /* NOTE(review): the keywordcounter/metacounter maps of the toplevel\r
+       * stack item are not released on this path - confirm whether they\r
+       * (and the entries stored in them) are leaked.\r
+       */\r
+      GNUNET_free (stack->parent);\r
+      GNUNET_free (stack);\r
+      GNUNET_free (ctx);\r
+    }\r
+  }\r
+  else if (stack->item->is_directory\r
+           && !stack->end_directory\r
+           && stack->item->children_head != NULL)\r
+  {\r
+    /* recurse into subdirectory */\r
+    next = GNUNET_malloc (sizeof (struct ProcessMetadataStackItem));\r
+    next->ctx = stack->ctx;\r
+    next->item = stack->item->children_head;\r
+    next->keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024);\r
+    next->metacounter = GNUNET_CONTAINER_multihashmap_create (1024);\r
+    next->dir_entry_count = 0;\r
+    next->end_directory = GNUNET_NO;\r
+    next->parent = stack;\r
+  }\r
+  else\r
+  {\r
+    /* process a child entry (a file or a directory) and move to the next one */\r
+    struct GNUNET_FS_Uri *ksk;\r
+    int children_done;\r
+\r
+    /* end_directory is only set right after the children of the current\r
+     * item were finished; in that case item->ksk_uri already holds the\r
+     * keywords that were moved up from those children.\r
+     */\r
+    children_done = (stack->item->is_directory && stack->end_directory);\r
+    if (stack->item->is_directory)\r
+      stack->end_directory = GNUNET_NO;\r
+    stack->dir_entry_count++;\r
+    GNUNET_CONTAINER_meta_data_iterate (stack->item->meta,\r
+                                        &add_to_meta_counter,\r
+                                        stack->metacounter);\r
+\r
+    if (stack->item->is_directory)\r
+    {\r
+      const char *user = getenv ("USER");\r
+\r
+      /* an empty $USER would compare equal to every name (zero characters\r
+       * are compared), so treat it like an unset variable.\r
+       */\r
+      if ((user == NULL) || ('\0' == user[0]) ||\r
+          (0 != strncasecmp (user, stack->item->short_filename, strlen (user))))\r
+      {\r
+        /* only use filename if it doesn't match $USER */\r
+        GNUNET_CONTAINER_meta_data_insert (stack->item->meta, "<libgnunetfs>",\r
+                                           EXTRACTOR_METATYPE_FILENAME,\r
+                                           EXTRACTOR_METAFORMAT_UTF8,\r
+                                           "text/plain",\r
+                                           stack->item->short_filename,\r
+                                           strlen (stack->item->short_filename) + 1);\r
+        GNUNET_CONTAINER_meta_data_insert (stack->item->meta, "<libgnunetfs>",\r
+                                           EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME,\r
+                                           EXTRACTOR_METAFORMAT_UTF8,\r
+                                           "text/plain",\r
+                                           stack->item->short_filename,\r
+                                           strlen (stack->item->short_filename) + 1);\r
+      }\r
+    }\r
+\r
+    ksk = GNUNET_FS_uri_ksk_create_from_meta_data (stack->item->meta);\r
+    if (children_done && (stack->item->ksk_uri != NULL))\r
+    {\r
+      /* keep the keywords moved up from the children instead of\r
+       * overwriting (and leaking) the URI built at the end of the\r
+       * directory; add the keywords of the updated metadata to it.\r
+       */\r
+      struct GNUNET_FS_Uri *merged;\r
+\r
+      merged = GNUNET_FS_uri_ksk_merge (ksk, stack->item->ksk_uri);\r
+      if (ksk != NULL)\r
+        GNUNET_FS_uri_destroy (ksk);\r
+      GNUNET_FS_uri_destroy (stack->item->ksk_uri);\r
+      stack->item->ksk_uri = merged;\r
+    }\r
+    else\r
+    {\r
+      stack->item->ksk_uri = ksk;\r
+    }\r
+    GNUNET_FS_uri_ksk_get_keywords (stack->item->ksk_uri,\r
+                                    &add_to_keyword_counter,\r
+                                    stack->keywordcounter);\r
+    stack->item = stack->item->next;\r
+  }\r
+  /* Call this task again later, if there are more entries to process */\r
+  if (next)\r
+    GNUNET_SCHEDULER_add_continuation (&trim_share_tree_task, next,\r
+                                       GNUNET_SCHEDULER_REASON_PREREQ_DONE);\r
+}\r
+\r
+/**\r
+ * Process a share item tree, moving frequent keywords up and\r
+ * copying frequent metadata up.\r
+ *\r
+ * The tree is not processed by this call itself: a context and the\r
+ * initial stack are set up and trim_share_tree_task is scheduled to do\r
+ * the work step by step.\r
+ *\r
+ * @param toplevel toplevel directory in the tree, returned by the scanner\r
+ * @param cb called after processing is done\r
+ * @param cls closure for 'cb'\r
+ * @return context of the started operation; NULL if 'toplevel' is NULL,\r
+ *         in which case 'cb' has already been called directly.  The\r
+ *         context is freed by trim_share_tree_task once the toplevel\r
+ *         entry is finished, so it must not be used after that point.\r
+ */\r
+struct ProcessMetadataContext *\r
+GNUNET_FS_trim_share_tree (struct ShareTreeItem *toplevel,\r
+                           GNUNET_SCHEDULER_Task cb, void *cls)\r
+{\r
+  struct ProcessMetadataContext *ret;\r
+\r
+  if (toplevel == NULL)\r
+  {\r
+    /* nothing to process: report completion right away.  Zero the whole\r
+     * task context so that the callback never sees uninitialized fields.\r
+     */\r
+    struct GNUNET_SCHEDULER_TaskContext tc = { 0 };\r
+\r
+    tc.reason = GNUNET_SCHEDULER_REASON_PREREQ_DONE;\r
+    cb (cls, &tc);\r
+    return NULL;\r
+  }\r
+\r
+  ret = GNUNET_malloc (sizeof (struct ProcessMetadataContext));\r
+  ret->toplevel = toplevel;\r
+  ret->cb = cb;\r
+  ret->cls = cls;\r
+\r
+  /* first real stack entry, positioned on the toplevel item */\r
+  ret->stack = GNUNET_malloc (sizeof (struct ProcessMetadataStackItem));\r
+  ret->stack->ctx = ret;\r
+  ret->stack->item = toplevel;\r
+  ret->stack->keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024);\r
+  ret->stack->metacounter = GNUNET_CONTAINER_multihashmap_create (1024);\r
+  ret->stack->dir_entry_count = 0;\r
+  ret->stack->end_directory = GNUNET_NO;\r
+\r
+  /* dummy stack entry that tells us we're at the top of the stack:\r
+   * the task recognizes it by its 'item' being NULL.\r
+   */\r
+  ret->stack->parent = GNUNET_malloc (sizeof (struct ProcessMetadataStackItem));\r
+  ret->stack->parent->ctx = ret;\r
+  ret->stack->parent->item = NULL;\r
+\r
+  GNUNET_SCHEDULER_add_continuation (&trim_share_tree_task, ret->stack,\r
+                                     GNUNET_SCHEDULER_REASON_PREREQ_DONE);\r
+  return ret;\r
+}
\ No newline at end of file