adding indexing support

author Christian Grothoff <christian@grothoff.org>

Sun, 30 Aug 2009 21:07:10 +0000 (21:07 +0000)

committer Christian Grothoff <christian@grothoff.org>

Sun, 30 Aug 2009 21:07:10 +0000 (21:07 +0000)
author Christian Grothoff <christian@grothoff.org>
Sun, 30 Aug 2009 21:07:10 +0000 (21:07 +0000)
committer Christian Grothoff <christian@grothoff.org>
Sun, 30 Aug 2009 21:07:10 +0000 (21:07 +0000)
diff --git a/TODO b/TODO

index 391fd483f15ed097048c738773905f5b281da004..073c5e137fe656a9a7a3f3d2050cf37b25e8ef9b 100644 (file)
--- a/TODO
+++ b/TODO
@@ -37,11 +37,10 @@ PHASE #2: (Goal: recover basic file-sharing functionality)
    - implement testcases
  * FS (anonymous FS only)
    - design network structs (CS)
-    + list-indexed, index, unindex
+    + list-indexed, unindex
      + search/download, response
    - implement basic FS library
      + sharing API
-      ~ publish (indexing)
        ~ unindex & list indexed!!! (need publish to be done)
        ~ search (need publish to be done)
        ~ download (need publish/search to be done)
@@ -68,6 +67,7 @@ PHASE #2: (Goal: recover basic file-sharing functionality)
      + location URIs (publish, search, download)
      + persistence support (publish, unindex, search, download)
      + datastore reservation (publishing)
+    + indexing: index-failure-cleanup
    - implement adv. testcases 
      + insert: sblocks, loc uris
      + download: loc uris
diff --git a/src/fs/fs.h b/src/fs/fs.h

index 288903418fe7de8ebb27cb65b2fb04ca163e5ee7..e4eee7fd0f2c43cd9f1aa5c63801aa98e9c34227 100644 (file)
--- a/src/fs/fs.h
+++ b/src/fs/fs.h
@@ -288,6 +288,21 @@ struct GNUNET_FS_FileInformation
         */
        void *reader_cls;
  
+      /**
+       * Name of the file (must be an absolute path).
+       * Only required for indexing.  FIXME: not yet
+       * initialized!
+       */
+      char *filename;
+
+      /**
+       * If this file is being indexed, this value
+       * is set to the hash over the entire file
+       * (when the indexing process is started). 
+       * Otherwise this field is not used.
+       */
+      GNUNET_HashCode file_id;
+
        /**
         * Size of the file (in bytes).
         */
@@ -429,6 +444,13 @@ struct GNUNET_FS_PublishContext
     */
    GNUNET_SCHEDULER_TaskIdentifier upload_task;
  
+  /**
+   * Our own client handle for the FS service;
+   * only briefly used when we start to index a
+   * file, otherwise NULL.
+   */
+  struct GNUNET_CLIENT_Connection *client;
+
    /**
     * Typically GNUNET_NO.  Set to GNUNET_YES if
     * "upload_task" is GNUNET_SCHEDULER_NO_TASK
@@ -506,6 +528,29 @@ struct GNUNET_FS_Namespace
  };
  
  
+/**
+ * @brief index block (indexing a DBlock that
+ *        can be obtained directly from reading
+ *        the plaintext file)
+ *
+ * When a file is indexed, publish_content() stores
+ * one of these records in the datastore (with block type
+ * GNUNET_DATASTORE_BLOCKTYPE_ONDEMAND, under the query
+ * hash of the block) in place of the encoded block
+ * itself.  The record only says where the plaintext
+ * for the block can be found.
+ */
+struct OnDemandBlock
+{
+  /**
+   * Hash code of the entire content of the
+   * file that was indexed (used to uniquely
+   * identify the plaintext file).  This is the
+   * value computed when the indexing process is
+   * started (copied from the "file_id" field of
+   * the file information).
+   */
+  GNUNET_HashCode file_id;
+
+  /**
+   * At which offset should we be able to find
+   * this on-demand encoded block?  Set from the
+   * current publish offset of the file.
+   * NOTE(review): publish_content() stores the value
+   * without any byte-order conversion; confirm whether
+   * the on-disk format is meant to be host byte order.
+   */
+  uint64_t offset;
+
+};
+
+
  /**
   * @brief keyword block (advertising data under a keyword)
   */
@@ -571,9 +616,58 @@ struct SBlock
  };
  
  
+/**
+ * Message sent from a GNUnet (fs) publishing
+ * activity to the gnunet-fs-service to
+ * initiate indexing of a file.  The service
+ * is supposed to check if the specified file
+ * is available and has the same cryptographic
+ * hash.  It should then respond with either
+ * a confirmation or a denial.
+ *
+ * On OSes where this works, it is considered
+ * acceptable if the service only checks that
+ * the path, device and inode match (it can
+ * then be assumed that the hash will also match
+ * without actually computing it; this is an
+ * optimization that should be safe given that
+ * the client is not our adversary).
+ *
+ * On the client side the message is built and sent
+ * by hash_for_index_cb(); the reply is handled by
+ * process_index_start_response(), which treats any
+ * reply other than GNUNET_MESSAGE_TYPE_FS_INDEX_START_OK
+ * as a refusal.
+ */
  struct IndexStartMessage
  {
  
+  /**
+   * Message type will be
+   * GNUNET_MESSAGE_TYPE_FS_INDEX_START.
+   * The size covers this struct plus the
+   * 0-terminated filename that follows it.
+   */
+  struct GNUNET_MessageHeader header;
+
+  /**
+   * ID of device containing the file, as seen by the client.  This
+   * device ID is obtained using a call like "statvfs" (and converting
+   * the "f_fsid" field to a 32-bit big-endian number).  Use 0 if the
+   * OS does not support this, in which case the service must do a
+   * full hash recomputation.
+   */
+  uint32_t device;
+  
+  /**
+   * Inode of the file on the given device, as seen by the client
+   * ("st_ino" field from "struct stat"), transmitted as a 64-bit
+   * number in network byte order.  Use 0 if the OS does not
+   * support this, in which case the service must do a full hash
+   * recomputation.
+   */
+  uint64_t inode;
+
+  /**
+   * Hash of the file that we would like to index
+   * (hash over the entire file, as computed by the client).
+   */
+  GNUNET_HashCode file_id;
+
+  /* this is followed by a 0-terminated
+     filename of a file with the hash
+     "file_id" as seen by the client
+     (the name must be an absolute path) */
+
  };
  
  
diff --git a/src/fs/fs_publish.c b/src/fs/fs_publish.c

index 91ca3240a1a4aaf40f19e9af67835aa01c71aba3..13ce4d5aaea493ad9754106215e11608cbd39b06 100644 (file)
--- a/src/fs/fs_publish.c
+++ b/src/fs/fs_publish.c
@@ -26,7 +26,7 @@
   * @author Christian Grothoff
   *
   * TODO:
- * - indexing support
+ * - indexing cleanup: unindex on failure (can wait)
   * - code-sharing with unindex (can wait)
   * - persistence support (can wait)
   * - datastore reservation support (optimization)
@@ -52,6 +52,14 @@
   */
  #define MAX_SBLOCK_SIZE 60000
  
+/**
+ * Blocksize to use when hashing files
+ * for indexing (blocksize for IO, not for
+ * the DBlocks).  Larger blocksizes can
+ * be more efficient but will be more disruptive
+ * as far as the scheduler is concerned.
+ *
+ * Passed by do_upload() as the blocksize argument of
+ * GNUNET_CRYPTO_hash_file() when the hash over a file
+ * that is to be indexed gets computed.
+ * NOTE(review): presumably in bytes (i.e. 1 MiB); confirm
+ * against the GNUNET_CRYPTO_hash_file() documentation.
+ */
+#define HASHING_BLOCKSIZE (1024 * 1024)
  
  /**
   * Main function that performs the upload.
@@ -471,6 +479,7 @@ publish_content (struct GNUNET_FS_PublishContext *sc,
    void *raw_data;
    char *dd;
    struct PutContCtx * dpc_cls;
+  struct OnDemandBlock odb;
  
    // FIXME: figure out how to share this code
    // with unindex!
@@ -593,8 +602,6 @@ publish_content (struct GNUNET_FS_PublishContext *sc,
                              enc);
    // NOTE: this block below is all that really differs
    // between publish/unindex!  Parameterize & move this code!
-  // FIXME: something around here would need to change
-  // for indexing!
    if (NULL == sc->dsh)
      {
        sc->upload_task
@@ -614,20 +621,42 @@ publish_content (struct GNUNET_FS_PublishContext *sc,
        dpc_cls->cont = &do_upload;
        dpc_cls->cont_cls = sc;
        dpc_cls->p = p;
-      GNUNET_DATASTORE_put (sc->dsh,
-                           sc->rid,
-                           &mychk->query,
-                           pt_size,
-                           enc,
-                           (p->current_depth == p->chk_tree_depth) 
-                           ? GNUNET_DATASTORE_BLOCKTYPE_DBLOCK 
-                           : GNUNET_DATASTORE_BLOCKTYPE_IBLOCK,
-                           p->priority,
-                           p->anonymity,
-                           p->expirationTime,
-                           GNUNET_CONSTANTS_SERVICE_TIMEOUT,
-                           &ds_put_cont,
-                           dpc_cls);
+      if ( (p->is_directory) &&
+          (p->data.file.do_index) &&
+          (p->current_depth == p->chk_tree_depth) )
+       {
+         odb.offset = p->publish_offset;
+         odb.file_id = p->data.file.file_id;
+         GNUNET_DATASTORE_put (sc->dsh,
+                               sc->rid,
+                               &mychk->query,
+                               sizeof(struct OnDemandBlock),
+                               &odb,
+                               GNUNET_DATASTORE_BLOCKTYPE_ONDEMAND,
+                               p->priority,
+                               p->anonymity,
+                               p->expirationTime,
+                               GNUNET_CONSTANTS_SERVICE_TIMEOUT,
+                               &ds_put_cont,
+                               dpc_cls);         
+       }
+      else
+       {
+         GNUNET_DATASTORE_put (sc->dsh,
+                               sc->rid,
+                               &mychk->query,
+                               pt_size,
+                               enc,
+                               (p->current_depth == p->chk_tree_depth) 
+                               ? GNUNET_DATASTORE_BLOCKTYPE_DBLOCK 
+                               : GNUNET_DATASTORE_BLOCKTYPE_IBLOCK,
+                               p->priority,
+                               p->anonymity,
+                               p->expirationTime,
+                               GNUNET_CONSTANTS_SERVICE_TIMEOUT,
+                               &ds_put_cont,
+                               dpc_cls);
+       }
      }
    if (p->current_depth == p->chk_tree_depth)
      {
@@ -668,6 +697,153 @@ publish_content (struct GNUNET_FS_PublishContext *sc,
  }
  
  
+
+
+/**
+ * Handle the reply of the "fs" service to our 'start index'
+ * request (or the absence of a reply).  The temporary client
+ * connection is closed in every case.  Unless the service
+ * confirmed the request, a warning is logged and the file is
+ * inserted normally instead of being indexed.
+ *
+ * @param cls closure (of type "struct GNUNET_FS_PublishContext*")
+ * @param msg the response we got, NULL if we did not get one
+ */
+static void
+process_index_start_response (void *cls,
+                              const struct GNUNET_MessageHeader *msg)
+{
+  struct GNUNET_FS_PublishContext *sc = cls;
+  struct GNUNET_FS_FileInformation *p;
+  const char *reason;
+
+  /* the connection was only needed for this one request */
+  GNUNET_CLIENT_disconnect (sc->client);
+  sc->client = NULL;
+  p = sc->fi_pos;
+
+  /* "reason" stays NULL if (and only if) the service agreed */
+  reason = NULL;
+  if (NULL == msg)
+    {
+      reason = _("timeout on index-start request to `fs' service");
+    }
+  else if (GNUNET_MESSAGE_TYPE_FS_INDEX_START_OK != ntohs (msg->type))
+    {
+      const char *emsg = (const char *) &msg[1];
+      uint16_t msize = ntohs (msg->size);
+
+      /* only trust the error text if the payload is a
+         non-empty, 0-terminated string */
+      if ( (msize <= sizeof (struct GNUNET_MessageHeader)) ||
+           ('\0' != emsg[msize - sizeof (struct GNUNET_MessageHeader) - 1]) )
+        emsg = gettext_noop ("unknown error");
+      reason = gettext (emsg);
+    }
+
+  if (NULL != reason)
+    {
+      GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
+                  _("Can not index file `%s': %s.  Will try to insert instead.\n"),
+                  p->data.file.filename,
+                  reason);
+      /* fall back to regular insertion */
+      p->data.file.do_index = GNUNET_NO;
+    }
+  /* on success this continues with indexing,
+     otherwise with insertion */
+  publish_content (sc, p);
+}
+
+
+#if LINUX
+#include <sys/statvfs.h>
+#endif
+
+/**
+ * Function called once the hash computation over an
+ * indexed file has completed.
+ *
+ * On success the hash is remembered in the file information
+ * and an IndexStartMessage (hash, device/inode where available,
+ * 0-terminated filename) is sent to the "fs" service; the reply
+ * is handled by process_index_start_response().  If the hash
+ * could not be computed, the filename does not fit into a
+ * message or the service cannot be reached, a warning is logged
+ * and the file is inserted normally instead of being indexed.
+ *
+ * @param cls closure, our publishing context
+ * @param res resulting hash, NULL on error
+ */
+static void 
+hash_for_index_cb (void *cls,
+                   const GNUNET_HashCode *
+                   res)
+{
+  struct GNUNET_FS_PublishContext *sc = cls;
+  struct GNUNET_FS_FileInformation *p;
+  struct IndexStartMessage *ism;
+  size_t slen;
+  struct GNUNET_CLIENT_Connection *client;
+#if LINUX
+  struct stat sbuf;
+  struct statvfs fbuf;
+#endif
+
+  p = sc->fi_pos;
+  if (NULL == res) 
+    {
+      GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
+                  _("Can not index file `%s': %s.  Will try to insert instead.\n"),
+                  p->data.file.filename,
+                  _("failed to compute hash"));
+      p->data.file.do_index = GNUNET_NO;
+      publish_content (sc, p);
+      return;
+    }
+  /* the 0-terminator is transmitted as part of the filename */
+  slen = strlen (p->data.file.filename) + 1;
+  if (slen > GNUNET_SERVER_MAX_MESSAGE_SIZE - sizeof(struct IndexStartMessage))
+    {
+      GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
+                  _("Can not index file `%s': %s.  Will try to insert instead.\n"),
+                  p->data.file.filename,
+                  _("filename too long"));
+      p->data.file.do_index = GNUNET_NO;
+      publish_content (sc, p);
+      return;
+    }
+  client = GNUNET_CLIENT_connect (sc->h->sched,
+                                  "fs",
+                                  sc->h->cfg);
+  if (NULL == client)
+    {
+      GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
+                  _("Can not index file `%s': %s.  Will try to insert instead.\n"),
+                  p->data.file.filename,
+                  _("could not connect to `fs' service"));
+      p->data.file.do_index = GNUNET_NO;
+      publish_content (sc, p);
+      return;
+    }
+  /* remember the hash; publish_content() needs it later
+     for the on-demand blocks */
+  p->data.file.file_id = *res;
+  ism = GNUNET_malloc (sizeof(struct IndexStartMessage) +
+                       slen);
+  ism->header.size = htons(sizeof(struct IndexStartMessage) +
+                           slen);
+  ism->header.type = htons(GNUNET_MESSAGE_TYPE_FS_INDEX_START);
+  /* the service is supposed to verify that the named file
+     has this hash, so the hash must be part of the request */
+  ism->file_id = *res;
+  /* FIXME: activate this on other OSes that
+     support it (or something very similar; make
+     sure to also adjust corresponding code
+     on the service-side) */
+  /* FIXME: the block below should probably be
+     abstracted into a function in the DISK API */
+  /* NOTE(review): if the block below is not compiled in or
+     the calls fail, "device" and "inode" keep whatever
+     GNUNET_malloc returned; this presumably relies on
+     GNUNET_malloc zero-filling the buffer (0 means "not
+     supported, do a full hash recomputation") -- confirm */
+#if LINUX
+  if ( (0 == stat(p->data.file.filename,
+                  &sbuf)) &&
+       (0 == statvfs (p->data.file.filename,
+                      &fbuf) ) )
+    {
+      ism->device = htonl ((uint32_t) fbuf.f_fsid);
+      ism->inode = GNUNET_htonll( (uint64_t) sbuf.st_ino);
+    }
+#endif
+  memcpy (&ism[1],
+          p->data.file.filename,
+          slen);
+  /* kept in the context so that the response handler
+     can close the connection again */
+  sc->client = client;
+  GNUNET_CLIENT_transmit_and_get_response (client,
+                                           &ism->header,
+                                           GNUNET_TIME_UNIT_FOREVER_REL,
+                                           &process_index_start_response,
+                                           sc);
+  GNUNET_free (ism);
+}
+
+
  /**
   * Main function that performs the upload.
   * @param cls "struct GNUNET_FS_PublishContext" identifies the upload
@@ -744,9 +920,23 @@ do_upload (void *cls,
    if ( (!p->is_directory) &&
         (p->data.file.do_index) )
      {
-      // FIXME: need to pre-compute hash over
-      // the entire file and ask FS to prepare
-      // for indexing!
+      if (NULL == p->data.file.filename)
+       {
+         p->data.file.do_index = GNUNET_NO;
+         GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
+                     _("Can not index file `%s': %s.  Will try to insert instead.\n"),
+                     "<no-name>",
+                     _("needs to be an actual file"));
+         publish_content (sc, p);
+         return;
+       }      
+      GNUNET_CRYPTO_hash_file (sc->h->sched,
+                              GNUNET_SCHEDULER_PRIORITY_IDLE,
+                              GNUNET_NO,
+                              p->data.file.filename,
+                              HASHING_BLOCKSIZE,
+                              &hash_for_index_cb,
+                              sc);
        return;
      }
    publish_content (sc, p);
diff --git a/src/include/gnunet_datastore_service.h b/src/include/gnunet_datastore_service.h

index 3805dea97a621436d297e24a0d96c9d90ca3d307..acd9af1dedfd6de0b75492cf0f34ff174e5821c8 100644 (file)
--- a/src/include/gnunet_datastore_service.h
+++ b/src/include/gnunet_datastore_service.h
@@ -46,7 +46,8 @@ extern "C"
  #define GNUNET_DATASTORE_BLOCKTYPE_IBLOCK 2
  #define GNUNET_DATASTORE_BLOCKTYPE_KBLOCK 3
  #define GNUNET_DATASTORE_BLOCKTYPE_SBLOCK 4
-#define GNUNET_DATASTORE_BLOCKTYPE_SKBLOCK 5
+#define GNUNET_DATASTORE_BLOCKTYPE_ONDEMAND 5
+#define GNUNET_DATASTORE_BLOCKTYPE_SKBLOCK 6 /* not yet used */
  
  /**
   * Handle to the datastore service.
diff --git a/src/include/gnunet_protocols.h b/src/include/gnunet_protocols.h

index 419bbe28d7baa4204aab65b24d081c00ec96ccf8..686205c3160bf2672d79b58edc430ddc343a4d32 100644 (file)
--- a/src/include/gnunet_protocols.h
+++ b/src/include/gnunet_protocols.h
@@ -367,6 +367,24 @@ extern "C"
   */
  #define GNUNET_MESSAGE_TYPE_DATASTORE_DROP 102
  
+
+/**
+ * Message sent by fs client to start indexing.
+ */
+#define GNUNET_MESSAGE_TYPE_FS_INDEX_START 128
+
+/**
+ * Affirmative response to a request to start indexing.
+ */
+#define GNUNET_MESSAGE_TYPE_FS_INDEX_START_OK 129
+
+
+/**
+ * Negative response to a request to start indexing
+ * (the request is refused).
+ */
+#define GNUNET_MESSAGE_TYPE_FS_INDEX_START_FAILED 130
+
  /*
    TODO:
    - DV
author	Christian Grothoff <christian@grothoff.org>
	Sun, 30 Aug 2009 21:07:10 +0000 (21:07 +0000)
committer	Christian Grothoff <christian@grothoff.org>
	Sun, 30 Aug 2009 21:07:10 +0000 (21:07 +0000)
TODO		patch \| blob \| history
src/fs/fs.h		patch \| blob \| history
src/fs/fs_publish.c		patch \| blob \| history
src/include/gnunet_datastore_service.h		patch \| blob \| history
src/include/gnunet_protocols.h		patch \| blob \| history