unzip: handle "central directory"
authorDenis Vlasenko <vda.linux@googlemail.com>
Sun, 21 Sep 2008 01:01:46 +0000 (01:01 -0000)
committerDenis Vlasenko <vda.linux@googlemail.com>
Sun, 21 Sep 2008 01:01:46 +0000 (01:01 -0000)
 needed for OpenOffice, gmail attachment .zips etc
 conditional on CONFIG_DESKTOP

function                                             old     new   delta
unzip_main                                          1643    1939    +296
find_cds_offset                                        -     173    +173
unzip_skip                                            11      16      +5
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 2/0 up/down: 474/0)             Total: 474 bytes

archival/unzip.c

index 1c9bc246c71accea6652db50fcc794c80c7df53e..e468ff451db4dca764600812bcf156ab1f1ccbbf 100644 (file)
  */
 
 /* TODO
- * Endian issues
  * Zip64 + other methods
- * Improve handling of zip format, ie.
- * - deferred CRC, comp. & uncomp. lengths (zip header flags bit 3)
- * - unix file permissions, etc.
- * - central directory
  */
 
 #include "libbb.h"
@@ -31,12 +26,12 @@ enum {
 #if BB_BIG_ENDIAN
        ZIP_FILEHEADER_MAGIC = 0x504b0304,
        ZIP_CDS_MAGIC        = 0x504b0102,
-       ZIP_CDS_END_MAGIC    = 0x504b0506,
+       ZIP_CDE_MAGIC        = 0x504b0506,
        ZIP_DD_MAGIC         = 0x504b0708,
 #else
        ZIP_FILEHEADER_MAGIC = 0x04034b50,
        ZIP_CDS_MAGIC        = 0x02014b50,
-       ZIP_CDS_END_MAGIC    = 0x06054b50,
+       ZIP_CDE_MAGIC        = 0x06054b50,
        ZIP_DD_MAGIC         = 0x08074b50,
 #endif
 };
@@ -46,16 +41,16 @@ enum {
 typedef union {
        uint8_t raw[ZIP_HEADER_LEN];
        struct {
-               uint16_t version;                       /* 0-1 */
-               uint16_t flags;                         /* 2-3 */
-               uint16_t method;                        /* 4-5 */
-               uint16_t modtime;                       /* 6-7 */
-               uint16_t moddate;                       /* 8-9 */
-               uint32_t crc32 PACKED;        /* 10-13 */
-               uint32_t cmpsize PACKED;      /* 14-17 */
-               uint32_t ucmpsize PACKED;     /* 18-21 */
-               uint16_t filename_len;                  /* 22-23 */
-               uint16_t extra_len;                     /* 24-25 */
+               uint16_t version;               /* 0-1 */
+               uint16_t flags;                 /* 2-3 */
+               uint16_t method;                /* 4-5 */
+               uint16_t modtime;               /* 6-7 */
+               uint16_t moddate;               /* 8-9 */
+               uint32_t crc32 PACKED;          /* 10-13 */
+               uint32_t cmpsize PACKED;        /* 14-17 */
+               uint32_t ucmpsize PACKED;       /* 18-21 */
+               uint16_t filename_len;          /* 22-23 */
+               uint16_t extra_len;             /* 24-25 */
        } formatted PACKED;
 } zip_header_t; /* PACKED - gcc 4.2.1 doesn't like it (spews warning) */
 
@@ -65,11 +60,11 @@ typedef union {
  */
 struct BUG_zip_header_must_be_26_bytes {
        char BUG_zip_header_must_be_26_bytes[
-               offsetof(zip_header_t, formatted.extra_len) + 2 ==
-                       ZIP_HEADER_LEN ? 1 : -1];
+               offsetof(zip_header_t, formatted.extra_len) + 2
+                       == ZIP_HEADER_LEN ? 1 : -1];
 };
 
-#define FIX_ENDIANNESS(zip_header) do { \
+#define FIX_ENDIANNESS_ZIP(zip_header) do { \
        (zip_header).formatted.version      = SWAP_LE16((zip_header).formatted.version     ); \
        (zip_header).formatted.flags        = SWAP_LE16((zip_header).formatted.flags       ); \
        (zip_header).formatted.method       = SWAP_LE16((zip_header).formatted.method      ); \
@@ -82,9 +77,138 @@ struct BUG_zip_header_must_be_26_bytes {
        (zip_header).formatted.extra_len    = SWAP_LE16((zip_header).formatted.extra_len   ); \
 } while (0)
 
-static void unzip_skip(int fd, off_t skip)
+#define CDS_HEADER_LEN 42
+
+typedef union {
+       uint8_t raw[CDS_HEADER_LEN];
+       struct {
+               /* uint32_t signature; 50 4b 01 02 */
+               uint16_t version_made_by;       /* 0-1 */
+               uint16_t version_needed;        /* 2-3 */
+               uint16_t cds_flags;             /* 4-5 */
+               uint16_t method;                /* 6-7 */
+               uint16_t mtime;                 /* 8-9 */
+               uint16_t mdate;                 /* 10-11 */
+               uint32_t crc32;                 /* 12-15 */
+               uint32_t cmpsize;               /* 16-19 */
+               uint32_t ucmpsize;              /* 20-23 */
+               uint16_t file_name_length;      /* 24-25 */
+               uint16_t extra_field_length;    /* 26-27 */
+               uint16_t file_comment_length;   /* 28-29 */
+               uint16_t disk_number_start;     /* 30-31 */
+               uint16_t internal_file_attributes; /* 32-33 */
+               uint32_t external_file_attributes PACKED; /* 34-37 */
+               uint32_t relative_offset_of_local_header PACKED; /* 38-41 */
+       } formatted PACKED;
+} cds_header_t;
+
+struct BUG_cds_header_must_be_42_bytes {
+       char BUG_cds_header_must_be_42_bytes[
+               offsetof(cds_header_t, formatted.relative_offset_of_local_header) + 4
+                       == CDS_HEADER_LEN ? 1 : -1];
+};
+
+#define FIX_ENDIANNESS_CDS(cds_header) do { \
+       (cds_header).formatted.crc32        = SWAP_LE32((cds_header).formatted.crc32       ); \
+       (cds_header).formatted.cmpsize      = SWAP_LE32((cds_header).formatted.cmpsize     ); \
+       (cds_header).formatted.ucmpsize     = SWAP_LE32((cds_header).formatted.ucmpsize    ); \
+       (cds_header).formatted.file_name_length = SWAP_LE16((cds_header).formatted.file_name_length); \
+       (cds_header).formatted.extra_field_length = SWAP_LE16((cds_header).formatted.extra_field_length); \
+       (cds_header).formatted.file_comment_length = SWAP_LE16((cds_header).formatted.file_comment_length); \
+} while (0)
+
+#define CDE_HEADER_LEN 16
+
+typedef union {
+       uint8_t raw[CDE_HEADER_LEN];
+       struct {
+               /* uint32_t signature; 50 4b 05 06 */
+               uint16_t this_disk_no;
+               uint16_t disk_with_cds_no;
+               uint16_t cds_entries_on_this_disk;
+               uint16_t cds_entries_total;
+               uint32_t cds_size;
+               uint32_t cds_offset;
+               /* uint16_t file_comment_length; */
+               /* .ZIP file comment (variable size) */
+       } formatted PACKED;
+} cde_header_t;
+
+struct BUG_cde_header_must_be_16_bytes {
+       char BUG_cde_header_must_be_16_bytes[
+               sizeof(cde_header_t) == CDE_HEADER_LEN ? 1 : -1];
+};
+
+#define FIX_ENDIANNESS_CDE(cde_header) do { \
+       (cde_header).formatted.cds_offset = SWAP_LE16((cde_header).formatted.cds_offset); \
+} while (0)
+
+enum { zip_fd = 3 };
+
+
+#if ENABLE_DESKTOP
+/* NB: does not preserve file position! */
+static uint32_t find_cds_offset(void)
+{
+       unsigned char buf[1024];
+       cde_header_t cde_header;
+       unsigned char *p;
+       off_t end;
+
+       end = xlseek(zip_fd, 0, SEEK_END);
+       if (end < 1024)
+               end = 1024;
+       end -= 1024;
+       xlseek(zip_fd, end, SEEK_SET);
+       full_read(zip_fd, buf, 1024);
+
+       p = buf;
+       while (p <= buf + 1024 - CDE_HEADER_LEN - 4) {
+               if (*p != 'P') {
+                       p++;
+                       continue;
+               }
+               if (*++p != 'K')
+                       continue;
+               if (*++p != 5)
+                       continue;
+               if (*++p != 6)
+                       continue;
+               /* we found CDE! */
+               memcpy(cde_header.raw, p + 1, CDE_HEADER_LEN);
+               FIX_ENDIANNESS_CDE(cde_header);
+               return cde_header.formatted.cds_offset;
+       }
+       bb_error_msg_and_die("can't find file table");
+};
+
+static uint32_t read_next_cds(int count_m1, uint32_t cds_offset, cds_header_t *cds_ptr)
 {
-       bb_copyfd_exact_size(fd, -1, skip);
+       off_t org;
+
+       org = xlseek(zip_fd, 0, SEEK_CUR);
+
+       if (!cds_offset)
+               cds_offset = find_cds_offset();
+
+       while (count_m1-- >= 0) {
+               xlseek(zip_fd, cds_offset + 4, SEEK_SET);
+               xread(zip_fd, cds_ptr->raw, CDS_HEADER_LEN);
+               FIX_ENDIANNESS_CDS(*cds_ptr);
+               cds_offset += 4 + CDS_HEADER_LEN
+                       + cds_ptr->formatted.file_name_length
+                       + cds_ptr->formatted.extra_field_length
+                       + cds_ptr->formatted.file_comment_length;
+       }
+
+       xlseek(zip_fd, org, SEEK_SET);
+       return cds_offset;
+};
+#endif
+
+static void unzip_skip(off_t skip)
+{
+       bb_copyfd_exact_size(zip_fd, -1, skip);
 }
 
 static void unzip_create_leading_dirs(const char *fn)
@@ -97,17 +221,17 @@ static void unzip_create_leading_dirs(const char *fn)
        free(name);
 }
 
-static void unzip_extract(zip_header_t *zip_header, int src_fd, int dst_fd)
+static void unzip_extract(zip_header_t *zip_header, int dst_fd)
 {
        if (zip_header->formatted.method == 0) {
                /* Method 0 - stored (not compressed) */
                off_t size = zip_header->formatted.ucmpsize;
                if (size)
-                       bb_copyfd_exact_size(src_fd, dst_fd, size);
+                       bb_copyfd_exact_size(zip_fd, dst_fd, size);
        } else {
                /* Method 8 - inflate */
                inflate_unzip_result res;
-               if (inflate_unzip(&res, zip_header->formatted.cmpsize, src_fd, dst_fd) < 0)
+               if (inflate_unzip(&res, zip_header->formatted.cmpsize, zip_fd, dst_fd) < 0)
                        bb_error_msg_and_die("inflate error");
                /* Validate decompression - crc */
                if (zip_header->formatted.crc32 != (res.crc ^ 0xffffffffL)) {
@@ -131,9 +255,12 @@ int unzip_main(int argc, char **argv)
        smallint verbose = 1;
        smallint listing = 0;
        smallint overwrite = O_PROMPT;
+#if ENABLE_DESKTOP
+       uint32_t cds_offset;
+       unsigned cds_entries;
+#endif
        unsigned total_size;
        unsigned total_entries;
-       int src_fd = -1;
        int dst_fd = -1;
        char *src_fn = NULL;
        char *dst_fn = NULL;
@@ -221,13 +348,14 @@ int unzip_main(int argc, char **argv)
 
        /* Open input file */
        if (LONE_DASH(src_fn)) {
-               src_fd = STDIN_FILENO;
+               xdup2(STDIN_FILENO, zip_fd);
                /* Cannot use prompt mode since zip data is arriving on STDIN */
                if (overwrite == O_PROMPT)
                        overwrite = O_NEVER;
        } else {
                static const char extn[][5] = {"", ".zip", ".ZIP"};
                int orig_src_fn_len = strlen(src_fn);
+               int src_fd = -1;
 
                for (i = 0; (i < 3) && (src_fd == -1); i++) {
                        strcpy(src_fn + orig_src_fn_len, extn[i]);
@@ -237,6 +365,7 @@ int unzip_main(int argc, char **argv)
                        src_fn[orig_src_fn_len] = '\0';
                        bb_error_msg_and_die("can't open %s, %s.zip, %s.ZIP", src_fn, src_fn, src_fn);
                }
+               xmove_fd(src_fd, zip_fd);
        }
 
        /* Change dir if necessary */
@@ -253,37 +382,63 @@ int unzip_main(int argc, char **argv)
 
        total_size = 0;
        total_entries = 0;
+#if ENABLE_DESKTOP
+       cds_entries = 0;
+       cds_offset = 0;
+#endif
        while (1) {
                uint32_t magic;
 
                /* Check magic number */
-               xread(src_fd, &magic, 4);
+               xread(zip_fd, &magic, 4);
                /* Central directory? It's at the end, so exit */
                if (magic == ZIP_CDS_MAGIC)
                        break;
+#if ENABLE_DESKTOP
+               /* Data descriptor? It was a streaming file, go on */
+               if (magic == ZIP_DD_MAGIC) {
+                       /* skip over duplicate crc32, cmpsize and ucmpsize */
+                       unzip_skip(3 * 4);
+                       continue;
+               }
+#endif
                if (magic != ZIP_FILEHEADER_MAGIC)
                        bb_error_msg_and_die("invalid zip magic %08X", (int)magic);
 
                /* Read the file header */
-               xread(src_fd, zip_header.raw, ZIP_HEADER_LEN);
-               FIX_ENDIANNESS(zip_header);
+               xread(zip_fd, zip_header.raw, ZIP_HEADER_LEN);
+               FIX_ENDIANNESS_ZIP(zip_header);
                if ((zip_header.formatted.method != 0) && (zip_header.formatted.method != 8)) {
                        bb_error_msg_and_die("unsupported method %d", zip_header.formatted.method);
                }
-               if (zip_header.formatted.flags & (0x0008|0x0001)) {
+#if !ENABLE_DESKTOP
+               if (zip_header.formatted.flags & 0x0009) {
+                       bb_error_msg_and_die("zip flags 1 and 8 are not supported");
+               }
+#else
+               if (zip_header.formatted.flags & 0x0001) {
                        /* 0x0001 - encrypted */
+                       bb_error_msg_and_die("zip flag 1 (encryption) is not supported");
+               }
+               if (zip_header.formatted.flags & 0x0008) {
+                       cds_header_t cds_header;
                        /* 0x0008 - streaming. [u]cmpsize can be reliably gotten
                         * only from Central Directory. See unzip_doc.txt */
-                       bb_error_msg_and_die("zip flags 8 and 1 are not supported");
+                       cds_offset = read_next_cds(total_entries - cds_entries, cds_offset, &cds_header);
+                       cds_entries = total_entries + 1;
+                       zip_header.formatted.crc32    = cds_header.formatted.crc32;
+                       zip_header.formatted.cmpsize  = cds_header.formatted.cmpsize;
+                       zip_header.formatted.ucmpsize = cds_header.formatted.ucmpsize;
                }
+#endif
 
                /* Read filename */
                free(dst_fn);
                dst_fn = xzalloc(zip_header.formatted.filename_len + 1);
-               xread(src_fd, dst_fn, zip_header.formatted.filename_len);
+               xread(zip_fd, dst_fn, zip_header.formatted.filename_len);
 
                /* Skip extra header bytes */
-               unzip_skip(src_fd, zip_header.formatted.extra_len);
+               unzip_skip(zip_header.formatted.extra_len);
 
                /* Filter zip entries */
                if (find_list_entry(zreject, dst_fn)
@@ -304,7 +459,6 @@ int unzip_main(int argc, char **argv)
                                           (dostime & 0x000007e0) >> 5,
                                           dst_fn);
                                        total_size += zip_header.formatted.ucmpsize;
-                                       total_entries++;
                                } else {
                                        /* short listing -- filenames only */
                                        puts(dst_fn);
@@ -315,7 +469,7 @@ int unzip_main(int argc, char **argv)
                        } else if (last_char_is(dst_fn, '/')) { /* Extract directory */
                                if (stat(dst_fn, &stat_buf) == -1) {
                                        if (errno != ENOENT) {
-                                               bb_perror_msg_and_die("cannot stat '%s'", dst_fn);
+                                               bb_perror_msg_and_die("can't stat '%s'", dst_fn);
                                        }
                                        if (verbose) {
                                                printf("   creating: %s\n", dst_fn);
@@ -335,7 +489,7 @@ int unzip_main(int argc, char **argv)
  check_file:
                                if (stat(dst_fn, &stat_buf) == -1) { /* File does not exist */
                                        if (errno != ENOENT) {
-                                               bb_perror_msg_and_die("cannot stat '%s'", dst_fn);
+                                               bb_perror_msg_and_die("can't stat '%s'", dst_fn);
                                        }
                                        i = 'y';
                                } else { /* File already exists */
@@ -347,7 +501,7 @@ int unzip_main(int argc, char **argv)
                                                } else {
                                                        printf("replace %s? [y]es, [n]o, [A]ll, [N]one, [r]ename: ", dst_fn);
                                                        if (!fgets(key_buf, sizeof(key_buf), stdin)) {
-                                                               bb_perror_msg_and_die("cannot read input");
+                                                               bb_perror_msg_and_die("can't read input");
                                                        }
                                                        i = key_buf[0];
                                                }
@@ -368,7 +522,7 @@ int unzip_main(int argc, char **argv)
                        if (verbose) {
                                printf("  inflating: %s\n", dst_fn);
                        }
-                       unzip_extract(&zip_header, src_fd, dst_fd);
+                       unzip_extract(&zip_header, dst_fd);
                        if (dst_fd != STDOUT_FILENO) {
                                /* closing STDOUT is potentially bad for future business */
                                close(dst_fd);
@@ -379,14 +533,14 @@ int unzip_main(int argc, char **argv)
                        overwrite = O_NEVER;
                case 'n':
                        /* Skip entry data */
-                       unzip_skip(src_fd, zip_header.formatted.cmpsize);
+                       unzip_skip(zip_header.formatted.cmpsize);
                        break;
 
                case 'r':
                        /* Prompt for new name */
                        printf("new name: ");
                        if (!fgets(key_buf, sizeof(key_buf), stdin)) {
-                               bb_perror_msg_and_die("cannot read input");
+                               bb_perror_msg_and_die("can't read input");
                        }
                        free(dst_fn);
                        dst_fn = xstrdup(key_buf);
@@ -398,12 +552,7 @@ int unzip_main(int argc, char **argv)
                        goto check_file;
                }
 
-// Looks like bug (data descriptor cannot be identified this way)
-//             /* Data descriptor section */
-//             if (zip_header.formatted.flags & 4) {
-//                     /* skip over duplicate crc, compressed size and uncompressed size */
-//                     unzip_skip(src_fd, 12);
-//             }
+               total_entries++;
        }
 
        if (listing && verbose) {