Removes stray empty line from code
[oweals/busybox.git] / networking / wget.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wget - retrieve a file using HTTP or FTP
4  *
5  * Chip Rosenthal Covad Communications <chip@laserlink.net>
6  * Licensed under GPLv2, see file LICENSE in this source tree.
7  *
8  * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
9  * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
10  */
11
12 //usage:#define wget_trivial_usage
13 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
14 //usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
15 //usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
16 /* Since we ignore these opts, we don't show them in --help */
17 /* //usage:    "        [--no-check-certificate] [--no-cache]" */
18 //usage:       "        [-U|--user-agent AGENT]" IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
19 //usage:        )
20 //usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
21 //usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
22 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
23 //usage:        )
24 //usage:#define wget_full_usage "\n\n"
25 //usage:       "Retrieve files via HTTP or FTP\n"
26 //usage:     "\n        -s      Spider mode - only check file existence"
27 //usage:     "\n        -c      Continue retrieval of aborted transfer"
28 //usage:     "\n        -q      Quiet"
29 //usage:     "\n        -P DIR  Save to DIR (default .)"
30 //usage:        IF_FEATURE_WGET_TIMEOUT(
31 //usage:     "\n        -T SEC  Network read timeout is SEC seconds"
32 //usage:        )
33 //usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
34 //usage:     "\n        -U STR  Use STR for User-Agent header"
35 //usage:     "\n        -Y      Use proxy ('on' or 'off')"
36
37 #include "libbb.h"
38
/*
 * I/O tracing helpers.
 *
 * Change "#if 0" to "#if 1" to have every line sent to, or received from,
 * the server echoed through bb_error_msg(). In the default build
 * log_io() compiles to nothing and SENDFMT() is a plain fprintf().
 */
#if 0
# define log_io(...) bb_error_msg(__VA_ARGS__)
/*
 * No ';' after "while (0)": the caller supplies it. A trailing ';' here
 * would turn "if (x) SENDFMT(...); else ..." into a syntax error.
 */
# define SENDFMT(fp, fmt, ...) \
	do { \
		log_io("> " fmt, ##__VA_ARGS__); \
		fprintf(fp, fmt, ##__VA_ARGS__); \
	} while (0)
#else
# define log_io(...) ((void)0)
# define SENDFMT(fp, fmt, ...) fprintf(fp, fmt, ##__VA_ARGS__)
#endif
50
51
/* One URL broken into its components; filled in by parse_url(). */
struct host_info {
	/* Heap buffer backing host/path; parse_url() frees it before reuse */
	char *allocated;
	/* Request path: points into the buffer above, or at "" */
	const char *path;
	/* "user[:password]", heap-allocated; stays NULL if the URL had none */
	char *user;
	/* One of P_FTP, P_HTTP, P_HTTPS below */
	const char *protocol;
	/* Host part, points into the buffer above */
	char *host;
	/* Port as returned by bb_lookup_port() for the protocol */
	int port;
};

/*
 * Protocol tags. host_info.protocol is compared against these by address
 * (e.g. "target.protocol == P_FTP"), so they must stay distinct objects.
 */
static const char P_FTP[] = "ftp";
static const char P_HTTP[] = "http";
static const char P_HTTPS[] = "https";
63
#if ENABLE_FEATURE_WGET_LONG_OPTIONS
/*
 * A header given by the user suppresses our built-in header of the
 * same name. One flag bit per header we would otherwise generate;
 * the authentication bits collapse to 0 when that feature is off.
 */
enum {
	HDR_HOST       = 1 << 0,
	HDR_USER_AGENT = 1 << 1,
	HDR_RANGE      = 1 << 2,
	HDR_AUTH       = ENABLE_FEATURE_WGET_AUTHENTICATION * (1 << 3),
	HDR_PROXY_AUTH = ENABLE_FEATURE_WGET_AUTHENTICATION * (1 << 4),
};
/*
 * NUL-separated list of the header names above.
 * NOTE(review): entry N presumably corresponds to bit N of the enum -
 * confirm against the code that fills G.user_headers.
 */
static const char wget_user_headers[] ALIGN1 =
	"Host:\0"
	"User-Agent:\0"
	"Range:\0"
# if ENABLE_FEATURE_WGET_AUTHENTICATION
	"Authorization:\0"
	"Proxy-Authorization:\0"
# endif
	;
/* Non-zero if the user supplied the respective header */
# define USR_HEADER_HOST       (G.user_headers & HDR_HOST)
# define USR_HEADER_USER_AGENT (G.user_headers & HDR_USER_AGENT)
# define USR_HEADER_RANGE      (G.user_headers & HDR_RANGE)
# define USR_HEADER_AUTH       (G.user_headers & HDR_AUTH)
# define USR_HEADER_PROXY_AUTH (G.user_headers & HDR_PROXY_AUTH)
#else
/* Without long options there is no --header, hence no user headers */
# define USR_HEADER_HOST       0
# define USR_HEADER_USER_AGENT 0
# define USR_HEADER_RANGE      0
# define USR_HEADER_AUTH       0
# define USR_HEADER_PROXY_AUTH 0
#endif
94
95 /* Globals */
96 struct globals {
97         off_t content_len;        /* Content-length of the file */
98         off_t beg_range;          /* Range at which continue begins */
99 #if ENABLE_FEATURE_WGET_STATUSBAR
100         off_t transferred;        /* Number of bytes transferred so far */
101         const char *curfile;      /* Name of current file being transferred */
102         bb_progress_t pmt;
103 #endif
104         char *dir_prefix;
105 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
106         char *post_data;
107         char *extra_headers;
108         unsigned char user_headers; /* Headers mentioned by the user */
109 #endif
110         char *fname_out;        /* where to direct output (-O) */
111         const char *proxy_flag; /* Use proxies if env vars are set */
112         const char *user_agent; /* "User-Agent" header field */
113 #if ENABLE_FEATURE_WGET_TIMEOUT
114         unsigned timeout_seconds;
115         bool connecting;
116 #endif
117         int output_fd;
118         int o_flags;
119         smallint chunked;         /* chunked transfer encoding */
120         smallint got_clen;        /* got content-length: from server  */
121         /* Local downloads do benefit from big buffer.
122          * With 512 byte buffer, it was measured to be
123          * an order of magnitude slower than with big one.
124          */
125         uint64_t just_to_align_next_member;
126         char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
127 } FIX_ALIASING;
128 #define G (*ptr_to_globals)
129 #define INIT_G() do { \
130         SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
131 } while (0)
132 #define FINI_G() do { \
133         FREE_PTR_TO_GLOBALS(); \
134 } while (0)
135
136
137 /* Must match option string! */
138 enum {
139         WGET_OPT_CONTINUE   = (1 << 0),
140         WGET_OPT_SPIDER     = (1 << 1),
141         WGET_OPT_QUIET      = (1 << 2),
142         WGET_OPT_OUTNAME    = (1 << 3),
143         WGET_OPT_PREFIX     = (1 << 4),
144         WGET_OPT_PROXY      = (1 << 5),
145         WGET_OPT_USER_AGENT = (1 << 6),
146         WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
147         WGET_OPT_RETRIES    = (1 << 8),
148         WGET_OPT_PASSIVE    = (1 << 9),
149         WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
150         WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
151 };
152
/* Argument of progress_meter(): which phase of the download we are in */
enum {
	PROGRESS_START = -1,	/* initialise the bar, then draw it */
	PROGRESS_END   = 0,	/* draw it one last time and release it */
	PROGRESS_BUMP  = 1,	/* periodic redraw */
};
158 #if ENABLE_FEATURE_WGET_STATUSBAR
159 static void progress_meter(int flag)
160 {
161         if (option_mask32 & WGET_OPT_QUIET)
162                 return;
163
164         if (flag == PROGRESS_START)
165                 bb_progress_init(&G.pmt, G.curfile);
166
167         bb_progress_update(&G.pmt,
168                         G.beg_range,
169                         G.transferred,
170                         (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
171         );
172
173         if (flag == PROGRESS_END) {
174                 bb_progress_free(&G.pmt);
175                 bb_putchar_stderr('\n');
176                 G.transferred = 0;
177         }
178 }
179 #else
180 static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
181 #endif
182
183
/* IPv6 knows scoped address types i.e. link and site local addresses. Link
 * local addresses can have a scope identifier to specify the
 * interface/link an address is valid on (e.g. fe80::1%eth0). This scope
 * identifier is only valid on a single node.
 *
 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 * unless all nodes agree on the semantic. Apache e.g. regards zone identifiers
 * in the Host header as invalid requests, see
 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 *
 * Edits host in place: "[fe80::1%eth0]:80" becomes "[fe80::1]:80".
 * Anything that is not a bracketed address with a '%' inside the
 * brackets is left untouched.
 */
static void strip_ipv6_scope_id(char *host)
{
	char *scope, *cp;

	/* bbox wget actually handles IPv6 addresses without [], like
	 * wget "http://::1/xxx", but this is not standard.
	 * To save code, _here_ we do not support it. */

	if (host[0] != '[')
		return; /* not IPv6 */

	scope = strchr(host, '%');
	if (!scope)
		return;

	/* Remove the IPv6 zone identifier from the host address */
	cp = strchr(host, ']');
	if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
		/* malformed address (not "[xx]:nn" or "[xx]") */
		return;
	}

	/* A '%' located after the ']' (as in "[::1]:%25") is not a zone id.
	 * Copying "]..." forward onto it would overlap the wrong way round
	 * and run off the end of the buffer. */
	if (scope > cp)
		return;

	/* cp points to "]...", scope points to "%eth0]..." */
	memmove(scope, cp, strlen(cp) + 1);
}
219
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/*
 * Base64-encode str into G.wget_buf and return G.wget_buf.
 * Over-long input is silently truncated so that the encoded form
 * still fits into the buffer.
 */
static char *base64enc(const char *str)
{
	/* paranoia: largest input we are willing to encode */
	const size_t limit = sizeof(G.wget_buf)/4*3 - 10;
	unsigned len = strlen(str);

	if (len > limit)
		len = limit;
	bb_uuencode(G.wget_buf, str, len, bb_uuenc_tbl_base64);
	return G.wget_buf;
}
#endif
231
/*
 * Cut s (in place) at its first byte below ' ', i.e. at the first
 * control character or at the terminating NUL. Bytes >= 0x80 are kept.
 * Returns s, which makes it convenient inside error messages.
 */
static char* sanitize_string(char *s)
{
	unsigned char *end;

	for (end = (unsigned char *) s; *end >= ' '; end++)
		continue;
	*end = '\0';
	return s;
}
240
#if ENABLE_FEATURE_WGET_TIMEOUT
/*
 * SIGALRM handler: aborts the program if the alarm fires while
 * open_socket() is still connecting; otherwise does nothing.
 */
static void alarm_handler(int sig UNUSED_PARAM)
{
	if (!G.connecting)
		return;
	/* This is theoretically unsafe (uses stdio and malloc in signal handler) */
	bb_error_msg_and_die("download timed out");
}
#endif
249
250 static FILE *open_socket(len_and_sockaddr *lsa)
251 {
252         int fd;
253         FILE *fp;
254
255         IF_FEATURE_WGET_TIMEOUT(alarm(G.timeout_seconds); G.connecting = 1;)
256         fd = xconnect_stream(lsa);
257         IF_FEATURE_WGET_TIMEOUT(G.connecting = 0;)
258
259         /* glibc 2.4 seems to try seeking on it - ??! */
260         /* hopefully it understands what ESPIPE means... */
261         fp = fdopen(fd, "r+");
262         if (!fp)
263                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
264
265         return fp;
266 }
267
268 /* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
269 /* FIXME: does not respect FEATURE_WGET_TIMEOUT and -T N: */
270 static char fgets_and_trim(FILE *fp)
271 {
272         char c;
273         char *buf_ptr;
274
275         if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
276                 bb_perror_msg_and_die("error getting response");
277
278         buf_ptr = strchrnul(G.wget_buf, '\n');
279         c = *buf_ptr;
280         *buf_ptr = '\0';
281         buf_ptr = strchrnul(G.wget_buf, '\r');
282         *buf_ptr = '\0';
283
284         log_io("< %s", G.wget_buf);
285
286         return c;
287 }
288
289 static int ftpcmd(const char *s1, const char *s2, FILE *fp)
290 {
291         int result;
292         if (s1) {
293                 if (!s2)
294                         s2 = "";
295                 fprintf(fp, "%s%s\r\n", s1, s2);
296                 fflush(fp);
297                 log_io("> %s%s", s1, s2);
298         }
299
300         do {
301                 fgets_and_trim(fp);
302         } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
303
304         G.wget_buf[3] = '\0';
305         result = xatoi_positive(G.wget_buf);
306         G.wget_buf[3] = ' ';
307         return result;
308 }
309
310 static void parse_url(const char *src_url, struct host_info *h)
311 {
312         char *url, *p, *sp;
313
314         free(h->allocated);
315         h->allocated = url = xstrdup(src_url);
316
317         h->protocol = P_FTP;
318         p = strstr(url, "://");
319         if (p) {
320                 *p = '\0';
321                 h->host = p + 3;
322                 if (strcmp(url, P_FTP) == 0) {
323                         h->port = bb_lookup_port(P_FTP, "tcp", 21);
324                 } else
325                 if (strcmp(url, P_HTTPS) == 0) {
326                         h->port = bb_lookup_port(P_HTTPS, "tcp", 443);
327                         h->protocol = P_HTTPS;
328                 } else
329                 if (strcmp(url, P_HTTP) == 0) {
330  http:
331                         h->port = bb_lookup_port(P_HTTP, "tcp", 80);
332                         h->protocol = P_HTTP;
333                 } else {
334                         *p = ':';
335                         bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
336                 }
337         } else {
338                 // GNU wget is user-friendly and falls back to http://
339                 h->host = url;
340                 goto http;
341         }
342
343         // FYI:
344         // "Real" wget 'http://busybox.net?var=a/b' sends this request:
345         //   'GET /?var=a/b HTTP 1.0'
346         //   and saves 'index.html?var=a%2Fb' (we save 'b')
347         // wget 'http://busybox.net?login=john@doe':
348         //   request: 'GET /?login=john@doe HTTP/1.0'
349         //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
350         // wget 'http://busybox.net#test/test':
351         //   request: 'GET / HTTP/1.0'
352         //   saves: 'index.html' (we save 'test')
353         //
354         // We also don't add unique .N suffix if file exists...
355         sp = strchr(h->host, '/');
356         p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
357         p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
358         if (!sp) {
359                 h->path = "";
360         } else if (*sp == '/') {
361                 *sp = '\0';
362                 h->path = sp + 1;
363         } else { // '#' or '?'
364                 // http://busybox.net?login=john@doe is a valid URL
365                 // memmove converts to:
366                 // http:/busybox.nett?login=john@doe...
367                 memmove(h->host - 1, h->host, sp - h->host);
368                 h->host--;
369                 sp[-1] = '\0';
370                 h->path = sp;
371         }
372
373         sp = strrchr(h->host, '@');
374         if (sp != NULL) {
375                 // URL-decode "user:password" string before base64-encoding:
376                 // wget http://test:my%20pass@example.com should send
377                 // Authorization: Basic dGVzdDpteSBwYXNz
378                 // which decodes to "test:my pass".
379                 // Standard wget and curl do this too.
380                 *sp = '\0';
381                 free(h->user);
382                 h->user = xstrdup(percent_decode_in_place(h->host, /*strict:*/ 0));
383                 h->host = sp + 1;
384         }
385         /* else: h->user remains NULL, or as set by original request
386          * before redirect (if we are here after a redirect).
387          */
388 }
389
390 static char *gethdr(FILE *fp)
391 {
392         char *s, *hdrval;
393         int c;
394
395         /* retrieve header line */
396         c = fgets_and_trim(fp);
397
398         /* end of the headers? */
399         if (G.wget_buf[0] == '\0')
400                 return NULL;
401
402         /* convert the header name to lower case */
403         for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.' || *s == '_'; ++s) {
404                 /*
405                  * No-op for 20-3f and 60-7f. "0-9a-z-." are in these ranges.
406                  * 40-5f range ("@A-Z[\]^_") maps to 60-7f.
407                  * "A-Z" maps to "a-z".
408                  * "@[\]" can't occur in header names.
409                  * "^_" maps to "~,DEL" (which is wrong).
410                  * "^" was never seen yet, "_" was seen from web.archive.org
411                  * (x-archive-orig-x_commoncrawl_Signature: HEXSTRING).
412                  */
413                 *s |= 0x20;
414         }
415
416         /* verify we are at the end of the header name */
417         if (*s != ':')
418                 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
419
420         /* locate the start of the header value */
421         *s++ = '\0';
422         hdrval = skip_whitespace(s);
423
424         if (c != '\n') {
425                 /* Rats! The buffer isn't big enough to hold the entire header value */
426                 while (c = getc(fp), c != EOF && c != '\n')
427                         continue;
428         }
429
430         return hdrval;
431 }
432
433 static void reset_beg_range_to_zero(void)
434 {
435         bb_error_msg("restart failed");
436         G.beg_range = 0;
437         xlseek(G.output_fd, 0, SEEK_SET);
438         /* Done at the end instead: */
439         /* ftruncate(G.output_fd, 0); */
440 }
441
442 static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
443 {
444         FILE *sfp;
445         char *str;
446         int port;
447
448         if (!target->user)
449                 target->user = xstrdup("anonymous:busybox@");
450
451         sfp = open_socket(lsa);
452         if (ftpcmd(NULL, NULL, sfp) != 220)
453                 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
454
455         /*
456          * Splitting username:password pair,
457          * trying to log in
458          */
459         str = strchr(target->user, ':');
460         if (str)
461                 *str++ = '\0';
462         switch (ftpcmd("USER ", target->user, sfp)) {
463         case 230:
464                 break;
465         case 331:
466                 if (ftpcmd("PASS ", str, sfp) == 230)
467                         break;
468                 /* fall through (failed login) */
469         default:
470                 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
471         }
472
473         ftpcmd("TYPE I", NULL, sfp);
474
475         /*
476          * Querying file size
477          */
478         if (ftpcmd("SIZE ", target->path, sfp) == 213) {
479                 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
480                 if (G.content_len < 0 || errno) {
481                         bb_error_msg_and_die("SIZE value is garbage");
482                 }
483                 G.got_clen = 1;
484         }
485
486         /*
487          * Entering passive mode
488          */
489         if (ftpcmd("PASV", NULL, sfp) != 227) {
490  pasv_error:
491                 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
492         }
493         // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
494         // Server's IP is N1.N2.N3.N4 (we ignore it)
495         // Server's port for data connection is P1*256+P2
496         str = strrchr(G.wget_buf, ')');
497         if (str) str[0] = '\0';
498         str = strrchr(G.wget_buf, ',');
499         if (!str) goto pasv_error;
500         port = xatou_range(str+1, 0, 255);
501         *str = '\0';
502         str = strrchr(G.wget_buf, ',');
503         if (!str) goto pasv_error;
504         port += xatou_range(str+1, 0, 255) * 256;
505         set_nport(&lsa->u.sa, htons(port));
506
507         *dfpp = open_socket(lsa);
508
509         if (G.beg_range != 0) {
510                 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
511                 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
512                         G.content_len -= G.beg_range;
513                 else
514                         reset_beg_range_to_zero();
515         }
516
517         if (ftpcmd("RETR ", target->path, sfp) > 150)
518                 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
519
520         return sfp;
521 }
522
523 static int spawn_https_helper(const char *host, unsigned port)
524 {
525         char *allocated = NULL;
526         int sp[2];
527         int pid;
528
529         if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) != 0)
530                 /* Kernel can have AF_UNIX support disabled */
531                 bb_perror_msg_and_die("socketpair");
532
533         if (!strchr(host, ':'))
534                 host = allocated = xasprintf("%s:%u", host, port);
535
536         pid = BB_MMU ? xfork() : xvfork();
537         if (pid == 0) {
538                 /* Child */
539                 char *argv[6];
540
541                 close(sp[0]);
542                 xmove_fd(sp[1], 0);
543                 xdup2(0, 1);
544                 /*
545                  * TODO: develop a tiny ssl/tls helper (using matrixssl?),
546                  * try to exec it here before falling back to big fat openssl.
547                  */
548                 /*
549                  * openssl s_client -quiet -connect www.kernel.org:443 2>/dev/null
550                  * It prints some debug stuff on stderr, don't know how to suppress it.
551                  * Work around by dev-nulling stderr. We lose all error messages :(
552                  */
553                 xmove_fd(2, 3);
554                 xopen("/dev/null", O_RDWR);
555                 argv[0] = (char*)"openssl";
556                 argv[1] = (char*)"s_client";
557                 argv[2] = (char*)"-quiet";
558                 argv[3] = (char*)"-connect";
559                 argv[4] = (char*)host;
560                 argv[5] = NULL;
561                 BB_EXECVP(argv[0], argv);
562                 xmove_fd(3, 2);
563                 bb_perror_msg_and_die("can't execute '%s'", argv[0]);
564                 /* notreached */
565         }
566
567         /* Parent */
568         free(allocated);
569         close(sp[1]);
570         return sp[0];
571 }
572
/* See networking/ssl_helper/README */
#define SSL_HELPER 0

#if SSL_HELPER
/*
 * Start "ssl_helper -d3" with the already connected network_fd passed
 * to it as fd 3 and one end of a socketpair as its stdin/stdout.
 * In the parent, network_fd is replaced by the other socketpair end.
 */
static void spawn_https_helper1(int network_fd)
{
	int pair[2];
	int child;

	/* Kernel can have AF_UNIX support disabled */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) != 0)
		bb_perror_msg_and_die("socketpair");

	child = BB_MMU ? xfork() : xvfork();
	if (child == 0) {
		/* Child */
		char *argv[] = {
			(char*)"ssl_helper",
			(char*)"-d3",
			NULL
		};

		close(pair[0]);
		xmove_fd(pair[1], 0);
		xdup2(0, 1);
		xmove_fd(network_fd, 3);
		/* A simple ssl/tls helper */
		BB_EXECVP(argv[0], argv);
		bb_perror_msg_and_die("can't execute '%s'", argv[0]);
		/* notreached */
	}

	/* Parent */
	close(pair[1]);
	xmove_fd(pair[0], network_fd);
}
#endif
611
612 static void NOINLINE retrieve_file_data(FILE *dfp)
613 {
614 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
615 # if ENABLE_FEATURE_WGET_TIMEOUT
616         unsigned second_cnt = G.timeout_seconds;
617 # endif
618         struct pollfd polldata;
619
620         polldata.fd = fileno(dfp);
621         polldata.events = POLLIN | POLLPRI;
622 #endif
623         progress_meter(PROGRESS_START);
624
625         if (G.chunked)
626                 goto get_clen;
627
628         /* Loops only if chunked */
629         while (1) {
630
631 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
632                 /* Must use nonblocking I/O, otherwise fread will loop
633                  * and *block* until it reads full buffer,
634                  * which messes up progress bar and/or timeout logic.
635                  * Because of nonblocking I/O, we need to dance
636                  * very carefully around EAGAIN. See explanation at
637                  * clearerr() calls.
638                  */
639                 ndelay_on(polldata.fd);
640 #endif
641                 while (1) {
642                         int n;
643                         unsigned rdsz;
644
645 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
646                         /* fread internally uses read loop, which in our case
647                          * is usually exited when we get EAGAIN.
648                          * In this case, libc sets error marker on the stream.
649                          * Need to clear it before next fread to avoid possible
650                          * rare false positive ferror below. Rare because usually
651                          * fread gets more than zero bytes, and we don't fall
652                          * into if (n <= 0) ...
653                          */
654                         clearerr(dfp);
655 #endif
656                         errno = 0;
657                         rdsz = sizeof(G.wget_buf);
658                         if (G.got_clen) {
659                                 if (G.content_len < (off_t)sizeof(G.wget_buf)) {
660                                         if ((int)G.content_len <= 0)
661                                                 break;
662                                         rdsz = (unsigned)G.content_len;
663                                 }
664                         }
665                         n = fread(G.wget_buf, 1, rdsz, dfp);
666
667                         if (n > 0) {
668                                 xwrite(G.output_fd, G.wget_buf, n);
669 #if ENABLE_FEATURE_WGET_STATUSBAR
670                                 G.transferred += n;
671 #endif
672                                 if (G.got_clen) {
673                                         G.content_len -= n;
674                                         if (G.content_len == 0)
675                                                 break;
676                                 }
677 #if ENABLE_FEATURE_WGET_TIMEOUT
678                                 second_cnt = G.timeout_seconds;
679 #endif
680                                 goto bump;
681                         }
682
683                         /* n <= 0.
684                          * man fread:
685                          * If error occurs, or EOF is reached, the return value
686                          * is a short item count (or zero).
687                          * fread does not distinguish between EOF and error.
688                          */
689                         if (errno != EAGAIN) {
690                                 if (ferror(dfp)) {
691                                         progress_meter(PROGRESS_END);
692                                         bb_perror_msg_and_die(bb_msg_read_error);
693                                 }
694                                 break; /* EOF, not error */
695                         }
696
697 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
698                         /* It was EAGAIN. There is no data. Wait up to one second
699                          * then abort if timed out, or update the bar and try reading again.
700                          */
701                         if (safe_poll(&polldata, 1, 1000) == 0) {
702 # if ENABLE_FEATURE_WGET_TIMEOUT
703                                 if (second_cnt != 0 && --second_cnt == 0) {
704                                         progress_meter(PROGRESS_END);
705                                         bb_error_msg_and_die("download timed out");
706                                 }
707 # endif
708                                 /* We used to loop back to poll here,
709                                  * but there is no great harm in letting fread
710                                  * to try reading anyway.
711                                  */
712                         }
713 #endif
714  bump:
715                         /* Need to do it _every_ second for "stalled" indicator
716                          * to be shown properly.
717                          */
718                         progress_meter(PROGRESS_BUMP);
719                 } /* while (reading data) */
720
721 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
722                 clearerr(dfp);
723                 ndelay_off(polldata.fd); /* else fgets can get very unhappy */
724 #endif
725                 if (!G.chunked)
726                         break;
727
728                 fgets_and_trim(dfp); /* Eat empty line */
729  get_clen:
730                 fgets_and_trim(dfp);
731                 G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
732                 /* FIXME: error check? */
733                 if (G.content_len == 0)
734                         break; /* all done! */
735                 G.got_clen = 1;
736                 /*
737                  * Note that fgets may result in some data being buffered in dfp.
738                  * We loop back to fread, which will retrieve this data.
739                  * Also note that code has to be arranged so that fread
740                  * is done _before_ one-second poll wait - poll doesn't know
741                  * about stdio buffering and can result in spurious one second waits!
742                  */
743         }
744
745         /* If -c failed, we restart from the beginning,
746          * but we do not truncate file then, we do it only now, at the end.
747          * This lets user to ^C if his 99% complete 10 GB file download
748          * failed to restart *without* losing the almost complete file.
749          */
750         {
751                 off_t pos = lseek(G.output_fd, 0, SEEK_CUR);
752                 if (pos != (off_t)-1)
753                         ftruncate(G.output_fd, pos);
754         }
755
756         /* Draw full bar and free its resources */
757         G.chunked = 0;  /* makes it show 100% even for chunked download */
758         G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
759         progress_meter(PROGRESS_END);
760 }
761
762 static void download_one_url(const char *url)
763 {
764         bool use_proxy;                 /* Use proxies if env vars are set  */
765         int redir_limit;
766         len_and_sockaddr *lsa;
767         FILE *sfp;                      /* socket to web/ftp server         */
768         FILE *dfp;                      /* socket to ftp server (data)      */
769         char *proxy = NULL;
770         char *fname_out_alloc;
771         char *redirected_path = NULL;
772         struct host_info server;
773         struct host_info target;
774
775         server.allocated = NULL;
776         target.allocated = NULL;
777         server.user = NULL;
778         target.user = NULL;
779
780         parse_url(url, &target);
781
782         /* Use the proxy if necessary */
783         use_proxy = (strcmp(G.proxy_flag, "off") != 0);
784         if (use_proxy) {
785                 proxy = getenv(target.protocol == P_FTP ? "ftp_proxy" : "http_proxy");
786 //FIXME: what if protocol is https? Ok to use http_proxy?
787                 use_proxy = (proxy && proxy[0]);
788                 if (use_proxy)
789                         parse_url(proxy, &server);
790         }
791         if (!use_proxy) {
792                 server.port = target.port;
793                 if (ENABLE_FEATURE_IPV6) {
794                         //free(server.allocated); - can't be non-NULL
795                         server.host = server.allocated = xstrdup(target.host);
796                 } else {
797                         server.host = target.host;
798                 }
799         }
800
801         if (ENABLE_FEATURE_IPV6)
802                 strip_ipv6_scope_id(target.host);
803
804         /* If there was no -O FILE, guess output filename */
805         fname_out_alloc = NULL;
806         if (!(option_mask32 & WGET_OPT_OUTNAME)) {
807                 G.fname_out = bb_get_last_path_component_nostrip(target.path);
808                 /* handle "wget http://kernel.org//" */
809                 if (G.fname_out[0] == '/' || !G.fname_out[0])
810                         G.fname_out = (char*)"index.html";
811                 /* -P DIR is considered only if there was no -O FILE */
812                 if (G.dir_prefix)
813                         G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
814                 else {
815                         /* redirects may free target.path later, need to make a copy */
816                         G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
817                 }
818         }
819 #if ENABLE_FEATURE_WGET_STATUSBAR
820         G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
821 #endif
822
823         /* Determine where to start transfer */
824         G.beg_range = 0;
825         if (option_mask32 & WGET_OPT_CONTINUE) {
826                 G.output_fd = open(G.fname_out, O_WRONLY);
827                 if (G.output_fd >= 0) {
828                         G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
829                 }
830                 /* File doesn't exist. We do not create file here yet.
831                  * We are not sure it exists on remote side */
832         }
833
834         redir_limit = 5;
835  resolve_lsa:
836         lsa = xhost2sockaddr(server.host, server.port);
837         if (!(option_mask32 & WGET_OPT_QUIET)) {
838                 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
839                 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
840                 free(s);
841         }
842  establish_session:
843         /*G.content_len = 0; - redundant, got_clen = 0 is enough */
844         G.got_clen = 0;
845         G.chunked = 0;
846         if (use_proxy || target.protocol != P_FTP) {
847                 /*
848                  *  HTTP session
849                  */
850                 char *str;
851                 int status;
852
853                 /* Open socket to http(s) server */
854                 if (target.protocol == P_HTTPS) {
855 /* openssl-based helper
856  * Inconvenient API since we can't give it an open fd
857  */
858                         int fd = spawn_https_helper(server.host, server.port);
859                         sfp = fdopen(fd, "r+");
860                         if (!sfp)
861                                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
862                 } else
863                         sfp = open_socket(lsa);
864 #if SSL_HELPER
865                 if (target.protocol == P_HTTPS)
866                         spawn_https_helper1(fileno(sfp));
867 #endif
868                 /* Send HTTP request */
869                 if (use_proxy) {
870                         SENDFMT(sfp, "GET %s://%s/%s HTTP/1.1\r\n",
871                                 target.protocol, target.host,
872                                 target.path);
873                 } else {
874                         SENDFMT(sfp, "%s /%s HTTP/1.1\r\n",
875                                 (option_mask32 & WGET_OPT_POST_DATA) ? "POST" : "GET",
876                                 target.path);
877                 }
878                 if (!USR_HEADER_HOST)
879                         SENDFMT(sfp, "Host: %s\r\n", target.host);
880                 if (!USR_HEADER_USER_AGENT)
881                         SENDFMT(sfp, "User-Agent: %s\r\n", G.user_agent);
882
883                 /* Ask server to close the connection as soon as we are done
884                  * (IOW: we do not intend to send more requests)
885                  */
886                 SENDFMT(sfp, "Connection: close\r\n");
887
888 #if ENABLE_FEATURE_WGET_AUTHENTICATION
889                 if (target.user && !USR_HEADER_AUTH) {
890                         SENDFMT(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
891                                 base64enc(target.user));
892                 }
893                 if (use_proxy && server.user && !USR_HEADER_PROXY_AUTH) {
894                         SENDFMT(sfp, "Proxy-Authorization: Basic %s\r\n",
895                                 base64enc(server.user));
896                 }
897 #endif
898
899                 if (G.beg_range != 0 && !USR_HEADER_RANGE)
900                         SENDFMT(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
901
902 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
903                 if (G.extra_headers) {
904                         log_io(G.extra_headers);
905                         fputs(G.extra_headers, sfp);
906                 }
907
908                 if (option_mask32 & WGET_OPT_POST_DATA) {
909                         SENDFMT(sfp,
910                                 "Content-Type: application/x-www-form-urlencoded\r\n"
911                                 "Content-Length: %u\r\n"
912                                 "\r\n"
913                                 "%s",
914                                 (int) strlen(G.post_data), G.post_data
915                         );
916                 } else
917 #endif
918                 {
919                         SENDFMT(sfp, "\r\n");
920                 }
921
922                 fflush(sfp);
923
924                 /*
925                  * Retrieve HTTP response line and check for "200" status code.
926                  */
927  read_response:
928                 fgets_and_trim(sfp);
929
930                 str = G.wget_buf;
931                 str = skip_non_whitespace(str);
932                 str = skip_whitespace(str);
933                 // FIXME: no error check
934                 // xatou wouldn't work: "200 OK"
935                 status = atoi(str);
936                 switch (status) {
937                 case 0:
938                 case 100:
939                         while (gethdr(sfp) != NULL)
940                                 /* eat all remaining headers */;
941                         goto read_response;
942                 case 200:
943 /*
944 Response 204 doesn't say "null file", it says "metadata
945 has changed but data didn't":
946
947 "10.2.5 204 No Content
948 The server has fulfilled the request but does not need to return
949 an entity-body, and might want to return updated metainformation.
950 The response MAY include new or updated metainformation in the form
951 of entity-headers, which if present SHOULD be associated with
952 the requested variant.
953
954 If the client is a user agent, it SHOULD NOT change its document
955 view from that which caused the request to be sent. This response
956 is primarily intended to allow input for actions to take place
957 without causing a change to the user agent's active document view,
958 although any new or updated metainformation SHOULD be applied
959 to the document currently in the user agent's active view.
960
961 The 204 response MUST NOT include a message-body, and thus
962 is always terminated by the first empty line after the header fields."
963
964 However, in real world it was observed that some web servers
965 (e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
966 */
967                 case 204:
968                         if (G.beg_range != 0) {
969                                 /* "Range:..." was not honored by the server.
970                                  * Restart download from the beginning.
971                                  */
972                                 reset_beg_range_to_zero();
973                         }
974                         break;
975                 case 300:  /* redirection */
976                 case 301:
977                 case 302:
978                 case 303:
979                         break;
980                 case 206: /* Partial Content */
981                         if (G.beg_range != 0)
982                                 /* "Range:..." worked. Good. */
983                                 break;
984                         /* Partial Content even though we did not ask for it??? */
985                         /* fall through */
986                 default:
987                         bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
988                 }
989
990                 /*
991                  * Retrieve HTTP headers.
992                  */
993                 while ((str = gethdr(sfp)) != NULL) {
994                         static const char keywords[] ALIGN1 =
995                                 "content-length\0""transfer-encoding\0""location\0";
996                         enum {
997                                 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
998                         };
999                         smalluint key;
1000
1001                         /* gethdr converted "FOO:" string to lowercase */
1002
1003                         /* strip trailing whitespace */
1004                         char *s = strchrnul(str, '\0') - 1;
1005                         while (s >= str && (*s == ' ' || *s == '\t')) {
1006                                 *s = '\0';
1007                                 s--;
1008                         }
1009                         key = index_in_strings(keywords, G.wget_buf) + 1;
1010                         if (key == KEY_content_length) {
1011                                 G.content_len = BB_STRTOOFF(str, NULL, 10);
1012                                 if (G.content_len < 0 || errno) {
1013                                         bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
1014                                 }
1015                                 G.got_clen = 1;
1016                                 continue;
1017                         }
1018                         if (key == KEY_transfer_encoding) {
1019                                 if (strcmp(str_tolower(str), "chunked") != 0)
1020                                         bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
1021                                 G.chunked = 1;
1022                         }
1023                         if (key == KEY_location && status >= 300) {
1024                                 if (--redir_limit == 0)
1025                                         bb_error_msg_and_die("too many redirections");
1026                                 fclose(sfp);
1027                                 if (str[0] == '/') {
1028                                         free(redirected_path);
1029                                         target.path = redirected_path = xstrdup(str+1);
1030                                         /* lsa stays the same: it's on the same server */
1031                                 } else {
1032                                         parse_url(str, &target);
1033                                         if (!use_proxy) {
1034                                                 /* server.user remains untouched */
1035                                                 free(server.allocated);
1036                                                 server.allocated = NULL;
1037                                                 server.host = target.host;
1038                                                 /* strip_ipv6_scope_id(target.host); - no! */
1039                                                 /* we assume remote never gives us IPv6 addr with scope id */
1040                                                 server.port = target.port;
1041                                                 free(lsa);
1042                                                 goto resolve_lsa;
1043                                         } /* else: lsa stays the same: we use proxy */
1044                                 }
1045                                 goto establish_session;
1046                         }
1047                 }
1048 //              if (status >= 300)
1049 //                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
1050
1051                 /* For HTTP, data is pumped over the same connection */
1052                 dfp = sfp;
1053         } else {
1054                 /*
1055                  *  FTP session
1056                  */
1057                 sfp = prepare_ftp_session(&dfp, &target, lsa);
1058         }
1059
1060         free(lsa);
1061
1062         if (!(option_mask32 & WGET_OPT_SPIDER)) {
1063                 if (G.output_fd < 0)
1064                         G.output_fd = xopen(G.fname_out, G.o_flags);
1065                 retrieve_file_data(dfp);
1066                 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
1067                         xclose(G.output_fd);
1068                         G.output_fd = -1;
1069                 }
1070         }
1071
1072         if (dfp != sfp) {
1073                 /* It's ftp. Close data connection properly */
1074                 fclose(dfp);
1075                 if (ftpcmd(NULL, NULL, sfp) != 226)
1076                         bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
1077                 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
1078         }
1079         fclose(sfp);
1080
1081         free(server.allocated);
1082         free(target.allocated);
1083         free(server.user);
1084         free(target.user);
1085         free(fname_out_alloc);
1086         free(redirected_path);
1087 }
1088
1089 int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
1090 int wget_main(int argc UNUSED_PARAM, char **argv)
1091 {
1092 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
1093         static const char wget_longopts[] ALIGN1 =
1094                 /* name, has_arg, val */
1095                 "continue\0"         No_argument       "c"
1096 //FIXME: -s isn't --spider, it's --save-headers!
1097                 "spider\0"           No_argument       "s"
1098                 "quiet\0"            No_argument       "q"
1099                 "output-document\0"  Required_argument "O"
1100                 "directory-prefix\0" Required_argument "P"
1101                 "proxy\0"            Required_argument "Y"
1102                 "user-agent\0"       Required_argument "U"
1103 #if ENABLE_FEATURE_WGET_TIMEOUT
1104                 "timeout\0"          Required_argument "T"
1105 #endif
1106                 /* Ignored: */
1107                 // "tries\0"            Required_argument "t"
1108                 /* Ignored (we always use PASV): */
1109                 "passive-ftp\0"      No_argument       "\xff"
1110                 "header\0"           Required_argument "\xfe"
1111                 "post-data\0"        Required_argument "\xfd"
1112                 /* Ignored (we don't do ssl) */
1113                 "no-check-certificate\0" No_argument   "\xfc"
1114                 /* Ignored (we don't support caching) */
1115                 "no-cache\0"         No_argument       "\xfb"
1116                 ;
1117 #endif
1118
1119 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
1120         llist_t *headers_llist = NULL;
1121 #endif
1122
1123         INIT_G();
1124
1125 #if ENABLE_FEATURE_WGET_TIMEOUT
1126         G.timeout_seconds = 900;
1127         signal(SIGALRM, alarm_handler);
1128 #endif
1129         G.proxy_flag = "on";   /* use proxies if env vars are set */
1130         G.user_agent = "Wget"; /* "User-Agent" header field */
1131
1132 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
1133         applet_long_options = wget_longopts;
1134 #endif
1135         opt_complementary = "-1"
1136                         IF_FEATURE_WGET_TIMEOUT(":T+")
1137                         IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
1138         getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
1139                 &G.fname_out, &G.dir_prefix,
1140                 &G.proxy_flag, &G.user_agent,
1141                 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
1142                 NULL /* -t RETRIES */
1143                 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
1144                 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
1145         );
1146         argv += optind;
1147
1148 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
1149         if (headers_llist) {
1150                 int size = 0;
1151                 char *hdr;
1152                 llist_t *ll = headers_llist;
1153                 while (ll) {
1154                         size += strlen(ll->data) + 2;
1155                         ll = ll->link;
1156                 }
1157                 G.extra_headers = hdr = xmalloc(size + 1);
1158                 while (headers_llist) {
1159                         int bit;
1160                         const char *words;
1161
1162                         size = sprintf(hdr, "%s\r\n",
1163                                         (char*)llist_pop(&headers_llist));
1164                         /* a bit like index_in_substrings but don't match full key */
1165                         bit = 1;
1166                         words = wget_user_headers;
1167                         while (*words) {
1168                                 if (strstr(hdr, words) == hdr) {
1169                                         G.user_headers |= bit;
1170                                         break;
1171                                 }
1172                                 bit <<= 1;
1173                                 words += strlen(words) + 1;
1174                         }
1175                         hdr += size;
1176                 }
1177         }
1178 #endif
1179
1180         G.output_fd = -1;
1181         G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
1182         if (G.fname_out) { /* -O FILE ? */
1183                 if (LONE_DASH(G.fname_out)) { /* -O - ? */
1184                         G.output_fd = 1;
1185                         option_mask32 &= ~WGET_OPT_CONTINUE;
1186                 }
1187                 /* compat with wget: -O FILE can overwrite */
1188                 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
1189         }
1190
1191         while (*argv)
1192                 download_one_url(*argv++);
1193
1194         if (G.output_fd >= 0)
1195                 xclose(G.output_fd);
1196
1197 #if ENABLE_FEATURE_CLEAN_UP && ENABLE_FEATURE_WGET_LONG_OPTIONS
1198         free(G.extra_headers);
1199 #endif
1200         FINI_G();
1201
1202         return EXIT_SUCCESS;
1203 }