3a4be98783380e5790788302c0d9e80df9f23a8d
[oweals/busybox.git] / networking / wget.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wget - retrieve a file using HTTP or FTP
4  *
5  * Chip Rosenthal Covad Communications <chip@laserlink.net>
6  * Licensed under GPLv2, see file LICENSE in this source tree.
7  *
8  * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
9  * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
10  */
11
12 //usage:#define wget_trivial_usage
13 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
14 //usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
15 //usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
16 //usage:       "        [--no-check-certificate] [-U|--user-agent AGENT]"
17 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
18 //usage:        )
19 //usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
20 //usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
21 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
22 //usage:        )
23 //usage:#define wget_full_usage "\n\n"
24 //usage:       "Retrieve files via HTTP or FTP\n"
25 //usage:     "\nOptions:"
26 //usage:     "\n        -s      Spider mode - only check file existence"
27 //usage:     "\n        -c      Continue retrieval of aborted transfer"
28 //usage:     "\n        -q      Quiet"
29 //usage:     "\n        -P DIR  Save to DIR (default .)"
30 //usage:        IF_FEATURE_WGET_TIMEOUT(
31 //usage:     "\n        -T SEC  Network read timeout is SEC seconds"
32 //usage:        )
33 //usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
34 //usage:     "\n        -U STR  Use STR for User-Agent header"
35 //usage:     "\n        -Y      Use proxy ('on' or 'off')"
36
37 #include "libbb.h"
38
/* Protocol trace hook. It is compiled out by default; to see every line
 * sent to and received from the server, enable the bb_error_msg variant
 * below instead of the no-op one. */
//#define log_io(...) bb_error_msg(__VA_ARGS__)
#define log_io(...) ((void)0)
41
42
/* One URL split into components by parse_url(). */
struct host_info {
        char *allocated;        /* private copy of the URL; freed and replaced by parse_url() */
        const char *path;       /* text after the host part (no leading '/'), or "" if none */
        const char *user;       /* "user[:password]" from before '@'; parse_url() leaves it untouched when the URL has none */
        char       *host;       /* host text as written in the URL; a ":port" suffix is not cut off */
        int         port;       /* scheme default obtained via bb_lookup_port() (fallback 80 or 21) */
        smallint    is_ftp;     /* 1 for "ftp://", 0 for "http://" */
};
51
52
/* Globals */
/* All applet state lives in one zero-initialized heap block (see INIT_G)
 * and is reached through the G macro. */
struct globals {
        off_t content_len;        /* Content-length of the file; counted down as data is read */
        off_t beg_range;          /* Range at which continue begins */
#if ENABLE_FEATURE_WGET_STATUSBAR
        off_t transferred;        /* Number of bytes transferred so far */
        const char *curfile;      /* Name of current file being transferred */
        bb_progress_t pmt;        /* state handed to the bb_progress_* helpers */
#endif
        char *dir_prefix;         /* prepended to the guessed output name when no -O was given */
#if ENABLE_FEATURE_WGET_LONG_OPTIONS
        char *post_data;          /* request body; its presence turns the request into a POST */
        char *extra_headers;      /* written verbatim into the HTTP request */
#endif
        char *fname_out;        /* where to direct output (-O) */
        const char *proxy_flag; /* Use proxies if env vars are set; "off" disables */
        const char *user_agent; /* "User-Agent" header field */
#if ENABLE_FEATURE_WGET_TIMEOUT
        unsigned timeout_seconds; /* consecutive idle seconds tolerated while reading data; 0 = no limit */
#endif
        int output_fd;            /* descriptor the downloaded data is written to */
        int o_flags;              /* open flags used when the output file is created */
        smallint chunked;         /* chunked transfer encoding */
        smallint got_clen;        /* got content-length: from server  */
        /* Local downloads do benefit from big buffer.
         * With 512 byte buffer, it was measured to be
         * an order of magnitude slower than with big one.
         */
        uint64_t just_to_align_next_member;
        /* Shared scratch area: holds response/header lines as well as file data */
        char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
} FIX_ALIASING;
#define G (*ptr_to_globals)
/* Allocate the zeroed globals block; the read timeout defaults to 900 seconds */
#define INIT_G() do { \
        SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
        IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;) \
} while (0)
89
90
/* Must match option string! */
/* Bits tested against option_mask32. The two long-option-only flags are
 * multiplied by ENABLE_FEATURE_WGET_LONG_OPTIONS, so they become 0 (and any
 * test against them is constant-false) when that feature is disabled. */
enum {
        WGET_OPT_CONTINUE   = (1 << 0),
        WGET_OPT_SPIDER     = (1 << 1),
        WGET_OPT_QUIET      = (1 << 2),
        WGET_OPT_OUTNAME    = (1 << 3),
        WGET_OPT_PREFIX     = (1 << 4),
        WGET_OPT_PROXY      = (1 << 5),
        WGET_OPT_USER_AGENT = (1 << 6),
        WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
        WGET_OPT_RETRIES    = (1 << 8),
        WGET_OPT_PASSIVE    = (1 << 9),
        WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
        WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
};
106
/* Values for the 'flag' argument of progress_meter() */
enum {
        PROGRESS_START = -1,    /* initialize the meter, then draw it */
        PROGRESS_END   = 0,     /* draw once more, free the meter, end the line */
        PROGRESS_BUMP  = 1,     /* just redraw */
};
112 #if ENABLE_FEATURE_WGET_STATUSBAR
113 static void progress_meter(int flag)
114 {
115         if (option_mask32 & WGET_OPT_QUIET)
116                 return;
117
118         if (flag == PROGRESS_START)
119                 bb_progress_init(&G.pmt, G.curfile);
120
121         bb_progress_update(&G.pmt,
122                         G.beg_range,
123                         G.transferred,
124                         (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
125         );
126
127         if (flag == PROGRESS_END) {
128                 bb_progress_free(&G.pmt);
129                 bb_putchar_stderr('\n');
130                 G.transferred = 0;
131         }
132 }
133 #else
134 static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
135 #endif
136
137
/* IPv6 knows scoped address types i.e. link and site local addresses. Link
 * local addresses can have a scope identifier to specify the
 * interface/link an address is valid on (e.g. fe80::1%eth0). This scope
 * identifier is only valid on a single node.
 *
 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 * unless all nodes agree on the semantic. Apache e.g. regards zone identifiers
 * in the Host header as invalid requests, see
 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 *
 * Edits 'host' in place: "[fe80::1%eth0]:80" becomes "[fe80::1]:80".
 * Anything that is not a bracketed address with a '%' inside the
 * brackets is left untouched.
 */
static void strip_ipv6_scope_id(char *host)
{
        char *scope, *cp;

        /* bbox wget actually handles IPv6 addresses without [], like
         * wget "http://::1/xxx", but this is not standard.
         * To save code, _here_ we do not support it. */

        if (host[0] != '[')
                return; /* not IPv6 */

        scope = strchr(host, '%');
        if (!scope)
                return;

        /* Remove the IPv6 zone identifier from the host address */
        cp = strchr(host, ']');
        if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
                /* malformed address (not "[xx]:nn" or "[xx]") */
                return;
        }

        /* A '%' that comes after the ']' (e.g. "[::1]:%41") is not a zone
         * identifier. Shifting the tail "down" onto it would move data
         * upwards instead and run past the end of the string. */
        if (scope > cp)
                return;

        /* cp points to "]...", scope points to "%eth0]..." */
        memmove(scope, cp, strlen(cp) + 1);
}
173
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/*
 * Base64-encode 'str' into the shared G.wget_buf and return that buffer.
 * The result is only valid until the next user of G.wget_buf.
 */
static char *base64enc(const char *str)
{
        /* paranoia: never encode more than fits into the buffer */
        const unsigned limit = sizeof(G.wget_buf)/4*3 - 10;
        unsigned src_len;

        src_len = strlen(str);
        if (src_len > limit)
                src_len = limit;

        bb_uuencode(G.wget_buf, str, src_len, bb_uuenc_tbl_base64);
        return G.wget_buf;
}
#endif
185
/*
 * Cut 's' at its first byte below ' ' (control characters; the
 * terminating NUL counts too) so that it is safe to print.
 * Bytes with the high bit set are kept. Returns 's'.
 */
static char* sanitize_string(char *s)
{
        unsigned char *cut;

        /* unsigned view: bytes >= 0x80 must compare as "above space" */
        for (cut = (unsigned char *) s; *cut >= ' '; cut++)
                continue;
        *cut = '\0';

        return s;
}
194
195 static FILE *open_socket(len_and_sockaddr *lsa)
196 {
197         FILE *fp;
198
199         /* glibc 2.4 seems to try seeking on it - ??! */
200         /* hopefully it understands what ESPIPE means... */
201         fp = fdopen(xconnect_stream(lsa), "r+");
202         if (fp == NULL)
203                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
204
205         return fp;
206 }
207
208 /* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
209 static char fgets_and_trim(FILE *fp)
210 {
211         char c;
212         char *buf_ptr;
213
214         if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
215                 bb_perror_msg_and_die("error getting response");
216
217         buf_ptr = strchrnul(G.wget_buf, '\n');
218         c = *buf_ptr;
219         *buf_ptr = '\0';
220         buf_ptr = strchrnul(G.wget_buf, '\r');
221         *buf_ptr = '\0';
222
223         log_io("< %s", G.wget_buf);
224
225         return c;
226 }
227
228 static int ftpcmd(const char *s1, const char *s2, FILE *fp)
229 {
230         int result;
231         if (s1) {
232                 if (!s2)
233                         s2 = "";
234                 fprintf(fp, "%s%s\r\n", s1, s2);
235                 fflush(fp);
236                 log_io("> %s%s", s1, s2);
237         }
238
239         do {
240                 fgets_and_trim(fp);
241         } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
242
243         G.wget_buf[3] = '\0';
244         result = xatoi_positive(G.wget_buf);
245         G.wget_buf[3] = ' ';
246         return result;
247 }
248
249 static void parse_url(const char *src_url, struct host_info *h)
250 {
251         char *url, *p, *sp;
252
253         free(h->allocated);
254         h->allocated = url = xstrdup(src_url);
255
256         if (strncmp(url, "http://", 7) == 0) {
257                 h->port = bb_lookup_port("http", "tcp", 80);
258                 h->host = url + 7;
259                 h->is_ftp = 0;
260         } else if (strncmp(url, "ftp://", 6) == 0) {
261                 h->port = bb_lookup_port("ftp", "tcp", 21);
262                 h->host = url + 6;
263                 h->is_ftp = 1;
264         } else
265                 bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
266
267         // FYI:
268         // "Real" wget 'http://busybox.net?var=a/b' sends this request:
269         //   'GET /?var=a/b HTTP 1.0'
270         //   and saves 'index.html?var=a%2Fb' (we save 'b')
271         // wget 'http://busybox.net?login=john@doe':
272         //   request: 'GET /?login=john@doe HTTP/1.0'
273         //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
274         // wget 'http://busybox.net#test/test':
275         //   request: 'GET / HTTP/1.0'
276         //   saves: 'index.html' (we save 'test')
277         //
278         // We also don't add unique .N suffix if file exists...
279         sp = strchr(h->host, '/');
280         p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
281         p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
282         if (!sp) {
283                 h->path = "";
284         } else if (*sp == '/') {
285                 *sp = '\0';
286                 h->path = sp + 1;
287         } else { // '#' or '?'
288                 // http://busybox.net?login=john@doe is a valid URL
289                 // memmove converts to:
290                 // http:/busybox.nett?login=john@doe...
291                 memmove(h->host - 1, h->host, sp - h->host);
292                 h->host--;
293                 sp[-1] = '\0';
294                 h->path = sp;
295         }
296
297         // We used to set h->user to NULL here, but this interferes
298         // with handling of code 302 ("object was moved")
299
300         sp = strrchr(h->host, '@');
301         if (sp != NULL) {
302                 h->user = h->host;
303                 *sp = '\0';
304                 h->host = sp + 1;
305         }
306
307         sp = h->host;
308 }
309
310 static char *gethdr(FILE *fp)
311 {
312         char *s, *hdrval;
313         int c;
314
315         /* *istrunc = 0; */
316
317         /* retrieve header line */
318         c = fgets_and_trim(fp);
319
320         /* end of the headers? */
321         if (G.wget_buf[0] == '\0')
322                 return NULL;
323
324         /* convert the header name to lower case */
325         for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.'; ++s) {
326                 /* tolower for "A-Z", no-op for "0-9a-z-." */
327                 *s |= 0x20;
328         }
329
330         /* verify we are at the end of the header name */
331         if (*s != ':')
332                 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
333
334         /* locate the start of the header value */
335         *s++ = '\0';
336         hdrval = skip_whitespace(s);
337
338         if (c != '\n') {
339                 /* Rats! The buffer isn't big enough to hold the entire header value */
340                 while (c = getc(fp), c != EOF && c != '\n')
341                         continue;
342         }
343
344         return hdrval;
345 }
346
347 static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
348 {
349         FILE *sfp;
350         char *str;
351         int port;
352
353         if (!target->user)
354                 target->user = xstrdup("anonymous:busybox@");
355
356         sfp = open_socket(lsa);
357         if (ftpcmd(NULL, NULL, sfp) != 220)
358                 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
359
360         /*
361          * Splitting username:password pair,
362          * trying to log in
363          */
364         str = strchr(target->user, ':');
365         if (str)
366                 *str++ = '\0';
367         switch (ftpcmd("USER ", target->user, sfp)) {
368         case 230:
369                 break;
370         case 331:
371                 if (ftpcmd("PASS ", str, sfp) == 230)
372                         break;
373                 /* fall through (failed login) */
374         default:
375                 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
376         }
377
378         ftpcmd("TYPE I", NULL, sfp);
379
380         /*
381          * Querying file size
382          */
383         if (ftpcmd("SIZE ", target->path, sfp) == 213) {
384                 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
385                 if (G.content_len < 0 || errno) {
386                         bb_error_msg_and_die("SIZE value is garbage");
387                 }
388                 G.got_clen = 1;
389         }
390
391         /*
392          * Entering passive mode
393          */
394         if (ftpcmd("PASV", NULL, sfp) != 227) {
395  pasv_error:
396                 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
397         }
398         // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
399         // Server's IP is N1.N2.N3.N4 (we ignore it)
400         // Server's port for data connection is P1*256+P2
401         str = strrchr(G.wget_buf, ')');
402         if (str) str[0] = '\0';
403         str = strrchr(G.wget_buf, ',');
404         if (!str) goto pasv_error;
405         port = xatou_range(str+1, 0, 255);
406         *str = '\0';
407         str = strrchr(G.wget_buf, ',');
408         if (!str) goto pasv_error;
409         port += xatou_range(str+1, 0, 255) * 256;
410         set_nport(&lsa->u.sa, htons(port));
411
412         *dfpp = open_socket(lsa);
413
414         if (G.beg_range) {
415                 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
416                 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
417                         G.content_len -= G.beg_range;
418         }
419
420         if (ftpcmd("RETR ", target->path, sfp) > 150)
421                 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
422
423         return sfp;
424 }
425
426 static void NOINLINE retrieve_file_data(FILE *dfp)
427 {
428 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
429 # if ENABLE_FEATURE_WGET_TIMEOUT
430         unsigned second_cnt;
431 # endif
432         struct pollfd polldata;
433
434         polldata.fd = fileno(dfp);
435         polldata.events = POLLIN | POLLPRI;
436 #endif
437         progress_meter(PROGRESS_START);
438
439         if (G.chunked)
440                 goto get_clen;
441
442         /* Loops only if chunked */
443         while (1) {
444
445 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
446                 /* Must use nonblocking I/O, otherwise fread will loop
447                  * and *block* until it reads full buffer,
448                  * which messes up progress bar and/or timeout logic.
449                  * Because of nonblocking I/O, we need to dance
450                  * very carefully around EAGAIN. See explanation at
451                  * clearerr() call.
452                  */
453                 ndelay_on(polldata.fd);
454 #endif
455                 while (1) {
456                         int n;
457                         unsigned rdsz;
458
459                         rdsz = sizeof(G.wget_buf);
460                         if (G.got_clen) {
461                                 if (G.content_len < (off_t)sizeof(G.wget_buf)) {
462                                         if ((int)G.content_len <= 0)
463                                                 break;
464                                         rdsz = (unsigned)G.content_len;
465                                 }
466                         }
467
468 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
469 # if ENABLE_FEATURE_WGET_TIMEOUT
470                         second_cnt = G.timeout_seconds;
471 # endif
472                         while (1) {
473                                 if (safe_poll(&polldata, 1, 1000) != 0)
474                                         break; /* error, EOF, or data is available */
475 # if ENABLE_FEATURE_WGET_TIMEOUT
476                                 if (second_cnt != 0 && --second_cnt == 0) {
477                                         progress_meter(PROGRESS_END);
478                                         bb_error_msg_and_die("download timed out");
479                                 }
480 # endif
481                                 /* Needed for "stalled" indicator */
482                                 progress_meter(PROGRESS_BUMP);
483                         }
484
485                         /* fread internally uses read loop, which in our case
486                          * is usually exited when we get EAGAIN.
487                          * In this case, libc sets error marker on the stream.
488                          * Need to clear it before next fread to avoid possible
489                          * rare false positive ferror below. Rare because usually
490                          * fread gets more than zero bytes, and we don't fall
491                          * into if (n <= 0) ...
492                          */
493                         clearerr(dfp);
494                         errno = 0;
495 #endif
496                         n = fread(G.wget_buf, 1, rdsz, dfp);
497                         /* man fread:
498                          * If error occurs, or EOF is reached, the return value
499                          * is a short item count (or zero).
500                          * fread does not distinguish between EOF and error.
501                          */
502                         if (n <= 0) {
503 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
504                                 if (errno == EAGAIN) /* poll lied, there is no data? */
505                                         continue; /* yes */
506 #endif
507                                 if (ferror(dfp))
508                                         bb_perror_msg_and_die(bb_msg_read_error);
509                                 break; /* EOF, not error */
510                         }
511
512                         xwrite(G.output_fd, G.wget_buf, n);
513
514 #if ENABLE_FEATURE_WGET_STATUSBAR
515                         G.transferred += n;
516                         progress_meter(PROGRESS_BUMP);
517 #endif
518                         if (G.got_clen) {
519                                 G.content_len -= n;
520                                 if (G.content_len == 0)
521                                         break;
522                         }
523                 }
524 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
525                 clearerr(dfp);
526                 ndelay_off(polldata.fd); /* else fgets can get very unhappy */
527 #endif
528                 if (!G.chunked)
529                         break;
530
531                 fgets_and_trim(dfp); /* Eat empty line */
532  get_clen:
533                 fgets_and_trim(dfp);
534                 G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
535                 /* FIXME: error check? */
536                 if (G.content_len == 0)
537                         break; /* all done! */
538                 G.got_clen = 1;
539         }
540
541         /* Draw full bar and free its resources */
542         G.chunked = 0;  /* makes it show 100% even for chunked download */
543         G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
544         progress_meter(PROGRESS_END);
545 }
546
547 static void download_one_url(const char *url)
548 {
549         bool use_proxy;                 /* Use proxies if env vars are set  */
550         int redir_limit;
551         len_and_sockaddr *lsa;
552         FILE *sfp;                      /* socket to web/ftp server         */
553         FILE *dfp;                      /* socket to ftp server (data)      */
554         char *proxy = NULL;
555         char *fname_out_alloc;
556         struct host_info server;
557         struct host_info target;
558
559         server.allocated = NULL;
560         target.allocated = NULL;
561         server.user = NULL;
562         target.user = NULL;
563
564         parse_url(url, &target);
565
566         /* Use the proxy if necessary */
567         use_proxy = (strcmp(G.proxy_flag, "off") != 0);
568         if (use_proxy) {
569                 proxy = getenv(target.is_ftp ? "ftp_proxy" : "http_proxy");
570                 use_proxy = (proxy && proxy[0]);
571                 if (use_proxy)
572                         parse_url(proxy, &server);
573         }
574         if (!use_proxy) {
575                 server.port = target.port;
576                 if (ENABLE_FEATURE_IPV6) {
577                         //free(server.allocated); - can't be non-NULL
578                         server.host = server.allocated = xstrdup(target.host);
579                 } else {
580                         server.host = target.host;
581                 }
582         }
583
584         if (ENABLE_FEATURE_IPV6)
585                 strip_ipv6_scope_id(target.host);
586
587         /* If there was no -O FILE, guess output filename */
588         fname_out_alloc = NULL;
589         if (!(option_mask32 & WGET_OPT_OUTNAME)) {
590                 G.fname_out = bb_get_last_path_component_nostrip(target.path);
591                 /* handle "wget http://kernel.org//" */
592                 if (G.fname_out[0] == '/' || !G.fname_out[0])
593                         G.fname_out = (char*)"index.html";
594                 /* -P DIR is considered only if there was no -O FILE */
595                 else {
596                         if (G.dir_prefix)
597                                 G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
598                         else {
599                                 /* redirects may free target.path later, need to make a copy */
600                                 G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
601                         }
602                 }
603         }
604 #if ENABLE_FEATURE_WGET_STATUSBAR
605         G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
606 #endif
607
608         /* Determine where to start transfer */
609         G.beg_range = 0;
610         if (option_mask32 & WGET_OPT_CONTINUE) {
611                 G.output_fd = open(G.fname_out, O_WRONLY);
612                 if (G.output_fd >= 0) {
613                         G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
614                 }
615                 /* File doesn't exist. We do not create file here yet.
616                  * We are not sure it exists on remote side */
617         }
618
619         redir_limit = 5;
620  resolve_lsa:
621         lsa = xhost2sockaddr(server.host, server.port);
622         if (!(option_mask32 & WGET_OPT_QUIET)) {
623                 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
624                 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
625                 free(s);
626         }
627  establish_session:
628         /*G.content_len = 0; - redundant, got_clen = 0 is enough */
629         G.got_clen = 0;
630         G.chunked = 0;
631         if (use_proxy || !target.is_ftp) {
632                 /*
633                  *  HTTP session
634                  */
635                 char *str;
636                 int status;
637
638
639                 /* Open socket to http server */
640                 sfp = open_socket(lsa);
641
642                 /* Send HTTP request */
643                 if (use_proxy) {
644                         fprintf(sfp, "GET %stp://%s/%s HTTP/1.1\r\n",
645                                 target.is_ftp ? "f" : "ht", target.host,
646                                 target.path);
647                 } else {
648                         if (option_mask32 & WGET_OPT_POST_DATA)
649                                 fprintf(sfp, "POST /%s HTTP/1.1\r\n", target.path);
650                         else
651                                 fprintf(sfp, "GET /%s HTTP/1.1\r\n", target.path);
652                 }
653
654                 fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
655                         target.host, G.user_agent);
656
657                 /* Ask server to close the connection as soon as we are done
658                  * (IOW: we do not intend to send more requests)
659                  */
660                 fprintf(sfp, "Connection: close\r\n");
661
662 #if ENABLE_FEATURE_WGET_AUTHENTICATION
663                 if (target.user) {
664                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
665                                 base64enc(target.user));
666                 }
667                 if (use_proxy && server.user) {
668                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
669                                 base64enc(server.user));
670                 }
671 #endif
672
673                 if (G.beg_range)
674                         fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
675
676 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
677                 if (G.extra_headers)
678                         fputs(G.extra_headers, sfp);
679
680                 if (option_mask32 & WGET_OPT_POST_DATA) {
681                         fprintf(sfp,
682                                 "Content-Type: application/x-www-form-urlencoded\r\n"
683                                 "Content-Length: %u\r\n"
684                                 "\r\n"
685                                 "%s",
686                                 (int) strlen(G.post_data), G.post_data
687                         );
688                 } else
689 #endif
690                 {
691                         fprintf(sfp, "\r\n");
692                 }
693
694                 fflush(sfp);
695
696                 /*
697                  * Retrieve HTTP response line and check for "200" status code.
698                  */
699  read_response:
700                 fgets_and_trim(sfp);
701
702                 str = G.wget_buf;
703                 str = skip_non_whitespace(str);
704                 str = skip_whitespace(str);
705                 // FIXME: no error check
706                 // xatou wouldn't work: "200 OK"
707                 status = atoi(str);
708                 switch (status) {
709                 case 0:
710                 case 100:
711                         while (gethdr(sfp) != NULL)
712                                 /* eat all remaining headers */;
713                         goto read_response;
714                 case 200:
715 /*
716 Response 204 doesn't say "null file", it says "metadata
717 has changed but data didn't":
718
719 "10.2.5 204 No Content
720 The server has fulfilled the request but does not need to return
721 an entity-body, and might want to return updated metainformation.
722 The response MAY include new or updated metainformation in the form
723 of entity-headers, which if present SHOULD be associated with
724 the requested variant.
725
726 If the client is a user agent, it SHOULD NOT change its document
727 view from that which caused the request to be sent. This response
728 is primarily intended to allow input for actions to take place
729 without causing a change to the user agent's active document view,
730 although any new or updated metainformation SHOULD be applied
731 to the document currently in the user agent's active view.
732
733 The 204 response MUST NOT include a message-body, and thus
734 is always terminated by the first empty line after the header fields."
735
736 However, in real world it was observed that some web servers
737 (e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
738 */
739                 case 204:
740                         break;
741                 case 300:  /* redirection */
742                 case 301:
743                 case 302:
744                 case 303:
745                         break;
746                 case 206:
747                         if (G.beg_range)
748                                 break;
749                         /* fall through */
750                 default:
751                         bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
752                 }
753
754                 /*
755                  * Retrieve HTTP headers.
756                  */
757                 while ((str = gethdr(sfp)) != NULL) {
758                         static const char keywords[] ALIGN1 =
759                                 "content-length\0""transfer-encoding\0""location\0";
760                         enum {
761                                 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
762                         };
763                         smalluint key;
764
765                         /* gethdr converted "FOO:" string to lowercase */
766
767                         /* strip trailing whitespace */
768                         char *s = strchrnul(str, '\0') - 1;
769                         while (s >= str && (*s == ' ' || *s == '\t')) {
770                                 *s = '\0';
771                                 s--;
772                         }
773                         key = index_in_strings(keywords, G.wget_buf) + 1;
774                         if (key == KEY_content_length) {
775                                 G.content_len = BB_STRTOOFF(str, NULL, 10);
776                                 if (G.content_len < 0 || errno) {
777                                         bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
778                                 }
779                                 G.got_clen = 1;
780                                 continue;
781                         }
782                         if (key == KEY_transfer_encoding) {
783                                 if (strcmp(str_tolower(str), "chunked") != 0)
784                                         bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
785                                 G.chunked = 1;
786                         }
787                         if (key == KEY_location && status >= 300) {
788                                 if (--redir_limit == 0)
789                                         bb_error_msg_and_die("too many redirections");
790                                 fclose(sfp);
791                                 if (str[0] == '/') {
792                                         free(target.allocated);
793                                         target.path = target.allocated = xstrdup(str+1);
794                                         /* lsa stays the same: it's on the same server */
795                                 } else {
796                                         parse_url(str, &target);
797                                         if (!use_proxy) {
798                                                 free(server.allocated);
799                                                 server.allocated = NULL;
800                                                 server.host = target.host;
801                                                 /* strip_ipv6_scope_id(target.host); - no! */
802                                                 /* we assume remote never gives us IPv6 addr with scope id */
803                                                 server.port = target.port;
804                                                 free(lsa);
805                                                 goto resolve_lsa;
806                                         } /* else: lsa stays the same: we use proxy */
807                                 }
808                                 goto establish_session;
809                         }
810                 }
811 //              if (status >= 300)
812 //                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
813
814                 /* For HTTP, data is pumped over the same connection */
815                 dfp = sfp;
816
817         } else {
818                 /*
819                  *  FTP session
820                  */
821                 sfp = prepare_ftp_session(&dfp, &target, lsa);
822         }
823
824         free(lsa);
825
826         if (!(option_mask32 & WGET_OPT_SPIDER)) {
827                 if (G.output_fd < 0)
828                         G.output_fd = xopen(G.fname_out, G.o_flags);
829                 retrieve_file_data(dfp);
830                 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
831                         xclose(G.output_fd);
832                         G.output_fd = -1;
833                 }
834         }
835
836         if (dfp != sfp) {
837                 /* It's ftp. Close data connection properly */
838                 fclose(dfp);
839                 if (ftpcmd(NULL, NULL, sfp) != 226)
840                         bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
841                 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
842         }
843         fclose(sfp);
844
845         free(server.allocated);
846         free(target.allocated);
847         free(fname_out_alloc);
848 }
849
850 int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
851 int wget_main(int argc UNUSED_PARAM, char **argv)
852 {
853 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
854         static const char wget_longopts[] ALIGN1 =
855                 /* name, has_arg, val */
856                 "continue\0"         No_argument       "c"
857 //FIXME: -s isn't --spider, it's --save-headers!
858                 "spider\0"           No_argument       "s"
859                 "quiet\0"            No_argument       "q"
860                 "output-document\0"  Required_argument "O"
861                 "directory-prefix\0" Required_argument "P"
862                 "proxy\0"            Required_argument "Y"
863                 "user-agent\0"       Required_argument "U"
864 #if ENABLE_FEATURE_WGET_TIMEOUT
865                 "timeout\0"          Required_argument "T"
866 #endif
867                 /* Ignored: */
868                 // "tries\0"            Required_argument "t"
869                 /* Ignored (we always use PASV): */
870                 "passive-ftp\0"      No_argument       "\xff"
871                 "header\0"           Required_argument "\xfe"
872                 "post-data\0"        Required_argument "\xfd"
873                 /* Ignored (we don't do ssl) */
874                 "no-check-certificate\0" No_argument   "\xfc"
875                 ;
876 #endif
877
878 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
879         llist_t *headers_llist = NULL;
880 #endif
881
882         INIT_G();
883
884         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;)
885         G.proxy_flag = "on";   /* use proxies if env vars are set */
886         G.user_agent = "Wget"; /* "User-Agent" header field */
887
888 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
889         applet_long_options = wget_longopts;
890 #endif
891         opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
892         getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
893                 &G.fname_out, &G.dir_prefix,
894                 &G.proxy_flag, &G.user_agent,
895                 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
896                 NULL /* -t RETRIES */
897                 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
898                 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
899         );
900         argv += optind;
901
902 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
903         if (headers_llist) {
904                 int size = 1;
905                 char *cp;
906                 llist_t *ll = headers_llist;
907                 while (ll) {
908                         size += strlen(ll->data) + 2;
909                         ll = ll->link;
910                 }
911                 G.extra_headers = cp = xmalloc(size);
912                 while (headers_llist) {
913                         cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
914                 }
915         }
916 #endif
917
918         G.output_fd = -1;
919         G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
920         if (G.fname_out) { /* -O FILE ? */
921                 if (LONE_DASH(G.fname_out)) { /* -O - ? */
922                         G.output_fd = 1;
923                         option_mask32 &= ~WGET_OPT_CONTINUE;
924                 }
925                 /* compat with wget: -O FILE can overwrite */
926                 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
927         }
928
929         while (*argv)
930                 download_one_url(*argv++);
931
932         if (G.output_fd >= 0)
933                 xclose(G.output_fd);
934
935         return EXIT_SUCCESS;
936 }