7ca947aec1a4c5609c5f5b7f3fc91479038d8daf
[oweals/busybox.git] / networking / wget.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wget - retrieve a file using HTTP or FTP
4  *
5  * Chip Rosenthal Covad Communications <chip@laserlink.net>
6  * Licensed under GPLv2, see file LICENSE in this source tree.
7  *
8  * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
9  * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
10  */
11
12 //usage:#define wget_trivial_usage
13 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
14 //usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
15 //usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
16 /* Since we ignore these opts, we don't show them in --help */
17 /* //usage:    "        [--no-check-certificate] [--no-cache]" */
18 //usage:       "        [-U|--user-agent AGENT]" IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
19 //usage:        )
20 //usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
21 //usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
22 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
23 //usage:        )
24 //usage:#define wget_full_usage "\n\n"
25 //usage:       "Retrieve files via HTTP or FTP\n"
26 //usage:     "\n        -s      Spider mode - only check file existence"
27 //usage:     "\n        -c      Continue retrieval of aborted transfer"
28 //usage:     "\n        -q      Quiet"
29 //usage:     "\n        -P DIR  Save to DIR (default .)"
30 //usage:        IF_FEATURE_WGET_TIMEOUT(
31 //usage:     "\n        -T SEC  Network read timeout is SEC seconds"
32 //usage:        )
33 //usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
34 //usage:     "\n        -U STR  Use STR for User-Agent header"
35 //usage:     "\n        -Y      Use proxy ('on' or 'off')"
36
37 #include "libbb.h"
38
/* I/O tracing hook: with "#if 1" every log_io() call is reported through
 * bb_error_msg(); as shipped ("#if 0") log_io() expands to a no-op.
 */
#if 0
# define log_io(...) bb_error_msg(__VA_ARGS__)
#else
# define log_io(...) ((void)0)
#endif
44
45
/* Components of one URL, as filled in by parse_url() */
struct host_info {
        char *allocated;        /* malloced copy of the URL, freed on the next parse_url() */
        const char *path;       /* text after the host part ("" if the URL has none) */
        char       *user;       /* malloced, percent-decoded "user[:password]", or NULL */
        char       *host;       /* host part of the URL (text between scheme/user and path) */
        int         port;       /* port looked up for the scheme (defaults: ftp 21, http 80) */
        smallint    is_ftp;     /* 1 for ftp:// URLs, 0 for http */
};
54
55
56 /* Globals */
/* All applet-wide state lives in one heap object reached through G */
struct globals {
        off_t content_len;        /* Content-length of the file */
        off_t beg_range;          /* Range at which continue begins */
#if ENABLE_FEATURE_WGET_STATUSBAR
        off_t transferred;        /* Number of bytes transferred so far */
        const char *curfile;      /* Name of current file being transferred */
        bb_progress_t pmt;        /* progress bar state (bb_progress_init/update/free) */
#endif
        char *dir_prefix;         /* -P DIR: directory to save into, if given */
#if ENABLE_FEATURE_WGET_LONG_OPTIONS
        char *post_data;          /* body sent with a POST request */
        char *extra_headers;      /* extra header text written verbatim into the request */
#endif
        char *fname_out;        /* where to direct output (-O) */
        const char *proxy_flag; /* Use proxies if env vars are set */
        const char *user_agent; /* "User-Agent" header field */
#if ENABLE_FEATURE_WGET_TIMEOUT
        unsigned timeout_seconds; /* -T SEC; also used to arm alarm() before connecting */
        bool connecting;          /* set while xconnect_stream() runs; checked by alarm_handler() */
#endif
        int output_fd;            /* descriptor the downloaded data is written to */
        int o_flags;              /* NOTE(review): not used in the visible part of the file;
                                   * presumably open() flags for the output file - confirm */
        smallint chunked;         /* chunked transfer encoding */
        smallint got_clen;        /* got content-length: from server  */
        /* Local downloads do benefit from big buffer.
         * With 512 byte buffer, it was measured to be
         * an order of magnitude slower than with big one.
         */
        uint64_t just_to_align_next_member;
        char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024]; /* shared line/data/scratch buffer */
} FIX_ALIASING;
/* G gives field access to the globals object */
#define G (*ptr_to_globals)
/* Allocate the globals object zero-filled */
#define INIT_G() do { \
        SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
} while (0)
/* Release the globals object */
#define FINI_G() do { \
        FREE_PTR_TO_GLOBALS(); \
} while (0)
95
96
/* Must match option string! */
/* Bits tested against option_mask32, one per command line option */
enum {
        WGET_OPT_CONTINUE   = (1 << 0),  /* -c: continue an aborted transfer */
        WGET_OPT_SPIDER     = (1 << 1),  /* -s: only check that the file exists */
        WGET_OPT_QUIET      = (1 << 2),  /* -q: no progress/connection messages */
        WGET_OPT_OUTNAME    = (1 << 3),  /* -O FILE */
        WGET_OPT_PREFIX     = (1 << 4),  /* -P DIR */
        WGET_OPT_PROXY      = (1 << 5),  /* -Y on/off */
        WGET_OPT_USER_AGENT = (1 << 6),  /* -U AGENT */
        WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7), /* -T SEC */
        WGET_OPT_RETRIES    = (1 << 8),  /* NOTE(review): not in the usage text; presumably an accepted-but-ignored option - confirm */
        WGET_OPT_PASSIVE    = (1 << 9),  /* NOTE(review): not in the usage text; presumably an accepted-but-ignored option - confirm */
        /* The two bits below evaluate to 0 when long options are compiled out */
        WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
        WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
};
112
/* Values for the "flag" argument of progress_meter() */
enum {
        PROGRESS_START = -1,    /* initialize the bar, then draw it */
        PROGRESS_END   = 0,     /* draw it a last time, then free it */
        PROGRESS_BUMP  = 1,     /* just redraw */
};
118 #if ENABLE_FEATURE_WGET_STATUSBAR
119 static void progress_meter(int flag)
120 {
121         if (option_mask32 & WGET_OPT_QUIET)
122                 return;
123
124         if (flag == PROGRESS_START)
125                 bb_progress_init(&G.pmt, G.curfile);
126
127         bb_progress_update(&G.pmt,
128                         G.beg_range,
129                         G.transferred,
130                         (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
131         );
132
133         if (flag == PROGRESS_END) {
134                 bb_progress_free(&G.pmt);
135                 bb_putchar_stderr('\n');
136                 G.transferred = 0;
137         }
138 }
139 #else
140 static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
141 #endif
142
143
/* IPv6 knows scoped address types i.e. link and site local addresses. Link
 * local addresses can have a scope identifier to specify the
 * interface/link an address is valid on (e.g. fe80::1%eth0). This scope
 * identifier is only valid on a single node.
 *
 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 * unless all nodes agree on the semantic. Apache e.g. regards zone identifiers
 * in the Host header as invalid requests, see
 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 *
 * Edits host in place: "[addr%zone]" or "[addr%zone]:port" loses the
 * "%zone" part. Anything that does not have exactly that shape is left
 * untouched.
 */
static void strip_ipv6_scope_id(char *host)
{
        char *scope, *cp;

        /* bbox wget actually handles IPv6 addresses without [], like
         * wget "http://::1/xxx", but this is not standard.
         * To save code, _here_ we do not support it. */

        if (host[0] != '[')
                return; /* not IPv6 */

        scope = strchr(host, '%');
        if (!scope)
                return;

        /* Remove the IPv6 zone identifier from the host address */
        cp = strchr(host, ']');
        if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
                /* malformed address (not "[xx]:nn" or "[xx]") */
                return;
        }

        /* A '%' located after the ']' (e.g. "[::1]:%eth0") is not a zone
         * identifier. Moving the "]..." tail up to it would copy towards
         * higher addresses and write past the end of the string.
         */
        if (scope > cp)
                return;

        /* cp points to "]...", scope points to "%eth0]..." */
        memmove(scope, cp, strlen(cp) + 1);
}
179
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/* Base64-encode str into G.wget_buf and return that buffer.
 * Over-long input is silently cut so that the result fits;
 * the result is overwritten by the next user of G.wget_buf.
 */
static char *base64enc(const char *str)
{
        /* paranoia: most input bytes we are willing to encode */
        const size_t max_input = sizeof(G.wget_buf)/4*3 - 10;
        unsigned len;

        len = strlen(str);
        if (len > max_input)
                len = max_input;

        bb_uuencode(G.wget_buf, str, len, bb_uuenc_tbl_base64);
        return G.wget_buf;
}
#endif
191
/* Cut s, in place, at the first byte below ' ' (control characters;
 * a string without any is left as is). Bytes are compared as unsigned,
 * so values 0x80 and up are kept. Returns s.
 */
static char* sanitize_string(char *s)
{
        unsigned char *end;

        for (end = (unsigned char *) s; *end >= ' '; end++)
                continue;
        *end = '\0';

        return s;
}
200
#if ENABLE_FEATURE_WGET_TIMEOUT
/* Signal handler: abort with a timeout message if the signal arrives
 * while G.connecting is set; otherwise do nothing.
 */
static void alarm_handler(int sig UNUSED_PARAM)
{
        if (!G.connecting)
                return;

        /* This is theoretically unsafe (uses stdio and malloc in signal handler) */
        bb_error_msg_and_die("download timed out");
}
#endif
209
210 static FILE *open_socket(len_and_sockaddr *lsa)
211 {
212         int fd;
213         FILE *fp;
214
215         IF_FEATURE_WGET_TIMEOUT(alarm(G.timeout_seconds); G.connecting = 1;)
216         fd = xconnect_stream(lsa);
217         IF_FEATURE_WGET_TIMEOUT(G.connecting = 0;)
218
219         /* glibc 2.4 seems to try seeking on it - ??! */
220         /* hopefully it understands what ESPIPE means... */
221         fp = fdopen(fd, "r+");
222         if (fp == NULL)
223                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
224
225         return fp;
226 }
227
228 /* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
229 /* FIXME: does not respect FEATURE_WGET_TIMEOUT and -T N: */
230 static char fgets_and_trim(FILE *fp)
231 {
232         char c;
233         char *buf_ptr;
234
235         if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
236                 bb_perror_msg_and_die("error getting response");
237
238         buf_ptr = strchrnul(G.wget_buf, '\n');
239         c = *buf_ptr;
240         *buf_ptr = '\0';
241         buf_ptr = strchrnul(G.wget_buf, '\r');
242         *buf_ptr = '\0';
243
244         log_io("< %s", G.wget_buf);
245
246         return c;
247 }
248
249 static int ftpcmd(const char *s1, const char *s2, FILE *fp)
250 {
251         int result;
252         if (s1) {
253                 if (!s2)
254                         s2 = "";
255                 fprintf(fp, "%s%s\r\n", s1, s2);
256                 fflush(fp);
257                 log_io("> %s%s", s1, s2);
258         }
259
260         do {
261                 fgets_and_trim(fp);
262         } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
263
264         G.wget_buf[3] = '\0';
265         result = xatoi_positive(G.wget_buf);
266         G.wget_buf[3] = ' ';
267         return result;
268 }
269
270 static void parse_url(const char *src_url, struct host_info *h)
271 {
272         char *url, *p, *sp;
273
274         free(h->allocated);
275         h->allocated = url = xstrdup(src_url);
276
277         if (strncmp(url, "ftp://", 6) == 0) {
278                 h->port = bb_lookup_port("ftp", "tcp", 21);
279                 h->host = url + 6;
280                 h->is_ftp = 1;
281         } else
282         if (strncmp(url, "http://", 7) == 0) {
283                 h->host = url + 7;
284  http:
285                 h->port = bb_lookup_port("http", "tcp", 80);
286                 h->is_ftp = 0;
287         } else
288         if (!strstr(url, "//")) {
289                 // GNU wget is user-friendly and falls back to http://
290                 h->host = url;
291                 goto http;
292         } else
293                 bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
294
295         // FYI:
296         // "Real" wget 'http://busybox.net?var=a/b' sends this request:
297         //   'GET /?var=a/b HTTP 1.0'
298         //   and saves 'index.html?var=a%2Fb' (we save 'b')
299         // wget 'http://busybox.net?login=john@doe':
300         //   request: 'GET /?login=john@doe HTTP/1.0'
301         //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
302         // wget 'http://busybox.net#test/test':
303         //   request: 'GET / HTTP/1.0'
304         //   saves: 'index.html' (we save 'test')
305         //
306         // We also don't add unique .N suffix if file exists...
307         sp = strchr(h->host, '/');
308         p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
309         p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
310         if (!sp) {
311                 h->path = "";
312         } else if (*sp == '/') {
313                 *sp = '\0';
314                 h->path = sp + 1;
315         } else { // '#' or '?'
316                 // http://busybox.net?login=john@doe is a valid URL
317                 // memmove converts to:
318                 // http:/busybox.nett?login=john@doe...
319                 memmove(h->host - 1, h->host, sp - h->host);
320                 h->host--;
321                 sp[-1] = '\0';
322                 h->path = sp;
323         }
324
325         sp = strrchr(h->host, '@');
326         if (sp != NULL) {
327                 // URL-decode "user:password" string before base64-encoding:
328                 // wget http://test:my%20pass@example.com should send
329                 // Authorization: Basic dGVzdDpteSBwYXNz
330                 // which decodes to "test:my pass".
331                 // Standard wget and curl do this too.
332                 *sp = '\0';
333                 free(h->user);
334                 h->user = xstrdup(percent_decode_in_place(h->host, /*strict:*/ 0));
335                 h->host = sp + 1;
336         }
337         /* else: h->user remains NULL, or as set by original request
338          * before redirect (if we are here after a redirect).
339          */
340 }
341
342 static char *gethdr(FILE *fp)
343 {
344         char *s, *hdrval;
345         int c;
346
347         /* retrieve header line */
348         c = fgets_and_trim(fp);
349
350         /* end of the headers? */
351         if (G.wget_buf[0] == '\0')
352                 return NULL;
353
354         /* convert the header name to lower case */
355         for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.' || *s == '_'; ++s) {
356                 /*
357                  * No-op for 20-3f and 60-7f. "0-9a-z-." are in these ranges.
358                  * 40-5f range ("@A-Z[\]^_") maps to 60-7f.
359                  * "A-Z" maps to "a-z".
360                  * "@[\]" can't occur in header names.
361                  * "^_" maps to "~,DEL" (which is wrong).
362                  * "^" was never seen yet, "_" was seen from web.archive.org
363                  * (x-archive-orig-x_commoncrawl_Signature: HEXSTRING).
364                  */
365                 *s |= 0x20;
366         }
367
368         /* verify we are at the end of the header name */
369         if (*s != ':')
370                 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
371
372         /* locate the start of the header value */
373         *s++ = '\0';
374         hdrval = skip_whitespace(s);
375
376         if (c != '\n') {
377                 /* Rats! The buffer isn't big enough to hold the entire header value */
378                 while (c = getc(fp), c != EOF && c != '\n')
379                         continue;
380         }
381
382         return hdrval;
383 }
384
385 static void reset_beg_range_to_zero(void)
386 {
387         bb_error_msg("restart failed");
388         G.beg_range = 0;
389         xlseek(G.output_fd, 0, SEEK_SET);
390         /* Done at the end instead: */
391         /* ftruncate(G.output_fd, 0); */
392 }
393
394 static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
395 {
396         FILE *sfp;
397         char *str;
398         int port;
399
400         if (!target->user)
401                 target->user = xstrdup("anonymous:busybox@");
402
403         sfp = open_socket(lsa);
404         if (ftpcmd(NULL, NULL, sfp) != 220)
405                 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
406
407         /*
408          * Splitting username:password pair,
409          * trying to log in
410          */
411         str = strchr(target->user, ':');
412         if (str)
413                 *str++ = '\0';
414         switch (ftpcmd("USER ", target->user, sfp)) {
415         case 230:
416                 break;
417         case 331:
418                 if (ftpcmd("PASS ", str, sfp) == 230)
419                         break;
420                 /* fall through (failed login) */
421         default:
422                 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
423         }
424
425         ftpcmd("TYPE I", NULL, sfp);
426
427         /*
428          * Querying file size
429          */
430         if (ftpcmd("SIZE ", target->path, sfp) == 213) {
431                 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
432                 if (G.content_len < 0 || errno) {
433                         bb_error_msg_and_die("SIZE value is garbage");
434                 }
435                 G.got_clen = 1;
436         }
437
438         /*
439          * Entering passive mode
440          */
441         if (ftpcmd("PASV", NULL, sfp) != 227) {
442  pasv_error:
443                 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
444         }
445         // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
446         // Server's IP is N1.N2.N3.N4 (we ignore it)
447         // Server's port for data connection is P1*256+P2
448         str = strrchr(G.wget_buf, ')');
449         if (str) str[0] = '\0';
450         str = strrchr(G.wget_buf, ',');
451         if (!str) goto pasv_error;
452         port = xatou_range(str+1, 0, 255);
453         *str = '\0';
454         str = strrchr(G.wget_buf, ',');
455         if (!str) goto pasv_error;
456         port += xatou_range(str+1, 0, 255) * 256;
457         set_nport(&lsa->u.sa, htons(port));
458
459         *dfpp = open_socket(lsa);
460
461         if (G.beg_range != 0) {
462                 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
463                 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
464                         G.content_len -= G.beg_range;
465                 else
466                         reset_beg_range_to_zero();
467         }
468
469         if (ftpcmd("RETR ", target->path, sfp) > 150)
470                 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
471
472         return sfp;
473 }
474
/* Copy the response body from dfp to G.output_fd.
 *
 * Handles three cases: known length (G.got_clen set, G.content_len counts
 * the bytes still expected), unknown length (read until EOF), and chunked
 * encoding (G.chunked set; each chunk's hex size line is read and the
 * chunk is then handled like a known-length body).
 * When built with status bar and/or timeout support, the socket is put in
 * nonblocking mode while data is read so that the bar can be refreshed
 * and the -T timeout enforced once per second.
 * On return the output file has been truncated at the current write
 * position and the progress bar finished. Dies on read error or timeout.
 */
static void NOINLINE retrieve_file_data(FILE *dfp)
{
#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
# if ENABLE_FEATURE_WGET_TIMEOUT
        /* Seconds of silence still allowed; reloaded whenever data arrives */
        unsigned second_cnt = G.timeout_seconds;
# endif
        struct pollfd polldata;

        polldata.fd = fileno(dfp);
        polldata.events = POLLIN | POLLPRI;
#endif
        progress_meter(PROGRESS_START);

        /* Chunked body: the first thing on the wire is a chunk size line */
        if (G.chunked)
                goto get_clen;

        /* Loops only if chunked */
        while (1) {

#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
                /* Must use nonblocking I/O, otherwise fread will loop
                 * and *block* until it reads full buffer,
                 * which messes up progress bar and/or timeout logic.
                 * Because of nonblocking I/O, we need to dance
                 * very carefully around EAGAIN. See explanation at
                 * clearerr() calls.
                 */
                ndelay_on(polldata.fd);
#endif
                while (1) {
                        int n;
                        unsigned rdsz;

#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
                        /* fread internally uses read loop, which in our case
                         * is usually exited when we get EAGAIN.
                         * In this case, libc sets error marker on the stream.
                         * Need to clear it before next fread to avoid possible
                         * rare false positive ferror below. Rare because usually
                         * fread gets more than zero bytes, and we don't fall
                         * into if (n <= 0) ...
                         */
                        clearerr(dfp);
#endif
                        /* errno is inspected after fread to tell EAGAIN from EOF/error */
                        errno = 0;
                        /* Ask for a full buffer, or for less if fewer bytes remain */
                        rdsz = sizeof(G.wget_buf);
                        if (G.got_clen) {
                                if (G.content_len < (off_t)sizeof(G.wget_buf)) {
                                        if ((int)G.content_len <= 0)
                                                break;
                                        rdsz = (unsigned)G.content_len;
                                }
                        }
                        n = fread(G.wget_buf, 1, rdsz, dfp);

                        if (n > 0) {
                                xwrite(G.output_fd, G.wget_buf, n);
#if ENABLE_FEATURE_WGET_STATUSBAR
                                G.transferred += n;
#endif
                                if (G.got_clen) {
                                        G.content_len -= n;
                                        if (G.content_len == 0)
                                                break;
                                }
#if ENABLE_FEATURE_WGET_TIMEOUT
                                /* Data arrived: restart the timeout countdown */
                                second_cnt = G.timeout_seconds;
#endif
                                continue;
                        }

                        /* n <= 0.
                         * man fread:
                         * If error occurs, or EOF is reached, the return value
                         * is a short item count (or zero).
                         * fread does not distinguish between EOF and error.
                         */
                        if (errno != EAGAIN) {
                                if (ferror(dfp)) {
                                        progress_meter(PROGRESS_END);
                                        bb_perror_msg_and_die(bb_msg_read_error);
                                }
                                break; /* EOF, not error */
                        }

#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
                        /* It was EAGAIN. There is no data. Wait up to one second
                         * then abort if timed out, or update the bar and try reading again.
                         */
                        if (safe_poll(&polldata, 1, 1000) == 0) {
# if ENABLE_FEATURE_WGET_TIMEOUT
                                /* second_cnt == 0 means no timeout was requested */
                                if (second_cnt != 0 && --second_cnt == 0) {
                                        progress_meter(PROGRESS_END);
                                        bb_error_msg_and_die("download timed out");
                                }
# endif
                                /* We used to loop back to poll here,
                                 * but there is no great harm in letting fread
                                 * to try reading anyway.
                                 */
                        }
                        /* Need to do it _every_ second for "stalled" indicator
                         * to be shown properly.
                         */
                        progress_meter(PROGRESS_BUMP);
#endif
                } /* while (reading data) */

#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
                clearerr(dfp);
                ndelay_off(polldata.fd); /* else fgets can get very unhappy */
#endif
                if (!G.chunked)
                        break;

                fgets_and_trim(dfp); /* Eat empty line */
 get_clen:
                /* Next chunk: its size is given as a hexadecimal number */
                fgets_and_trim(dfp);
                G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
                /* FIXME: error check? */
                if (G.content_len == 0)
                        break; /* all done! */
                G.got_clen = 1;
                /*
                 * Note that fgets may result in some data being buffered in dfp.
                 * We loop back to fread, which will retrieve this data.
                 * Also note that code has to be arranged so that fread
                 * is done _before_ one-second poll wait - poll doesn't know
                 * about stdio buffering and can result in spurious one second waits!
                 */
        }

        /* If -c failed, we restart from the beginning,
         * but we do not truncate file then, we do it only now, at the end.
         * This lets user to ^C if his 99% complete 10 GB file download
         * failed to restart *without* losing the almost complete file.
         */
        {
                off_t pos = lseek(G.output_fd, 0, SEEK_CUR);
                /* lseek fails on non-seekable output: nothing to truncate then */
                if (pos != (off_t)-1)
                        ftruncate(G.output_fd, pos);
        }

        /* Draw full bar and free its resources */
        G.chunked = 0;  /* makes it show 100% even for chunked download */
        G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
        progress_meter(PROGRESS_END);
}
623
624 static void download_one_url(const char *url)
625 {
626         bool use_proxy;                 /* Use proxies if env vars are set  */
627         int redir_limit;
628         len_and_sockaddr *lsa;
629         FILE *sfp;                      /* socket to web/ftp server         */
630         FILE *dfp;                      /* socket to ftp server (data)      */
631         char *proxy = NULL;
632         char *fname_out_alloc;
633         char *redirected_path = NULL;
634         struct host_info server;
635         struct host_info target;
636
637         server.allocated = NULL;
638         target.allocated = NULL;
639         server.user = NULL;
640         target.user = NULL;
641
642         parse_url(url, &target);
643
644         /* Use the proxy if necessary */
645         use_proxy = (strcmp(G.proxy_flag, "off") != 0);
646         if (use_proxy) {
647                 proxy = getenv(target.is_ftp ? "ftp_proxy" : "http_proxy");
648                 use_proxy = (proxy && proxy[0]);
649                 if (use_proxy)
650                         parse_url(proxy, &server);
651         }
652         if (!use_proxy) {
653                 server.port = target.port;
654                 if (ENABLE_FEATURE_IPV6) {
655                         //free(server.allocated); - can't be non-NULL
656                         server.host = server.allocated = xstrdup(target.host);
657                 } else {
658                         server.host = target.host;
659                 }
660         }
661
662         if (ENABLE_FEATURE_IPV6)
663                 strip_ipv6_scope_id(target.host);
664
665         /* If there was no -O FILE, guess output filename */
666         fname_out_alloc = NULL;
667         if (!(option_mask32 & WGET_OPT_OUTNAME)) {
668                 G.fname_out = bb_get_last_path_component_nostrip(target.path);
669                 /* handle "wget http://kernel.org//" */
670                 if (G.fname_out[0] == '/' || !G.fname_out[0])
671                         G.fname_out = (char*)"index.html";
672                 /* -P DIR is considered only if there was no -O FILE */
673                 if (G.dir_prefix)
674                         G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
675                 else {
676                         /* redirects may free target.path later, need to make a copy */
677                         G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
678                 }
679         }
680 #if ENABLE_FEATURE_WGET_STATUSBAR
681         G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
682 #endif
683
684         /* Determine where to start transfer */
685         G.beg_range = 0;
686         if (option_mask32 & WGET_OPT_CONTINUE) {
687                 G.output_fd = open(G.fname_out, O_WRONLY);
688                 if (G.output_fd >= 0) {
689                         G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
690                 }
691                 /* File doesn't exist. We do not create file here yet.
692                  * We are not sure it exists on remote side */
693         }
694
695         redir_limit = 5;
696  resolve_lsa:
697         lsa = xhost2sockaddr(server.host, server.port);
698         if (!(option_mask32 & WGET_OPT_QUIET)) {
699                 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
700                 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
701                 free(s);
702         }
703  establish_session:
704         /*G.content_len = 0; - redundant, got_clen = 0 is enough */
705         G.got_clen = 0;
706         G.chunked = 0;
707         if (use_proxy || !target.is_ftp) {
708                 /*
709                  *  HTTP session
710                  */
711                 char *str;
712                 int status;
713
714
715                 /* Open socket to http server */
716                 sfp = open_socket(lsa);
717
718                 /* Send HTTP request */
719                 if (use_proxy) {
720                         fprintf(sfp, "GET %stp://%s/%s HTTP/1.1\r\n",
721                                 target.is_ftp ? "f" : "ht", target.host,
722                                 target.path);
723                 } else {
724                         if (option_mask32 & WGET_OPT_POST_DATA)
725                                 fprintf(sfp, "POST /%s HTTP/1.1\r\n", target.path);
726                         else
727                                 fprintf(sfp, "GET /%s HTTP/1.1\r\n", target.path);
728                 }
729
730                 fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
731                         target.host, G.user_agent);
732
733                 /* Ask server to close the connection as soon as we are done
734                  * (IOW: we do not intend to send more requests)
735                  */
736                 fprintf(sfp, "Connection: close\r\n");
737
738 #if ENABLE_FEATURE_WGET_AUTHENTICATION
739                 if (target.user) {
740                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
741                                 base64enc(target.user));
742                 }
743                 if (use_proxy && server.user) {
744                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
745                                 base64enc(server.user));
746                 }
747 #endif
748
749                 if (G.beg_range != 0)
750                         fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
751
752 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
753                 if (G.extra_headers)
754                         fputs(G.extra_headers, sfp);
755
756                 if (option_mask32 & WGET_OPT_POST_DATA) {
757                         fprintf(sfp,
758                                 "Content-Type: application/x-www-form-urlencoded\r\n"
759                                 "Content-Length: %u\r\n"
760                                 "\r\n"
761                                 "%s",
762                                 (int) strlen(G.post_data), G.post_data
763                         );
764                 } else
765 #endif
766                 {
767                         fprintf(sfp, "\r\n");
768                 }
769
770                 fflush(sfp);
771
772                 /*
773                  * Retrieve HTTP response line and check for "200" status code.
774                  */
775  read_response:
776                 fgets_and_trim(sfp);
777
778                 str = G.wget_buf;
779                 str = skip_non_whitespace(str);
780                 str = skip_whitespace(str);
781                 // FIXME: no error check
782                 // xatou wouldn't work: "200 OK"
783                 status = atoi(str);
784                 switch (status) {
785                 case 0:
786                 case 100:
787                         while (gethdr(sfp) != NULL)
788                                 /* eat all remaining headers */;
789                         goto read_response;
790                 case 200:
791 /*
792 Response 204 doesn't say "null file", it says "metadata
793 has changed but data didn't":
794
795 "10.2.5 204 No Content
796 The server has fulfilled the request but does not need to return
797 an entity-body, and might want to return updated metainformation.
798 The response MAY include new or updated metainformation in the form
799 of entity-headers, which if present SHOULD be associated with
800 the requested variant.
801
802 If the client is a user agent, it SHOULD NOT change its document
803 view from that which caused the request to be sent. This response
804 is primarily intended to allow input for actions to take place
805 without causing a change to the user agent's active document view,
806 although any new or updated metainformation SHOULD be applied
807 to the document currently in the user agent's active view.
808
809 The 204 response MUST NOT include a message-body, and thus
810 is always terminated by the first empty line after the header fields."
811
812 However, in real world it was observed that some web servers
813 (e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
814 */
815                 case 204:
816                         if (G.beg_range != 0) {
817                                 /* "Range:..." was not honored by the server.
818                                  * Restart download from the beginning.
819                                  */
820                                 reset_beg_range_to_zero();
821                         }
822                         break;
823                 case 300:  /* redirection */
824                 case 301:
825                 case 302:
826                 case 303:
827                         break;
828                 case 206: /* Partial Content */
829                         if (G.beg_range != 0)
830                                 /* "Range:..." worked. Good. */
831                                 break;
832                         /* Partial Content even though we did not ask for it??? */
833                         /* fall through */
834                 default:
835                         bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
836                 }
837
838                 /*
839                  * Retrieve HTTP headers.
840                  */
841                 while ((str = gethdr(sfp)) != NULL) {
842                         static const char keywords[] ALIGN1 =
843                                 "content-length\0""transfer-encoding\0""location\0";
844                         enum {
845                                 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
846                         };
847                         smalluint key;
848
849                         /* gethdr converted "FOO:" string to lowercase */
850
851                         /* strip trailing whitespace */
852                         char *s = strchrnul(str, '\0') - 1;
853                         while (s >= str && (*s == ' ' || *s == '\t')) {
854                                 *s = '\0';
855                                 s--;
856                         }
857                         key = index_in_strings(keywords, G.wget_buf) + 1;
858                         if (key == KEY_content_length) {
859                                 G.content_len = BB_STRTOOFF(str, NULL, 10);
860                                 if (G.content_len < 0 || errno) {
861                                         bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
862                                 }
863                                 G.got_clen = 1;
864                                 continue;
865                         }
866                         if (key == KEY_transfer_encoding) {
867                                 if (strcmp(str_tolower(str), "chunked") != 0)
868                                         bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
869                                 G.chunked = 1;
870                         }
871                         if (key == KEY_location && status >= 300) {
872                                 if (--redir_limit == 0)
873                                         bb_error_msg_and_die("too many redirections");
874                                 fclose(sfp);
875                                 if (str[0] == '/') {
876                                         free(redirected_path);
877                                         target.path = redirected_path = xstrdup(str+1);
878                                         /* lsa stays the same: it's on the same server */
879                                 } else {
880                                         parse_url(str, &target);
881                                         if (!use_proxy) {
882                                                 /* server.user remains untouched */
883                                                 free(server.allocated);
884                                                 server.allocated = NULL;
885                                                 server.host = target.host;
886                                                 /* strip_ipv6_scope_id(target.host); - no! */
887                                                 /* we assume remote never gives us IPv6 addr with scope id */
888                                                 server.port = target.port;
889                                                 free(lsa);
890                                                 goto resolve_lsa;
891                                         } /* else: lsa stays the same: we use proxy */
892                                 }
893                                 goto establish_session;
894                         }
895                 }
896 //              if (status >= 300)
897 //                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
898
899                 /* For HTTP, data is pumped over the same connection */
900                 dfp = sfp;
901
902         } else {
903                 /*
904                  *  FTP session
905                  */
906                 sfp = prepare_ftp_session(&dfp, &target, lsa);
907         }
908
909         free(lsa);
910
911         if (!(option_mask32 & WGET_OPT_SPIDER)) {
912                 if (G.output_fd < 0)
913                         G.output_fd = xopen(G.fname_out, G.o_flags);
914                 retrieve_file_data(dfp);
915                 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
916                         xclose(G.output_fd);
917                         G.output_fd = -1;
918                 }
919         }
920
921         if (dfp != sfp) {
922                 /* It's ftp. Close data connection properly */
923                 fclose(dfp);
924                 if (ftpcmd(NULL, NULL, sfp) != 226)
925                         bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
926                 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
927         }
928         fclose(sfp);
929
930         free(server.allocated);
931         free(target.allocated);
932         free(server.user);
933         free(target.user);
934         free(fname_out_alloc);
935         free(redirected_path);
936 }
937
938 int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
939 int wget_main(int argc UNUSED_PARAM, char **argv)
940 {
941 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
942         static const char wget_longopts[] ALIGN1 =
943                 /* name, has_arg, val */
944                 "continue\0"         No_argument       "c"
945 //FIXME: -s isn't --spider, it's --save-headers!
946                 "spider\0"           No_argument       "s"
947                 "quiet\0"            No_argument       "q"
948                 "output-document\0"  Required_argument "O"
949                 "directory-prefix\0" Required_argument "P"
950                 "proxy\0"            Required_argument "Y"
951                 "user-agent\0"       Required_argument "U"
952 #if ENABLE_FEATURE_WGET_TIMEOUT
953                 "timeout\0"          Required_argument "T"
954 #endif
955                 /* Ignored: */
956                 // "tries\0"            Required_argument "t"
957                 /* Ignored (we always use PASV): */
958                 "passive-ftp\0"      No_argument       "\xff"
959                 "header\0"           Required_argument "\xfe"
960                 "post-data\0"        Required_argument "\xfd"
961                 /* Ignored (we don't do ssl) */
962                 "no-check-certificate\0" No_argument   "\xfc"
963                 /* Ignored (we don't support caching) */
964                 "no-cache\0"         No_argument       "\xfb"
965                 ;
966 #endif
967
968 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
969         llist_t *headers_llist = NULL;
970 #endif
971
972         INIT_G();
973
974 #if ENABLE_FEATURE_WGET_TIMEOUT
975         G.timeout_seconds = 900;
976         signal(SIGALRM, alarm_handler);
977 #endif
978         G.proxy_flag = "on";   /* use proxies if env vars are set */
979         G.user_agent = "Wget"; /* "User-Agent" header field */
980
981 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
982         applet_long_options = wget_longopts;
983 #endif
984         opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
985         getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
986                 &G.fname_out, &G.dir_prefix,
987                 &G.proxy_flag, &G.user_agent,
988                 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
989                 NULL /* -t RETRIES */
990                 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
991                 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
992         );
993         argv += optind;
994
995 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
996         if (headers_llist) {
997                 int size = 1;
998                 char *cp;
999                 llist_t *ll = headers_llist;
1000                 while (ll) {
1001                         size += strlen(ll->data) + 2;
1002                         ll = ll->link;
1003                 }
1004                 G.extra_headers = cp = xmalloc(size);
1005                 while (headers_llist) {
1006                         cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
1007                 }
1008         }
1009 #endif
1010
1011         G.output_fd = -1;
1012         G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
1013         if (G.fname_out) { /* -O FILE ? */
1014                 if (LONE_DASH(G.fname_out)) { /* -O - ? */
1015                         G.output_fd = 1;
1016                         option_mask32 &= ~WGET_OPT_CONTINUE;
1017                 }
1018                 /* compat with wget: -O FILE can overwrite */
1019                 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
1020         }
1021
1022         while (*argv)
1023                 download_one_url(*argv++);
1024
1025         if (G.output_fd >= 0)
1026                 xclose(G.output_fd);
1027
1028 #if ENABLE_FEATURE_CLEAN_UP && ENABLE_FEATURE_WGET_LONG_OPTIONS
1029         free(G.extra_headers);
1030 #endif
1031         FINI_G();
1032
1033         return EXIT_SUCCESS;
1034 }