httpd: make byte ranges which start at 0 work too. Closes 4766
[oweals/busybox.git] / networking / wget.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wget - retrieve a file using HTTP or FTP
4  *
5  * Chip Rosenthal Covad Communications <chip@laserlink.net>
6  * Licensed under GPLv2, see file LICENSE in this source tree.
7  *
8  * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
9  * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
10  */
11
12 //usage:#define wget_trivial_usage
13 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
14 //usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
15 //usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
16 //usage:       "        [--no-check-certificate] [-U|--user-agent AGENT]"
17 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
18 //usage:        )
19 //usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
20 //usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
21 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
22 //usage:        )
23 //usage:#define wget_full_usage "\n\n"
24 //usage:       "Retrieve files via HTTP or FTP\n"
25 //usage:     "\n        -s      Spider mode - only check file existence"
26 //usage:     "\n        -c      Continue retrieval of aborted transfer"
27 //usage:     "\n        -q      Quiet"
28 //usage:     "\n        -P DIR  Save to DIR (default .)"
29 //usage:        IF_FEATURE_WGET_TIMEOUT(
30 //usage:     "\n        -T SEC  Network read timeout is SEC seconds"
31 //usage:        )
32 //usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
33 //usage:     "\n        -U STR  Use STR for User-Agent header"
34 //usage:     "\n        -Y      Use proxy ('on' or 'off')"
35
36 #include "libbb.h"
37
38 //#define log_io(...) bb_error_msg(__VA_ARGS__)
39 #define log_io(...) ((void)0)
40
41
42 struct host_info {
43         char *allocated;
44         const char *path;
45         const char *user;
46         char       *host;
47         int         port;
48         smallint    is_ftp;
49 };
50
51
52 /* Globals */
53 struct globals {
54         off_t content_len;        /* Content-length of the file */
55         off_t beg_range;          /* Range at which continue begins */
56 #if ENABLE_FEATURE_WGET_STATUSBAR
57         off_t transferred;        /* Number of bytes transferred so far */
58         const char *curfile;      /* Name of current file being transferred */
59         bb_progress_t pmt;
60 #endif
61         char *dir_prefix;
62 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
63         char *post_data;
64         char *extra_headers;
65 #endif
66         char *fname_out;        /* where to direct output (-O) */
67         const char *proxy_flag; /* Use proxies if env vars are set */
68         const char *user_agent; /* "User-Agent" header field */
69 #if ENABLE_FEATURE_WGET_TIMEOUT
70         unsigned timeout_seconds;
71 #endif
72         int output_fd;
73         int o_flags;
74         smallint chunked;         /* chunked transfer encoding */
75         smallint got_clen;        /* got content-length: from server  */
76         /* Local downloads do benefit from big buffer.
77          * With 512 byte buffer, it was measured to be
78          * an order of magnitude slower than with big one.
79          */
80         uint64_t just_to_align_next_member;
81         char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
82 } FIX_ALIASING;
83 #define G (*ptr_to_globals)
84 #define INIT_G() do { \
85         SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
86         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;) \
87 } while (0)
88
89
90 /* Must match option string! */
91 enum {
92         WGET_OPT_CONTINUE   = (1 << 0),
93         WGET_OPT_SPIDER     = (1 << 1),
94         WGET_OPT_QUIET      = (1 << 2),
95         WGET_OPT_OUTNAME    = (1 << 3),
96         WGET_OPT_PREFIX     = (1 << 4),
97         WGET_OPT_PROXY      = (1 << 5),
98         WGET_OPT_USER_AGENT = (1 << 6),
99         WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
100         WGET_OPT_RETRIES    = (1 << 8),
101         WGET_OPT_PASSIVE    = (1 << 9),
102         WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
103         WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
104 };
105
/* Values of the 'flag' argument of progress_meter() */
enum {
	PROGRESS_END   = 0,  /* final update, then release the meter */
	PROGRESS_BUMP  = 1,  /* ordinary update */
	PROGRESS_START = -1, /* initialize the meter, then update */
};
111 #if ENABLE_FEATURE_WGET_STATUSBAR
112 static void progress_meter(int flag)
113 {
114         if (option_mask32 & WGET_OPT_QUIET)
115                 return;
116
117         if (flag == PROGRESS_START)
118                 bb_progress_init(&G.pmt, G.curfile);
119
120         bb_progress_update(&G.pmt,
121                         G.beg_range,
122                         G.transferred,
123                         (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
124         );
125
126         if (flag == PROGRESS_END) {
127                 bb_progress_free(&G.pmt);
128                 bb_putchar_stderr('\n');
129                 G.transferred = 0;
130         }
131 }
132 #else
133 static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
134 #endif
135
136
/* IPv6 knows scoped address types, i.e. link and site local addresses.
 * A link local address can carry a scope identifier naming the
 * interface/link it is valid on (e.g. fe80::1%eth0). This scope
 * identifier is only valid on a single node.
 *
 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 * unless all nodes agree on the semantics. Apache e.g. regards zone
 * identifiers in the Host header as invalid requests, see
 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 *
 * Removes the "%zone" part, in place, from a host string of the form
 * "[addr%zone]" or "[addr%zone]:port". Any other string is left as is.
 */
static void strip_ipv6_scope_id(char *host)
{
	char *scope, *cp;

	/* bbox wget actually handles IPv6 addresses without [], like
	 * wget "http://::1/xxx", but this is not standard.
	 * To save code, _here_ we do not support it. */

	if (host[0] != '[')
		return; /* not IPv6 */

	scope = strchr(host, '%');
	if (!scope)
		return;

	cp = strchr(host, ']');
	if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
		/* malformed address (not "[xx]:nn" or "[xx]") */
		return;
	}

	/* A '%' located after the closing bracket is not a zone identifier.
	 * Copying "]..." forward onto a later position would not strip
	 * anything and would corrupt the string, so leave it alone. */
	if (scope > cp)
		return;

	/* cp points to "]...", scope points to "%eth0]...":
	 * slide the tail (including its NUL) down over the zone id */
	memmove(scope, cp, strlen(cp) + 1);
}
172
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/* Base64-encode str into G.wget_buf and return that buffer.
 * The result is overwritten by the next use of G.wget_buf.
 */
static char *base64enc(const char *str)
{
	/* paranoia: never feed the encoder more than this many input bytes */
	const size_t max_input = sizeof(G.wget_buf)/4*3 - 10;
	unsigned len;

	len = strlen(str);
	if (len > max_input)
		len = max_input;

	bb_uuencode(G.wget_buf, str, len, bb_uuenc_tbl_base64);
	return G.wget_buf;
}
#endif
184
/* Truncate s, in place, at its first byte below ' ' (control characters;
 * the terminating NUL counts as one). Bytes >= 0x80 are kept.
 * Returns s.
 */
static char* sanitize_string(char *s)
{
	unsigned char *cur;

	for (cur = (unsigned char *) s; *cur >= ' '; cur++)
		continue;
	*cur = '\0';

	return s;
}
193
194 static FILE *open_socket(len_and_sockaddr *lsa)
195 {
196         FILE *fp;
197
198         /* glibc 2.4 seems to try seeking on it - ??! */
199         /* hopefully it understands what ESPIPE means... */
200         fp = fdopen(xconnect_stream(lsa), "r+");
201         if (fp == NULL)
202                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
203
204         return fp;
205 }
206
207 /* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
208 static char fgets_and_trim(FILE *fp)
209 {
210         char c;
211         char *buf_ptr;
212
213         if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
214                 bb_perror_msg_and_die("error getting response");
215
216         buf_ptr = strchrnul(G.wget_buf, '\n');
217         c = *buf_ptr;
218         *buf_ptr = '\0';
219         buf_ptr = strchrnul(G.wget_buf, '\r');
220         *buf_ptr = '\0';
221
222         log_io("< %s", G.wget_buf);
223
224         return c;
225 }
226
227 static int ftpcmd(const char *s1, const char *s2, FILE *fp)
228 {
229         int result;
230         if (s1) {
231                 if (!s2)
232                         s2 = "";
233                 fprintf(fp, "%s%s\r\n", s1, s2);
234                 fflush(fp);
235                 log_io("> %s%s", s1, s2);
236         }
237
238         do {
239                 fgets_and_trim(fp);
240         } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
241
242         G.wget_buf[3] = '\0';
243         result = xatoi_positive(G.wget_buf);
244         G.wget_buf[3] = ' ';
245         return result;
246 }
247
248 static void parse_url(const char *src_url, struct host_info *h)
249 {
250         char *url, *p, *sp;
251
252         free(h->allocated);
253         h->allocated = url = xstrdup(src_url);
254
255         if (strncmp(url, "http://", 7) == 0) {
256                 h->port = bb_lookup_port("http", "tcp", 80);
257                 h->host = url + 7;
258                 h->is_ftp = 0;
259         } else if (strncmp(url, "ftp://", 6) == 0) {
260                 h->port = bb_lookup_port("ftp", "tcp", 21);
261                 h->host = url + 6;
262                 h->is_ftp = 1;
263         } else
264                 bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
265
266         // FYI:
267         // "Real" wget 'http://busybox.net?var=a/b' sends this request:
268         //   'GET /?var=a/b HTTP 1.0'
269         //   and saves 'index.html?var=a%2Fb' (we save 'b')
270         // wget 'http://busybox.net?login=john@doe':
271         //   request: 'GET /?login=john@doe HTTP/1.0'
272         //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
273         // wget 'http://busybox.net#test/test':
274         //   request: 'GET / HTTP/1.0'
275         //   saves: 'index.html' (we save 'test')
276         //
277         // We also don't add unique .N suffix if file exists...
278         sp = strchr(h->host, '/');
279         p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
280         p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
281         if (!sp) {
282                 h->path = "";
283         } else if (*sp == '/') {
284                 *sp = '\0';
285                 h->path = sp + 1;
286         } else { // '#' or '?'
287                 // http://busybox.net?login=john@doe is a valid URL
288                 // memmove converts to:
289                 // http:/busybox.nett?login=john@doe...
290                 memmove(h->host - 1, h->host, sp - h->host);
291                 h->host--;
292                 sp[-1] = '\0';
293                 h->path = sp;
294         }
295
296         // We used to set h->user to NULL here, but this interferes
297         // with handling of code 302 ("object was moved")
298
299         sp = strrchr(h->host, '@');
300         if (sp != NULL) {
301                 // URL-decode "user:password" string before base64-encoding:
302                 // wget http://test:my%20pass@example.com should send
303                 // Authorization: Basic dGVzdDpteSBwYXNz
304                 // which decodes to "test:my pass".
305                 // Standard wget and curl do this too.
306                 *sp = '\0';
307                 h->user = percent_decode_in_place(h->host, /*strict:*/ 0);
308                 h->host = sp + 1;
309         }
310
311         sp = h->host;
312 }
313
314 static char *gethdr(FILE *fp)
315 {
316         char *s, *hdrval;
317         int c;
318
319         /* *istrunc = 0; */
320
321         /* retrieve header line */
322         c = fgets_and_trim(fp);
323
324         /* end of the headers? */
325         if (G.wget_buf[0] == '\0')
326                 return NULL;
327
328         /* convert the header name to lower case */
329         for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.'; ++s) {
330                 /* tolower for "A-Z", no-op for "0-9a-z-." */
331                 *s |= 0x20;
332         }
333
334         /* verify we are at the end of the header name */
335         if (*s != ':')
336                 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
337
338         /* locate the start of the header value */
339         *s++ = '\0';
340         hdrval = skip_whitespace(s);
341
342         if (c != '\n') {
343                 /* Rats! The buffer isn't big enough to hold the entire header value */
344                 while (c = getc(fp), c != EOF && c != '\n')
345                         continue;
346         }
347
348         return hdrval;
349 }
350
351 static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
352 {
353         FILE *sfp;
354         char *str;
355         int port;
356
357         if (!target->user)
358                 target->user = xstrdup("anonymous:busybox@");
359
360         sfp = open_socket(lsa);
361         if (ftpcmd(NULL, NULL, sfp) != 220)
362                 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
363
364         /*
365          * Splitting username:password pair,
366          * trying to log in
367          */
368         str = strchr(target->user, ':');
369         if (str)
370                 *str++ = '\0';
371         switch (ftpcmd("USER ", target->user, sfp)) {
372         case 230:
373                 break;
374         case 331:
375                 if (ftpcmd("PASS ", str, sfp) == 230)
376                         break;
377                 /* fall through (failed login) */
378         default:
379                 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
380         }
381
382         ftpcmd("TYPE I", NULL, sfp);
383
384         /*
385          * Querying file size
386          */
387         if (ftpcmd("SIZE ", target->path, sfp) == 213) {
388                 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
389                 if (G.content_len < 0 || errno) {
390                         bb_error_msg_and_die("SIZE value is garbage");
391                 }
392                 G.got_clen = 1;
393         }
394
395         /*
396          * Entering passive mode
397          */
398         if (ftpcmd("PASV", NULL, sfp) != 227) {
399  pasv_error:
400                 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
401         }
402         // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
403         // Server's IP is N1.N2.N3.N4 (we ignore it)
404         // Server's port for data connection is P1*256+P2
405         str = strrchr(G.wget_buf, ')');
406         if (str) str[0] = '\0';
407         str = strrchr(G.wget_buf, ',');
408         if (!str) goto pasv_error;
409         port = xatou_range(str+1, 0, 255);
410         *str = '\0';
411         str = strrchr(G.wget_buf, ',');
412         if (!str) goto pasv_error;
413         port += xatou_range(str+1, 0, 255) * 256;
414         set_nport(&lsa->u.sa, htons(port));
415
416         *dfpp = open_socket(lsa);
417
418         if (G.beg_range) {
419                 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
420                 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
421                         G.content_len -= G.beg_range;
422         }
423
424         if (ftpcmd("RETR ", target->path, sfp) > 150)
425                 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
426
427         return sfp;
428 }
429
430 static void NOINLINE retrieve_file_data(FILE *dfp)
431 {
432 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
433 # if ENABLE_FEATURE_WGET_TIMEOUT
434         unsigned second_cnt;
435 # endif
436         struct pollfd polldata;
437
438         polldata.fd = fileno(dfp);
439         polldata.events = POLLIN | POLLPRI;
440 #endif
441         progress_meter(PROGRESS_START);
442
443         if (G.chunked)
444                 goto get_clen;
445
446         /* Loops only if chunked */
447         while (1) {
448
449 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
450                 /* Must use nonblocking I/O, otherwise fread will loop
451                  * and *block* until it reads full buffer,
452                  * which messes up progress bar and/or timeout logic.
453                  * Because of nonblocking I/O, we need to dance
454                  * very carefully around EAGAIN. See explanation at
455                  * clearerr() call.
456                  */
457                 ndelay_on(polldata.fd);
458 #endif
459                 while (1) {
460                         int n;
461                         unsigned rdsz;
462
463                         rdsz = sizeof(G.wget_buf);
464                         if (G.got_clen) {
465                                 if (G.content_len < (off_t)sizeof(G.wget_buf)) {
466                                         if ((int)G.content_len <= 0)
467                                                 break;
468                                         rdsz = (unsigned)G.content_len;
469                                 }
470                         }
471
472 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
473 # if ENABLE_FEATURE_WGET_TIMEOUT
474                         second_cnt = G.timeout_seconds;
475 # endif
476                         while (1) {
477                                 if (safe_poll(&polldata, 1, 1000) != 0)
478                                         break; /* error, EOF, or data is available */
479 # if ENABLE_FEATURE_WGET_TIMEOUT
480                                 if (second_cnt != 0 && --second_cnt == 0) {
481                                         progress_meter(PROGRESS_END);
482                                         bb_error_msg_and_die("download timed out");
483                                 }
484 # endif
485                                 /* Needed for "stalled" indicator */
486                                 progress_meter(PROGRESS_BUMP);
487                         }
488
489                         /* fread internally uses read loop, which in our case
490                          * is usually exited when we get EAGAIN.
491                          * In this case, libc sets error marker on the stream.
492                          * Need to clear it before next fread to avoid possible
493                          * rare false positive ferror below. Rare because usually
494                          * fread gets more than zero bytes, and we don't fall
495                          * into if (n <= 0) ...
496                          */
497                         clearerr(dfp);
498                         errno = 0;
499 #endif
500                         n = fread(G.wget_buf, 1, rdsz, dfp);
501                         /* man fread:
502                          * If error occurs, or EOF is reached, the return value
503                          * is a short item count (or zero).
504                          * fread does not distinguish between EOF and error.
505                          */
506                         if (n <= 0) {
507 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
508                                 if (errno == EAGAIN) /* poll lied, there is no data? */
509                                         continue; /* yes */
510 #endif
511                                 if (ferror(dfp))
512                                         bb_perror_msg_and_die(bb_msg_read_error);
513                                 break; /* EOF, not error */
514                         }
515
516                         xwrite(G.output_fd, G.wget_buf, n);
517
518 #if ENABLE_FEATURE_WGET_STATUSBAR
519                         G.transferred += n;
520                         progress_meter(PROGRESS_BUMP);
521 #endif
522                         if (G.got_clen) {
523                                 G.content_len -= n;
524                                 if (G.content_len == 0)
525                                         break;
526                         }
527                 }
528 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
529                 clearerr(dfp);
530                 ndelay_off(polldata.fd); /* else fgets can get very unhappy */
531 #endif
532                 if (!G.chunked)
533                         break;
534
535                 fgets_and_trim(dfp); /* Eat empty line */
536  get_clen:
537                 fgets_and_trim(dfp);
538                 G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
539                 /* FIXME: error check? */
540                 if (G.content_len == 0)
541                         break; /* all done! */
542                 G.got_clen = 1;
543         }
544
545         /* Draw full bar and free its resources */
546         G.chunked = 0;  /* makes it show 100% even for chunked download */
547         G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
548         progress_meter(PROGRESS_END);
549 }
550
551 static void download_one_url(const char *url)
552 {
553         bool use_proxy;                 /* Use proxies if env vars are set  */
554         int redir_limit;
555         len_and_sockaddr *lsa;
556         FILE *sfp;                      /* socket to web/ftp server         */
557         FILE *dfp;                      /* socket to ftp server (data)      */
558         char *proxy = NULL;
559         char *fname_out_alloc;
560         char *redirected_path = NULL;
561         struct host_info server;
562         struct host_info target;
563
564         server.allocated = NULL;
565         target.allocated = NULL;
566         server.user = NULL;
567         target.user = NULL;
568
569         parse_url(url, &target);
570
571         /* Use the proxy if necessary */
572         use_proxy = (strcmp(G.proxy_flag, "off") != 0);
573         if (use_proxy) {
574                 proxy = getenv(target.is_ftp ? "ftp_proxy" : "http_proxy");
575                 use_proxy = (proxy && proxy[0]);
576                 if (use_proxy)
577                         parse_url(proxy, &server);
578         }
579         if (!use_proxy) {
580                 server.port = target.port;
581                 if (ENABLE_FEATURE_IPV6) {
582                         //free(server.allocated); - can't be non-NULL
583                         server.host = server.allocated = xstrdup(target.host);
584                 } else {
585                         server.host = target.host;
586                 }
587         }
588
589         if (ENABLE_FEATURE_IPV6)
590                 strip_ipv6_scope_id(target.host);
591
592         /* If there was no -O FILE, guess output filename */
593         fname_out_alloc = NULL;
594         if (!(option_mask32 & WGET_OPT_OUTNAME)) {
595                 G.fname_out = bb_get_last_path_component_nostrip(target.path);
596                 /* handle "wget http://kernel.org//" */
597                 if (G.fname_out[0] == '/' || !G.fname_out[0])
598                         G.fname_out = (char*)"index.html";
599                 /* -P DIR is considered only if there was no -O FILE */
600                 else {
601                         if (G.dir_prefix)
602                                 G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
603                         else {
604                                 /* redirects may free target.path later, need to make a copy */
605                                 G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
606                         }
607                 }
608         }
609 #if ENABLE_FEATURE_WGET_STATUSBAR
610         G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
611 #endif
612
613         /* Determine where to start transfer */
614         G.beg_range = 0;
615         if (option_mask32 & WGET_OPT_CONTINUE) {
616                 G.output_fd = open(G.fname_out, O_WRONLY);
617                 if (G.output_fd >= 0) {
618                         G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
619                 }
620                 /* File doesn't exist. We do not create file here yet.
621                  * We are not sure it exists on remote side */
622         }
623
624         redir_limit = 5;
625  resolve_lsa:
626         lsa = xhost2sockaddr(server.host, server.port);
627         if (!(option_mask32 & WGET_OPT_QUIET)) {
628                 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
629                 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
630                 free(s);
631         }
632  establish_session:
633         /*G.content_len = 0; - redundant, got_clen = 0 is enough */
634         G.got_clen = 0;
635         G.chunked = 0;
636         if (use_proxy || !target.is_ftp) {
637                 /*
638                  *  HTTP session
639                  */
640                 char *str;
641                 int status;
642
643
644                 /* Open socket to http server */
645                 sfp = open_socket(lsa);
646
647                 /* Send HTTP request */
648                 if (use_proxy) {
649                         fprintf(sfp, "GET %stp://%s/%s HTTP/1.1\r\n",
650                                 target.is_ftp ? "f" : "ht", target.host,
651                                 target.path);
652                 } else {
653                         if (option_mask32 & WGET_OPT_POST_DATA)
654                                 fprintf(sfp, "POST /%s HTTP/1.1\r\n", target.path);
655                         else
656                                 fprintf(sfp, "GET /%s HTTP/1.1\r\n", target.path);
657                 }
658
659                 fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
660                         target.host, G.user_agent);
661
662                 /* Ask server to close the connection as soon as we are done
663                  * (IOW: we do not intend to send more requests)
664                  */
665                 fprintf(sfp, "Connection: close\r\n");
666
667 #if ENABLE_FEATURE_WGET_AUTHENTICATION
668                 if (target.user) {
669                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
670                                 base64enc(target.user));
671                 }
672                 if (use_proxy && server.user) {
673                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
674                                 base64enc(server.user));
675                 }
676 #endif
677
678                 if (G.beg_range)
679                         fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
680
681 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
682                 if (G.extra_headers)
683                         fputs(G.extra_headers, sfp);
684
685                 if (option_mask32 & WGET_OPT_POST_DATA) {
686                         fprintf(sfp,
687                                 "Content-Type: application/x-www-form-urlencoded\r\n"
688                                 "Content-Length: %u\r\n"
689                                 "\r\n"
690                                 "%s",
691                                 (int) strlen(G.post_data), G.post_data
692                         );
693                 } else
694 #endif
695                 {
696                         fprintf(sfp, "\r\n");
697                 }
698
699                 fflush(sfp);
700
701                 /*
702                  * Retrieve HTTP response line and check for "200" status code.
703                  */
704  read_response:
705                 fgets_and_trim(sfp);
706
707                 str = G.wget_buf;
708                 str = skip_non_whitespace(str);
709                 str = skip_whitespace(str);
710                 // FIXME: no error check
711                 // xatou wouldn't work: "200 OK"
712                 status = atoi(str);
713                 switch (status) {
714                 case 0:
715                 case 100:
716                         while (gethdr(sfp) != NULL)
717                                 /* eat all remaining headers */;
718                         goto read_response;
719                 case 200:
720 /*
721 Response 204 doesn't say "null file", it says "metadata
722 has changed but data didn't":
723
724 "10.2.5 204 No Content
725 The server has fulfilled the request but does not need to return
726 an entity-body, and might want to return updated metainformation.
727 The response MAY include new or updated metainformation in the form
728 of entity-headers, which if present SHOULD be associated with
729 the requested variant.
730
731 If the client is a user agent, it SHOULD NOT change its document
732 view from that which caused the request to be sent. This response
733 is primarily intended to allow input for actions to take place
734 without causing a change to the user agent's active document view,
735 although any new or updated metainformation SHOULD be applied
736 to the document currently in the user agent's active view.
737
738 The 204 response MUST NOT include a message-body, and thus
739 is always terminated by the first empty line after the header fields."
740
741 However, in real world it was observed that some web servers
742 (e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
743 */
744                 case 204:
745                         break;
746                 case 300:  /* redirection */
747                 case 301:
748                 case 302:
749                 case 303:
750                         break;
751                 case 206:
752                         if (G.beg_range)
753                                 break;
754                         /* fall through */
755                 default:
756                         bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
757                 }
758
759                 /*
760                  * Retrieve HTTP headers.
761                  */
762                 while ((str = gethdr(sfp)) != NULL) {
763                         static const char keywords[] ALIGN1 =
764                                 "content-length\0""transfer-encoding\0""location\0";
765                         enum {
766                                 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
767                         };
768                         smalluint key;
769
770                         /* gethdr converted "FOO:" string to lowercase */
771
772                         /* strip trailing whitespace */
773                         char *s = strchrnul(str, '\0') - 1;
774                         while (s >= str && (*s == ' ' || *s == '\t')) {
775                                 *s = '\0';
776                                 s--;
777                         }
778                         key = index_in_strings(keywords, G.wget_buf) + 1;
779                         if (key == KEY_content_length) {
780                                 G.content_len = BB_STRTOOFF(str, NULL, 10);
781                                 if (G.content_len < 0 || errno) {
782                                         bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
783                                 }
784                                 G.got_clen = 1;
785                                 continue;
786                         }
787                         if (key == KEY_transfer_encoding) {
788                                 if (strcmp(str_tolower(str), "chunked") != 0)
789                                         bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
790                                 G.chunked = 1;
791                         }
792                         if (key == KEY_location && status >= 300) {
793                                 if (--redir_limit == 0)
794                                         bb_error_msg_and_die("too many redirections");
795                                 fclose(sfp);
796                                 if (str[0] == '/') {
797                                         free(redirected_path);
798                                         target.path = redirected_path = xstrdup(str+1);
799                                         /* lsa stays the same: it's on the same server */
800                                 } else {
801                                         parse_url(str, &target);
802                                         if (!use_proxy) {
803                                                 free(server.allocated);
804                                                 server.allocated = NULL;
805                                                 server.host = target.host;
806                                                 /* strip_ipv6_scope_id(target.host); - no! */
807                                                 /* we assume remote never gives us IPv6 addr with scope id */
808                                                 server.port = target.port;
809                                                 free(lsa);
810                                                 goto resolve_lsa;
811                                         } /* else: lsa stays the same: we use proxy */
812                                 }
813                                 goto establish_session;
814                         }
815                 }
816 //              if (status >= 300)
817 //                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
818
819                 /* For HTTP, data is pumped over the same connection */
820                 dfp = sfp;
821
822         } else {
823                 /*
824                  *  FTP session
825                  */
826                 sfp = prepare_ftp_session(&dfp, &target, lsa);
827         }
828
829         free(lsa);
830
831         if (!(option_mask32 & WGET_OPT_SPIDER)) {
832                 if (G.output_fd < 0)
833                         G.output_fd = xopen(G.fname_out, G.o_flags);
834                 retrieve_file_data(dfp);
835                 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
836                         xclose(G.output_fd);
837                         G.output_fd = -1;
838                 }
839         }
840
841         if (dfp != sfp) {
842                 /* It's ftp. Close data connection properly */
843                 fclose(dfp);
844                 if (ftpcmd(NULL, NULL, sfp) != 226)
845                         bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
846                 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
847         }
848         fclose(sfp);
849
850         free(server.allocated);
851         free(target.allocated);
852         free(fname_out_alloc);
853         free(redirected_path);
854 }
855
856 int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
857 int wget_main(int argc UNUSED_PARAM, char **argv)
858 {
859 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
860         static const char wget_longopts[] ALIGN1 =
861                 /* name, has_arg, val */
862                 "continue\0"         No_argument       "c"
863 //FIXME: -s isn't --spider, it's --save-headers!
864                 "spider\0"           No_argument       "s"
865                 "quiet\0"            No_argument       "q"
866                 "output-document\0"  Required_argument "O"
867                 "directory-prefix\0" Required_argument "P"
868                 "proxy\0"            Required_argument "Y"
869                 "user-agent\0"       Required_argument "U"
870 #if ENABLE_FEATURE_WGET_TIMEOUT
871                 "timeout\0"          Required_argument "T"
872 #endif
873                 /* Ignored: */
874                 // "tries\0"            Required_argument "t"
875                 /* Ignored (we always use PASV): */
876                 "passive-ftp\0"      No_argument       "\xff"
877                 "header\0"           Required_argument "\xfe"
878                 "post-data\0"        Required_argument "\xfd"
879                 /* Ignored (we don't do ssl) */
880                 "no-check-certificate\0" No_argument   "\xfc"
881                 ;
882 #endif
883
884 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
885         llist_t *headers_llist = NULL;
886 #endif
887
888         INIT_G();
889
890         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;)
891         G.proxy_flag = "on";   /* use proxies if env vars are set */
892         G.user_agent = "Wget"; /* "User-Agent" header field */
893
894 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
895         applet_long_options = wget_longopts;
896 #endif
897         opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
898         getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
899                 &G.fname_out, &G.dir_prefix,
900                 &G.proxy_flag, &G.user_agent,
901                 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
902                 NULL /* -t RETRIES */
903                 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
904                 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
905         );
906         argv += optind;
907
908 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
909         if (headers_llist) {
910                 int size = 1;
911                 char *cp;
912                 llist_t *ll = headers_llist;
913                 while (ll) {
914                         size += strlen(ll->data) + 2;
915                         ll = ll->link;
916                 }
917                 G.extra_headers = cp = xmalloc(size);
918                 while (headers_llist) {
919                         cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
920                 }
921         }
922 #endif
923
924         G.output_fd = -1;
925         G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
926         if (G.fname_out) { /* -O FILE ? */
927                 if (LONE_DASH(G.fname_out)) { /* -O - ? */
928                         G.output_fd = 1;
929                         option_mask32 &= ~WGET_OPT_CONTINUE;
930                 }
931                 /* compat with wget: -O FILE can overwrite */
932                 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
933         }
934
935         while (*argv)
936                 download_one_url(*argv++);
937
938         if (G.output_fd >= 0)
939                 xclose(G.output_fd);
940
941         return EXIT_SUCCESS;
942 }