3416636aede3be371a019fe4e339150155ac592e
[oweals/busybox.git] / networking / wget.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wget - retrieve a file using HTTP or FTP
4  *
5  * Chip Rosenthal Covad Communications <chip@laserlink.net>
6  * Licensed under GPLv2, see file LICENSE in this source tree.
7  *
8  * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
9  * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
10  */
11
12 //usage:#define wget_trivial_usage
13 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
14 //usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
15 //usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
16 /* Since we ignore these opts, we don't show them in --help */
17 /* //usage:    "        [--no-check-certificate] [--no-cache]" */
18 //usage:       "        [-U|--user-agent AGENT]" IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
19 //usage:        )
20 //usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
21 //usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
22 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
23 //usage:        )
24 //usage:#define wget_full_usage "\n\n"
25 //usage:       "Retrieve files via HTTP or FTP\n"
26 //usage:     "\n        -s      Spider mode - only check file existence"
27 //usage:     "\n        -c      Continue retrieval of aborted transfer"
28 //usage:     "\n        -q      Quiet"
29 //usage:     "\n        -P DIR  Save to DIR (default .)"
30 //usage:        IF_FEATURE_WGET_TIMEOUT(
31 //usage:     "\n        -T SEC  Network read timeout is SEC seconds"
32 //usage:        )
33 //usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
34 //usage:     "\n        -U STR  Use STR for User-Agent header"
35 //usage:     "\n        -Y      Use proxy ('on' or 'off')"
36
37 #include "libbb.h"
38
/*
 * Protocol tracing switch.
 * When set to 1, log_io() forwards its arguments to bb_error_msg();
 * when 0, log_io() expands to a no-op expression.
 */
#define WGET_LOG_IO 0

#if WGET_LOG_IO
# define log_io(...) bb_error_msg(__VA_ARGS__)
#else
# define log_io(...) ((void)0)
#endif
44
45
46 struct host_info {
47         char *allocated;
48         const char *path;
49         const char *user;
50         char       *host;
51         int         port;
52         smallint    is_ftp;
53 };
54
55
56 /* Globals */
57 struct globals {
58         off_t content_len;        /* Content-length of the file */
59         off_t beg_range;          /* Range at which continue begins */
60 #if ENABLE_FEATURE_WGET_STATUSBAR
61         off_t transferred;        /* Number of bytes transferred so far */
62         const char *curfile;      /* Name of current file being transferred */
63         bb_progress_t pmt;
64 #endif
65         char *dir_prefix;
66 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
67         char *post_data;
68         char *extra_headers;
69 #endif
70         char *fname_out;        /* where to direct output (-O) */
71         const char *proxy_flag; /* Use proxies if env vars are set */
72         const char *user_agent; /* "User-Agent" header field */
73 #if ENABLE_FEATURE_WGET_TIMEOUT
74         unsigned timeout_seconds;
75 #endif
76         int output_fd;
77         int o_flags;
78         smallint chunked;         /* chunked transfer encoding */
79         smallint got_clen;        /* got content-length: from server  */
80         /* Local downloads do benefit from big buffer.
81          * With 512 byte buffer, it was measured to be
82          * an order of magnitude slower than with big one.
83          */
84         uint64_t just_to_align_next_member;
85         char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
86 } FIX_ALIASING;
87 #define G (*ptr_to_globals)
88 #define INIT_G() do { \
89         SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
90         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;) \
91 } while (0)
92
93
94 /* Must match option string! */
95 enum {
96         WGET_OPT_CONTINUE   = (1 << 0),
97         WGET_OPT_SPIDER     = (1 << 1),
98         WGET_OPT_QUIET      = (1 << 2),
99         WGET_OPT_OUTNAME    = (1 << 3),
100         WGET_OPT_PREFIX     = (1 << 4),
101         WGET_OPT_PROXY      = (1 << 5),
102         WGET_OPT_USER_AGENT = (1 << 6),
103         WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
104         WGET_OPT_RETRIES    = (1 << 8),
105         WGET_OPT_PASSIVE    = (1 << 9),
106         WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
107         WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
108 };
109
/* Values accepted by progress_meter() as its flag argument */
enum {
        PROGRESS_END   = 0,     /* final update, then release the meter */
        PROGRESS_BUMP  = 1,     /* ordinary refresh */
        PROGRESS_START = -1,    /* initialize the meter before the first update */
};
115 #if ENABLE_FEATURE_WGET_STATUSBAR
116 static void progress_meter(int flag)
117 {
118         if (option_mask32 & WGET_OPT_QUIET)
119                 return;
120
121         if (flag == PROGRESS_START)
122                 bb_progress_init(&G.pmt, G.curfile);
123
124         bb_progress_update(&G.pmt,
125                         G.beg_range,
126                         G.transferred,
127                         (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
128         );
129
130         if (flag == PROGRESS_END) {
131                 bb_progress_free(&G.pmt);
132                 bb_putchar_stderr('\n');
133                 G.transferred = 0;
134         }
135 }
136 #else
137 static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
138 #endif
139
140
/* IPv6 has scoped address types, i.e. link-local and site-local addresses.
 * A link-local address can carry a scope identifier naming the
 * interface/link the address is valid on (e.g. fe80::1%eth0). This scope
 * identifier is only meaningful on a single node.
 *
 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 * unless all nodes agree on the semantics. Apache e.g. regards zone
 * identifiers in the Host header as invalid requests, see
 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 *
 * Rewrites host in place: "[addr%zone]" or "[addr%zone]:port" loses its
 * "%zone" part. Anything that does not have exactly this shape is left
 * untouched.
 */
static void strip_ipv6_scope_id(char *host)
{
        char *scope, *cp;

        /* bbox wget actually handles IPv6 addresses without [], like
         * wget "http://::1/xxx", but this is not standard.
         * To save code, _here_ we do not support it. */

        if (host[0] != '[')
                return; /* not IPv6 */

        scope = strchr(host, '%');
        if (!scope)
                return;

        /* Remove the IPv6 zone identifier from the host address */
        cp = strchr(host, ']');
        if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
                /* malformed address (not "[xx]:nn" or "[xx]") */
                return;
        }

        /* A '%' located after the ']' (as in "[::1]:80%") is not a zone
         * identifier. Moving "]..." forward onto it would copy a string
         * onto its own tail and never reach the terminating NUL.
         */
        if (scope > cp)
                return;

        /* cp points to "]...", scope points to "%eth0]...".
         * The regions overlap, therefore memmove (NUL included).
         */
        memmove(scope, cp, strlen(cp) + 1);
}
176
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/*
 * Base64-encode str into G.wget_buf and return that buffer.
 * Over-long input is silently cut short so that the encoded form
 * fits; the result is overwritten by the next user of G.wget_buf.
 */
static char *base64enc(const char *str)
{
        /* paranoia: largest input we are willing to encode */
        const size_t cap = sizeof(G.wget_buf)/4*3 - 10;
        unsigned len;

        len = strlen(str);
        if (len > cap)
                len = cap;

        bb_uuencode(G.wget_buf, str, len, bb_uuenc_tbl_base64);
        return G.wget_buf;
}
#endif
188
/*
 * Cut s at its first byte below ' ' (any control character, or the
 * terminating NUL itself) and return s. Bytes >= 0x80 are kept.
 */
static char* sanitize_string(char *s)
{
        unsigned char *cur;

        for (cur = (unsigned char *) s; *cur >= ' '; cur++)
                continue;
        *cur = '\0';

        return s;
}
197
198 static FILE *open_socket(len_and_sockaddr *lsa)
199 {
200         FILE *fp;
201
202         /* glibc 2.4 seems to try seeking on it - ??! */
203         /* hopefully it understands what ESPIPE means... */
204         fp = fdopen(xconnect_stream(lsa), "r+");
205         if (fp == NULL)
206                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
207
208         return fp;
209 }
210
211 /* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
212 static char fgets_and_trim(FILE *fp)
213 {
214         char c;
215         char *buf_ptr;
216
217         if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
218                 bb_perror_msg_and_die("error getting response");
219
220         buf_ptr = strchrnul(G.wget_buf, '\n');
221         c = *buf_ptr;
222         *buf_ptr = '\0';
223         buf_ptr = strchrnul(G.wget_buf, '\r');
224         *buf_ptr = '\0';
225
226         log_io("< %s", G.wget_buf);
227
228         return c;
229 }
230
231 static int ftpcmd(const char *s1, const char *s2, FILE *fp)
232 {
233         int result;
234         if (s1) {
235                 if (!s2)
236                         s2 = "";
237                 fprintf(fp, "%s%s\r\n", s1, s2);
238                 fflush(fp);
239                 log_io("> %s%s", s1, s2);
240         }
241
242         do {
243                 fgets_and_trim(fp);
244         } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
245
246         G.wget_buf[3] = '\0';
247         result = xatoi_positive(G.wget_buf);
248         G.wget_buf[3] = ' ';
249         return result;
250 }
251
252 static void parse_url(const char *src_url, struct host_info *h)
253 {
254         char *url, *p, *sp;
255
256         free(h->allocated);
257         h->allocated = url = xstrdup(src_url);
258
259         if (strncmp(url, "http://", 7) == 0) {
260                 h->port = bb_lookup_port("http", "tcp", 80);
261                 h->host = url + 7;
262                 h->is_ftp = 0;
263         } else if (strncmp(url, "ftp://", 6) == 0) {
264                 h->port = bb_lookup_port("ftp", "tcp", 21);
265                 h->host = url + 6;
266                 h->is_ftp = 1;
267         } else
268                 bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
269
270         // FYI:
271         // "Real" wget 'http://busybox.net?var=a/b' sends this request:
272         //   'GET /?var=a/b HTTP 1.0'
273         //   and saves 'index.html?var=a%2Fb' (we save 'b')
274         // wget 'http://busybox.net?login=john@doe':
275         //   request: 'GET /?login=john@doe HTTP/1.0'
276         //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
277         // wget 'http://busybox.net#test/test':
278         //   request: 'GET / HTTP/1.0'
279         //   saves: 'index.html' (we save 'test')
280         //
281         // We also don't add unique .N suffix if file exists...
282         sp = strchr(h->host, '/');
283         p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
284         p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
285         if (!sp) {
286                 h->path = "";
287         } else if (*sp == '/') {
288                 *sp = '\0';
289                 h->path = sp + 1;
290         } else { // '#' or '?'
291                 // http://busybox.net?login=john@doe is a valid URL
292                 // memmove converts to:
293                 // http:/busybox.nett?login=john@doe...
294                 memmove(h->host - 1, h->host, sp - h->host);
295                 h->host--;
296                 sp[-1] = '\0';
297                 h->path = sp;
298         }
299
300         // We used to set h->user to NULL here, but this interferes
301         // with handling of code 302 ("object was moved")
302
303         sp = strrchr(h->host, '@');
304         if (sp != NULL) {
305                 // URL-decode "user:password" string before base64-encoding:
306                 // wget http://test:my%20pass@example.com should send
307                 // Authorization: Basic dGVzdDpteSBwYXNz
308                 // which decodes to "test:my pass".
309                 // Standard wget and curl do this too.
310                 *sp = '\0';
311                 h->user = percent_decode_in_place(h->host, /*strict:*/ 0);
312                 h->host = sp + 1;
313         }
314
315         sp = h->host;
316 }
317
318 static char *gethdr(FILE *fp)
319 {
320         char *s, *hdrval;
321         int c;
322
323         /* retrieve header line */
324         c = fgets_and_trim(fp);
325
326         /* end of the headers? */
327         if (G.wget_buf[0] == '\0')
328                 return NULL;
329
330         /* convert the header name to lower case */
331         for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.'; ++s) {
332                 /* tolower for "A-Z", no-op for "0-9a-z-." */
333                 *s |= 0x20;
334         }
335
336         /* verify we are at the end of the header name */
337         if (*s != ':')
338                 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
339
340         /* locate the start of the header value */
341         *s++ = '\0';
342         hdrval = skip_whitespace(s);
343
344         if (c != '\n') {
345                 /* Rats! The buffer isn't big enough to hold the entire header value */
346                 while (c = getc(fp), c != EOF && c != '\n')
347                         continue;
348         }
349
350         return hdrval;
351 }
352
353 static void reset_beg_range_to_zero(void)
354 {
355         bb_error_msg("restart failed");
356         G.beg_range = 0;
357         xlseek(G.output_fd, 0, SEEK_SET);
358         /* Done at the end instead: */
359         /* ftruncate(G.output_fd, 0); */
360 }
361
362 static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
363 {
364         FILE *sfp;
365         char *str;
366         int port;
367
368         if (!target->user)
369                 target->user = xstrdup("anonymous:busybox@");
370
371         sfp = open_socket(lsa);
372         if (ftpcmd(NULL, NULL, sfp) != 220)
373                 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
374
375         /*
376          * Splitting username:password pair,
377          * trying to log in
378          */
379         str = strchr(target->user, ':');
380         if (str)
381                 *str++ = '\0';
382         switch (ftpcmd("USER ", target->user, sfp)) {
383         case 230:
384                 break;
385         case 331:
386                 if (ftpcmd("PASS ", str, sfp) == 230)
387                         break;
388                 /* fall through (failed login) */
389         default:
390                 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
391         }
392
393         ftpcmd("TYPE I", NULL, sfp);
394
395         /*
396          * Querying file size
397          */
398         if (ftpcmd("SIZE ", target->path, sfp) == 213) {
399                 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
400                 if (G.content_len < 0 || errno) {
401                         bb_error_msg_and_die("SIZE value is garbage");
402                 }
403                 G.got_clen = 1;
404         }
405
406         /*
407          * Entering passive mode
408          */
409         if (ftpcmd("PASV", NULL, sfp) != 227) {
410  pasv_error:
411                 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
412         }
413         // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
414         // Server's IP is N1.N2.N3.N4 (we ignore it)
415         // Server's port for data connection is P1*256+P2
416         str = strrchr(G.wget_buf, ')');
417         if (str) str[0] = '\0';
418         str = strrchr(G.wget_buf, ',');
419         if (!str) goto pasv_error;
420         port = xatou_range(str+1, 0, 255);
421         *str = '\0';
422         str = strrchr(G.wget_buf, ',');
423         if (!str) goto pasv_error;
424         port += xatou_range(str+1, 0, 255) * 256;
425         set_nport(&lsa->u.sa, htons(port));
426
427         *dfpp = open_socket(lsa);
428
429         if (G.beg_range != 0) {
430                 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
431                 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
432                         G.content_len -= G.beg_range;
433                 else
434                         reset_beg_range_to_zero();
435         }
436
437         if (ftpcmd("RETR ", target->path, sfp) > 150)
438                 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
439
440         return sfp;
441 }
442
443 static void NOINLINE retrieve_file_data(FILE *dfp)
444 {
445 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
446 # if ENABLE_FEATURE_WGET_TIMEOUT
447         unsigned second_cnt;
448 # endif
449         struct pollfd polldata;
450
451 # if ENABLE_FEATURE_WGET_TIMEOUT
452         second_cnt = G.timeout_seconds;
453 # endif
454         polldata.fd = fileno(dfp);
455         polldata.events = POLLIN | POLLPRI;
456 #endif
457         progress_meter(PROGRESS_START);
458
459         if (G.chunked)
460                 goto get_clen;
461
462         /* Loops only if chunked */
463         while (1) {
464
465 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
466                 /* Must use nonblocking I/O, otherwise fread will loop
467                  * and *block* until it reads full buffer,
468                  * which messes up progress bar and/or timeout logic.
469                  * Because of nonblocking I/O, we need to dance
470                  * very carefully around EAGAIN. See explanation at
471                  * clearerr() call.
472                  */
473                 ndelay_on(polldata.fd);
474 #endif
475                 while (1) {
476                         int n;
477                         unsigned rdsz;
478
479                         rdsz = sizeof(G.wget_buf);
480                         if (G.got_clen) {
481                                 if (G.content_len < (off_t)sizeof(G.wget_buf)) {
482                                         if ((int)G.content_len <= 0)
483                                                 break;
484                                         rdsz = (unsigned)G.content_len;
485                                 }
486                         }
487
488 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
489                         if (safe_poll(&polldata, 1, 1000) == 0) {
490 # if ENABLE_FEATURE_WGET_TIMEOUT
491                                 if (second_cnt != 0 && --second_cnt == 0) {
492                                         progress_meter(PROGRESS_END);
493                                         bb_error_msg_and_die("download timed out");
494                                 }
495 # endif
496                                 /* Needed for "stalled" indicator */
497                                 progress_meter(PROGRESS_BUMP);
498                                 /*
499                                  * We used to loop back to poll here,
500                                  * but in chunked case, we can be here after
501                                  * fgets and it could buffer some data in dfp...
502                                  * which poll knows nothing about!
503                                  * Therefore let's try fread'ing anyway.
504                                  */
505                         }
506
507                         /* fread internally uses read loop, which in our case
508                          * is usually exited when we get EAGAIN.
509                          * In this case, libc sets error marker on the stream.
510                          * Need to clear it before next fread to avoid possible
511                          * rare false positive ferror below. Rare because usually
512                          * fread gets more than zero bytes, and we don't fall
513                          * into if (n <= 0) ...
514                          */
515                         clearerr(dfp);
516                         errno = 0;
517 #endif
518                         n = fread(G.wget_buf, 1, rdsz, dfp);
519                         /* man fread:
520                          * If error occurs, or EOF is reached, the return value
521                          * is a short item count (or zero).
522                          * fread does not distinguish between EOF and error.
523                          */
524                         if (n <= 0) {
525 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
526                                 if (errno == EAGAIN) /* poll lied, there is no data? */
527                                         continue; /* yes */
528 #endif
529                                 if (ferror(dfp))
530                                         bb_perror_msg_and_die(bb_msg_read_error);
531                                 break; /* EOF, not error */
532                         }
533
534                         xwrite(G.output_fd, G.wget_buf, n);
535 #if ENABLE_FEATURE_WGET_TIMEOUT
536                         second_cnt = G.timeout_seconds;
537 #endif
538 #if ENABLE_FEATURE_WGET_STATUSBAR
539                         G.transferred += n;
540                         progress_meter(PROGRESS_BUMP);
541 #endif
542                         if (G.got_clen) {
543                                 G.content_len -= n;
544                                 if (G.content_len == 0)
545                                         break;
546                         }
547                 }
548 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
549                 clearerr(dfp);
550                 ndelay_off(polldata.fd); /* else fgets can get very unhappy */
551 #endif
552                 if (!G.chunked)
553                         break;
554
555                 fgets_and_trim(dfp); /* Eat empty line */
556  get_clen:
557                 fgets_and_trim(dfp);
558                 G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
559                 /* FIXME: error check? */
560                 if (G.content_len == 0)
561                         break; /* all done! */
562                 G.got_clen = 1;
563         }
564
565         /* If -c failed, we restart from the beginning,
566          * but we do not truncate file then, we do it only now, at the end.
567          * This lets user to ^C if his 99% complete 10 GB file download
568          * failed to restart *without* losing the almost complete file.
569          */
570         {
571                 off_t pos = lseek(G.output_fd, 0, SEEK_CUR);
572                 if (pos != (off_t)-1)
573                         ftruncate(G.output_fd, pos);
574         }
575
576         /* Draw full bar and free its resources */
577         G.chunked = 0;  /* makes it show 100% even for chunked download */
578         G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
579         progress_meter(PROGRESS_END);
580 }
581
582 static void download_one_url(const char *url)
583 {
584         bool use_proxy;                 /* Use proxies if env vars are set  */
585         int redir_limit;
586         len_and_sockaddr *lsa;
587         FILE *sfp;                      /* socket to web/ftp server         */
588         FILE *dfp;                      /* socket to ftp server (data)      */
589         char *proxy = NULL;
590         char *fname_out_alloc;
591         char *redirected_path = NULL;
592         struct host_info server;
593         struct host_info target;
594
595         server.allocated = NULL;
596         target.allocated = NULL;
597         server.user = NULL;
598         target.user = NULL;
599
600         parse_url(url, &target);
601
602         /* Use the proxy if necessary */
603         use_proxy = (strcmp(G.proxy_flag, "off") != 0);
604         if (use_proxy) {
605                 proxy = getenv(target.is_ftp ? "ftp_proxy" : "http_proxy");
606                 use_proxy = (proxy && proxy[0]);
607                 if (use_proxy)
608                         parse_url(proxy, &server);
609         }
610         if (!use_proxy) {
611                 server.port = target.port;
612                 if (ENABLE_FEATURE_IPV6) {
613                         //free(server.allocated); - can't be non-NULL
614                         server.host = server.allocated = xstrdup(target.host);
615                 } else {
616                         server.host = target.host;
617                 }
618         }
619
620         if (ENABLE_FEATURE_IPV6)
621                 strip_ipv6_scope_id(target.host);
622
623         /* If there was no -O FILE, guess output filename */
624         fname_out_alloc = NULL;
625         if (!(option_mask32 & WGET_OPT_OUTNAME)) {
626                 G.fname_out = bb_get_last_path_component_nostrip(target.path);
627                 /* handle "wget http://kernel.org//" */
628                 if (G.fname_out[0] == '/' || !G.fname_out[0])
629                         G.fname_out = (char*)"index.html";
630                 /* -P DIR is considered only if there was no -O FILE */
631                 if (G.dir_prefix)
632                         G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
633                 else {
634                         /* redirects may free target.path later, need to make a copy */
635                         G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
636                 }
637         }
638 #if ENABLE_FEATURE_WGET_STATUSBAR
639         G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
640 #endif
641
642         /* Determine where to start transfer */
643         G.beg_range = 0;
644         if (option_mask32 & WGET_OPT_CONTINUE) {
645                 G.output_fd = open(G.fname_out, O_WRONLY);
646                 if (G.output_fd >= 0) {
647                         G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
648                 }
649                 /* File doesn't exist. We do not create file here yet.
650                  * We are not sure it exists on remote side */
651         }
652
653         redir_limit = 5;
654  resolve_lsa:
655         lsa = xhost2sockaddr(server.host, server.port);
656         if (!(option_mask32 & WGET_OPT_QUIET)) {
657                 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
658                 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
659                 free(s);
660         }
661  establish_session:
662         /*G.content_len = 0; - redundant, got_clen = 0 is enough */
663         G.got_clen = 0;
664         G.chunked = 0;
665         if (use_proxy || !target.is_ftp) {
666                 /*
667                  *  HTTP session
668                  */
669                 char *str;
670                 int status;
671
672
673                 /* Open socket to http server */
674                 sfp = open_socket(lsa);
675
676                 /* Send HTTP request */
677                 if (use_proxy) {
678                         fprintf(sfp, "GET %stp://%s/%s HTTP/1.1\r\n",
679                                 target.is_ftp ? "f" : "ht", target.host,
680                                 target.path);
681                 } else {
682                         if (option_mask32 & WGET_OPT_POST_DATA)
683                                 fprintf(sfp, "POST /%s HTTP/1.1\r\n", target.path);
684                         else
685                                 fprintf(sfp, "GET /%s HTTP/1.1\r\n", target.path);
686                 }
687
688                 fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
689                         target.host, G.user_agent);
690
691                 /* Ask server to close the connection as soon as we are done
692                  * (IOW: we do not intend to send more requests)
693                  */
694                 fprintf(sfp, "Connection: close\r\n");
695
696 #if ENABLE_FEATURE_WGET_AUTHENTICATION
697                 if (target.user) {
698                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
699                                 base64enc(target.user));
700                 }
701                 if (use_proxy && server.user) {
702                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
703                                 base64enc(server.user));
704                 }
705 #endif
706
707                 if (G.beg_range != 0)
708                         fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
709
710 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
711                 if (G.extra_headers)
712                         fputs(G.extra_headers, sfp);
713
714                 if (option_mask32 & WGET_OPT_POST_DATA) {
715                         fprintf(sfp,
716                                 "Content-Type: application/x-www-form-urlencoded\r\n"
717                                 "Content-Length: %u\r\n"
718                                 "\r\n"
719                                 "%s",
720                                 (int) strlen(G.post_data), G.post_data
721                         );
722                 } else
723 #endif
724                 {
725                         fprintf(sfp, "\r\n");
726                 }
727
728                 fflush(sfp);
729
730                 /*
731                  * Retrieve HTTP response line and check for "200" status code.
732                  */
733  read_response:
734                 fgets_and_trim(sfp);
735
736                 str = G.wget_buf;
737                 str = skip_non_whitespace(str);
738                 str = skip_whitespace(str);
739                 // FIXME: no error check
740                 // xatou wouldn't work: "200 OK"
741                 status = atoi(str);
742                 switch (status) {
743                 case 0:
744                 case 100:
745                         while (gethdr(sfp) != NULL)
746                                 /* eat all remaining headers */;
747                         goto read_response;
748                 case 200:
749 /*
750 Response 204 doesn't say "null file", it says "metadata
751 has changed but data didn't":
752
753 "10.2.5 204 No Content
754 The server has fulfilled the request but does not need to return
755 an entity-body, and might want to return updated metainformation.
756 The response MAY include new or updated metainformation in the form
757 of entity-headers, which if present SHOULD be associated with
758 the requested variant.
759
760 If the client is a user agent, it SHOULD NOT change its document
761 view from that which caused the request to be sent. This response
762 is primarily intended to allow input for actions to take place
763 without causing a change to the user agent's active document view,
764 although any new or updated metainformation SHOULD be applied
765 to the document currently in the user agent's active view.
766
767 The 204 response MUST NOT include a message-body, and thus
768 is always terminated by the first empty line after the header fields."
769
770 However, in real world it was observed that some web servers
771 (e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
772 */
773                 case 204:
774                         if (G.beg_range != 0) {
775                                 /* "Range:..." was not honored by the server.
776                                  * Restart download from the beginning.
777                                  */
778                                 reset_beg_range_to_zero();
779                         }
780                         break;
781                 case 300:  /* redirection */
782                 case 301:
783                 case 302:
784                 case 303:
785                         break;
786                 case 206: /* Partial Content */
787                         if (G.beg_range != 0)
788                                 /* "Range:..." worked. Good. */
789                                 break;
790                         /* Partial Content even though we did not ask for it??? */
791                         /* fall through */
792                 default:
793                         bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
794                 }
795
796                 /*
797                  * Retrieve HTTP headers.
798                  */
799                 while ((str = gethdr(sfp)) != NULL) {
800                         static const char keywords[] ALIGN1 =
801                                 "content-length\0""transfer-encoding\0""location\0";
802                         enum {
803                                 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
804                         };
805                         smalluint key;
806
807                         /* gethdr converted "FOO:" string to lowercase */
808
809                         /* strip trailing whitespace */
810                         char *s = strchrnul(str, '\0') - 1;
811                         while (s >= str && (*s == ' ' || *s == '\t')) {
812                                 *s = '\0';
813                                 s--;
814                         }
815                         key = index_in_strings(keywords, G.wget_buf) + 1;
816                         if (key == KEY_content_length) {
817                                 G.content_len = BB_STRTOOFF(str, NULL, 10);
818                                 if (G.content_len < 0 || errno) {
819                                         bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
820                                 }
821                                 G.got_clen = 1;
822                                 continue;
823                         }
824                         if (key == KEY_transfer_encoding) {
825                                 if (strcmp(str_tolower(str), "chunked") != 0)
826                                         bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
827                                 G.chunked = 1;
828                         }
829                         if (key == KEY_location && status >= 300) {
830                                 if (--redir_limit == 0)
831                                         bb_error_msg_and_die("too many redirections");
832                                 fclose(sfp);
833                                 if (str[0] == '/') {
834                                         free(redirected_path);
835                                         target.path = redirected_path = xstrdup(str+1);
836                                         /* lsa stays the same: it's on the same server */
837                                 } else {
838                                         parse_url(str, &target);
839                                         if (!use_proxy) {
840                                                 free(server.allocated);
841                                                 server.allocated = NULL;
842                                                 server.host = target.host;
843                                                 /* strip_ipv6_scope_id(target.host); - no! */
844                                                 /* we assume remote never gives us IPv6 addr with scope id */
845                                                 server.port = target.port;
846                                                 free(lsa);
847                                                 goto resolve_lsa;
848                                         } /* else: lsa stays the same: we use proxy */
849                                 }
850                                 goto establish_session;
851                         }
852                 }
853 //              if (status >= 300)
854 //                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
855
856                 /* For HTTP, data is pumped over the same connection */
857                 dfp = sfp;
858
859         } else {
860                 /*
861                  *  FTP session
862                  */
863                 sfp = prepare_ftp_session(&dfp, &target, lsa);
864         }
865
866         free(lsa);
867
868         if (!(option_mask32 & WGET_OPT_SPIDER)) {
869                 if (G.output_fd < 0)
870                         G.output_fd = xopen(G.fname_out, G.o_flags);
871                 retrieve_file_data(dfp);
872                 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
873                         xclose(G.output_fd);
874                         G.output_fd = -1;
875                 }
876         }
877
878         if (dfp != sfp) {
879                 /* It's ftp. Close data connection properly */
880                 fclose(dfp);
881                 if (ftpcmd(NULL, NULL, sfp) != 226)
882                         bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
883                 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
884         }
885         fclose(sfp);
886
887         free(server.allocated);
888         free(target.allocated);
889         free(fname_out_alloc);
890         free(redirected_path);
891 }
892
893 int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
894 int wget_main(int argc UNUSED_PARAM, char **argv)
895 {
896 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
897         static const char wget_longopts[] ALIGN1 =
898                 /* name, has_arg, val */
899                 "continue\0"         No_argument       "c"
900 //FIXME: -s isn't --spider, it's --save-headers!
901                 "spider\0"           No_argument       "s"
902                 "quiet\0"            No_argument       "q"
903                 "output-document\0"  Required_argument "O"
904                 "directory-prefix\0" Required_argument "P"
905                 "proxy\0"            Required_argument "Y"
906                 "user-agent\0"       Required_argument "U"
907 #if ENABLE_FEATURE_WGET_TIMEOUT
908                 "timeout\0"          Required_argument "T"
909 #endif
910                 /* Ignored: */
911                 // "tries\0"            Required_argument "t"
912                 /* Ignored (we always use PASV): */
913                 "passive-ftp\0"      No_argument       "\xff"
914                 "header\0"           Required_argument "\xfe"
915                 "post-data\0"        Required_argument "\xfd"
916                 /* Ignored (we don't do ssl) */
917                 "no-check-certificate\0" No_argument   "\xfc"
918                 /* Ignored (we don't support caching) */
919                 "no-cache\0"         No_argument       "\xfb"
920                 ;
921 #endif
922
923 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
924         llist_t *headers_llist = NULL;
925 #endif
926
927         INIT_G();
928
929         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;)
930         G.proxy_flag = "on";   /* use proxies if env vars are set */
931         G.user_agent = "Wget"; /* "User-Agent" header field */
932
933 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
934         applet_long_options = wget_longopts;
935 #endif
936         opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
937         getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
938                 &G.fname_out, &G.dir_prefix,
939                 &G.proxy_flag, &G.user_agent,
940                 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
941                 NULL /* -t RETRIES */
942                 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
943                 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
944         );
945         argv += optind;
946
947 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
948         if (headers_llist) {
949                 int size = 1;
950                 char *cp;
951                 llist_t *ll = headers_llist;
952                 while (ll) {
953                         size += strlen(ll->data) + 2;
954                         ll = ll->link;
955                 }
956                 G.extra_headers = cp = xmalloc(size);
957                 while (headers_llist) {
958                         cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
959                 }
960         }
961 #endif
962
963         G.output_fd = -1;
964         G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
965         if (G.fname_out) { /* -O FILE ? */
966                 if (LONE_DASH(G.fname_out)) { /* -O - ? */
967                         G.output_fd = 1;
968                         option_mask32 &= ~WGET_OPT_CONTINUE;
969                 }
970                 /* compat with wget: -O FILE can overwrite */
971                 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
972         }
973
974         while (*argv)
975                 download_one_url(*argv++);
976
977         if (G.output_fd >= 0)
978                 xclose(G.output_fd);
979
980         return EXIT_SUCCESS;
981 }