wget: URL-decode user:password before base64-encoding it into auth hdr. Closes 3625.
[oweals/busybox.git] / networking / wget.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wget - retrieve a file using HTTP or FTP
4  *
5  * Chip Rosenthal Covad Communications <chip@laserlink.net>
6  * Licensed under GPLv2, see file LICENSE in this source tree.
7  *
8  * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
9  * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
10  */
11
12 //usage:#define wget_trivial_usage
13 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
14 //usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
15 //usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
16 //usage:       "        [--no-check-certificate] [-U|--user-agent AGENT]"
17 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
18 //usage:        )
19 //usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
20 //usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
21 //usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
22 //usage:        )
23 //usage:#define wget_full_usage "\n\n"
24 //usage:       "Retrieve files via HTTP or FTP\n"
25 //usage:     "\n        -s      Spider mode - only check file existence"
26 //usage:     "\n        -c      Continue retrieval of aborted transfer"
27 //usage:     "\n        -q      Quiet"
28 //usage:     "\n        -P DIR  Save to DIR (default .)"
29 //usage:        IF_FEATURE_WGET_TIMEOUT(
30 //usage:     "\n        -T SEC  Network read timeout is SEC seconds"
31 //usage:        )
32 //usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
33 //usage:     "\n        -U STR  Use STR for User-Agent header"
34 //usage:     "\n        -Y      Use proxy ('on' or 'off')"
35
36 #include "libbb.h"
37
38 //#define log_io(...) bb_error_msg(__VA_ARGS__)
39 #define log_io(...) ((void)0)
40
41
/* Pieces of one parsed URL; filled in by parse_url(). */
42 struct host_info {
43         char *allocated;        /* xstrdup'ed copy of the URL; parse_url() frees the previous copy before storing a new one */
44         const char *path;       /* what follows the host (a leading '/' is dropped); "" if nothing follows */
45         const char *user;       /* URL-decoded "user:password" found before '@'; left untouched by parse_url() when the URL has no '@' */
46         char       *host;       /* host part of the URL (after any "user@"); parse_url() points it into 'allocated' */
47         int         port;       /* set by parse_url() from bb_lookup_port("http"/"ftp", "tcp", 80/21) */
48         smallint    is_ftp;     /* 1 for "ftp://" URLs, 0 for "http://" */
49 };
50
51
52 /* Globals */
/* All per-process state of the applet, reached through the G macro below. */
53 struct globals {
54         off_t content_len;        /* Content-length of the file; retrieve_file_data() counts it down as data arrives */
55         off_t beg_range;          /* Range at which continue begins */
56 #if ENABLE_FEATURE_WGET_STATUSBAR
57         off_t transferred;        /* Number of bytes transferred so far */
58         const char *curfile;      /* Name of current file being transferred */
59         bb_progress_t pmt;        /* state handed to bb_progress_init/update/free */
60 #endif
61         char *dir_prefix;         /* directory prepended to the guessed output name (see "-P DIR") */
62 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
63         char *post_data;          /* request body sent when WGET_OPT_POST_DATA is set */
64         char *extra_headers;      /* written verbatim into the HTTP request with fputs() */
65 #endif
66         char *fname_out;        /* where to direct output (-O) */
67         const char *proxy_flag; /* Use proxies if env vars are set */
68         const char *user_agent; /* "User-Agent" header field */
69 #if ENABLE_FEATURE_WGET_TIMEOUT
70         unsigned timeout_seconds; /* INIT_G() sets 900; 0 disables the timeout check in retrieve_file_data() */
71 #endif
72         int output_fd;            /* fd that retrieve_file_data() writes the downloaded data to */
73         int o_flags;              /* NOTE(review): not used in the visible part of the file; presumably open() flags for the output file - confirm */
74         smallint chunked;         /* chunked transfer encoding */
75         smallint got_clen;        /* got content-length: from server  */
76         /* Local downloads do benefit from big buffer.
77          * With 512 byte buffer, it was measured to be
78          * an order of magnitude slower than with big one.
79          */
80         uint64_t just_to_align_next_member;
81         char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
82 } FIX_ALIASING;
/* G is the globals struct itself; INIT_G() allocates it zero-filled and sets the default timeout */
83 #define G (*ptr_to_globals)
84 #define INIT_G() do { \
85         SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
86         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;) \
87 } while (0)
88
89
90 /* Must match option string! */
/* Bit masks tested against option_mask32, one per command-line option. */
91 enum {
92         WGET_OPT_CONTINUE   = (1 << 0),
93         WGET_OPT_SPIDER     = (1 << 1),
94         WGET_OPT_QUIET      = (1 << 2),
95         WGET_OPT_OUTNAME    = (1 << 3),
96         WGET_OPT_PREFIX     = (1 << 4),
97         WGET_OPT_PROXY      = (1 << 5),
98         WGET_OPT_USER_AGENT = (1 << 6),
99         WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
100         WGET_OPT_RETRIES    = (1 << 8),
101         WGET_OPT_PASSIVE    = (1 << 9),
        /* The two masks below evaluate to 0 when ENABLE_FEATURE_WGET_LONG_OPTIONS is 0,
         * so tests against them are then always false. */
102         WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
103         WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
104 };
105
/* Values accepted by progress_meter(). */
106 enum {
107         PROGRESS_START = -1,  /* initialize the progress state, then draw */
108         PROGRESS_END   = 0,   /* draw, then free the progress state and print a newline */
109         PROGRESS_BUMP  = 1,   /* just redraw */
110 };
111 #if ENABLE_FEATURE_WGET_STATUSBAR
112 static void progress_meter(int flag)
113 {
114         if (option_mask32 & WGET_OPT_QUIET)
115                 return;
116
117         if (flag == PROGRESS_START)
118                 bb_progress_init(&G.pmt, G.curfile);
119
120         bb_progress_update(&G.pmt,
121                         G.beg_range,
122                         G.transferred,
123                         (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
124         );
125
126         if (flag == PROGRESS_END) {
127                 bb_progress_free(&G.pmt);
128                 bb_putchar_stderr('\n');
129                 G.transferred = 0;
130         }
131 }
132 #else
133 static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
134 #endif
135
136
137 /* IPv6 knows scoped address types i.e. link and site local addresses. Link
138  * local addresses can have a scope identifier to specify the
139  * interface/link an address is valid on (e.g. fe80::1%eth0). This scope
140  * identifier is only valid on a single node.
141  *
142  * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
143  * unless all nodes agree on the semantic. Apache e.g. regards zone identifiers
144  * in the Host header as invalid requests, see
145  * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
146  */
/*
 * Remove the zone identifier from a bracketed IPv6 host, in place:
 * "[fe80::1%eth0]:80" becomes "[fe80::1]:80".
 *
 * The string is left untouched when it does not start with '[',
 * contains no '%', is not of the form "[xx]" or "[xx]:nn",
 * or has its first '%' after the closing ']'.
 */
static void strip_ipv6_scope_id(char *host)
{
	char *scope, *cp;

	/* bbox wget actually handles IPv6 addresses without [], like
	 * wget "http://::1/xxx", but this is not standard.
	 * To save code, _here_ we do not support it. */

	if (host[0] != '[')
		return; /* not IPv6 */

	scope = strchr(host, '%');
	if (!scope)
		return;

	cp = strchr(host, ']');
	if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
		/* malformed address (not "[xx]:nn" or "[xx]") */
		return;
	}

	/* A '%' located after ']' is not a zone identifier. Copying
	 * "]..." forward onto it would lengthen the string and write
	 * past its end, so leave such input alone. */
	if (scope > cp)
		return;

	/* cp points to "]...", scope points to "%eth0]...":
	 * slide the tail (including its NUL) down over the zone id */
	memmove(scope, cp, strlen(cp) + 1);
}
172
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/*
 * Base64-encode 'str' into G.wget_buf and return that buffer.
 * Over-long input is silently cut so the result fits the buffer.
 */
static char *base64enc(const char *str)
{
	const size_t limit = sizeof(G.wget_buf)/4*3 - 10; /* paranoia */
	unsigned srclen = strlen(str);

	if (srclen > limit)
		srclen = limit;
	bb_uuencode(G.wget_buf, str, srclen, bb_uuenc_tbl_base64);
	return G.wget_buf;
}
#endif
184
/*
 * Cut 's' at its first byte below ' ' (any control character, or the
 * terminating NUL itself) and return 's'. Bytes >= 0x80 are kept.
 */
static char* sanitize_string(char *s)
{
	char *end = s;

	while ((unsigned char)*end >= ' ')
		end++;
	*end = '\0';
	return s;
}
193
194 static FILE *open_socket(len_and_sockaddr *lsa)
195 {
196         FILE *fp;
197
198         /* glibc 2.4 seems to try seeking on it - ??! */
199         /* hopefully it understands what ESPIPE means... */
200         fp = fdopen(xconnect_stream(lsa), "r+");
201         if (fp == NULL)
202                 bb_perror_msg_and_die(bb_msg_memory_exhausted);
203
204         return fp;
205 }
206
207 /* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
208 static char fgets_and_trim(FILE *fp)
209 {
210         char c;
211         char *buf_ptr;
212
213         if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
214                 bb_perror_msg_and_die("error getting response");
215
216         buf_ptr = strchrnul(G.wget_buf, '\n');
217         c = *buf_ptr;
218         *buf_ptr = '\0';
219         buf_ptr = strchrnul(G.wget_buf, '\r');
220         *buf_ptr = '\0';
221
222         log_io("< %s", G.wget_buf);
223
224         return c;
225 }
226
227 static int ftpcmd(const char *s1, const char *s2, FILE *fp)
228 {
229         int result;
230         if (s1) {
231                 if (!s2)
232                         s2 = "";
233                 fprintf(fp, "%s%s\r\n", s1, s2);
234                 fflush(fp);
235                 log_io("> %s%s", s1, s2);
236         }
237
238         do {
239                 fgets_and_trim(fp);
240         } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
241
242         G.wget_buf[3] = '\0';
243         result = xatoi_positive(G.wget_buf);
244         G.wget_buf[3] = ' ';
245         return result;
246 }
247
248 static void parse_url(const char *src_url, struct host_info *h)
249 {
250         char *url, *p, *sp;
251
252         free(h->allocated);
253         h->allocated = url = xstrdup(src_url);
254
255         if (strncmp(url, "http://", 7) == 0) {
256                 h->port = bb_lookup_port("http", "tcp", 80);
257                 h->host = url + 7;
258                 h->is_ftp = 0;
259         } else if (strncmp(url, "ftp://", 6) == 0) {
260                 h->port = bb_lookup_port("ftp", "tcp", 21);
261                 h->host = url + 6;
262                 h->is_ftp = 1;
263         } else
264                 bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
265
266         // FYI:
267         // "Real" wget 'http://busybox.net?var=a/b' sends this request:
268         //   'GET /?var=a/b HTTP 1.0'
269         //   and saves 'index.html?var=a%2Fb' (we save 'b')
270         // wget 'http://busybox.net?login=john@doe':
271         //   request: 'GET /?login=john@doe HTTP/1.0'
272         //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
273         // wget 'http://busybox.net#test/test':
274         //   request: 'GET / HTTP/1.0'
275         //   saves: 'index.html' (we save 'test')
276         //
277         // We also don't add unique .N suffix if file exists...
278         sp = strchr(h->host, '/');
279         p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
280         p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
281         if (!sp) {
282                 h->path = "";
283         } else if (*sp == '/') {
284                 *sp = '\0';
285                 h->path = sp + 1;
286         } else { // '#' or '?'
287                 // http://busybox.net?login=john@doe is a valid URL
288                 // memmove converts to:
289                 // http:/busybox.nett?login=john@doe...
290                 memmove(h->host - 1, h->host, sp - h->host);
291                 h->host--;
292                 sp[-1] = '\0';
293                 h->path = sp;
294         }
295
296         // We used to set h->user to NULL here, but this interferes
297         // with handling of code 302 ("object was moved")
298
299         sp = strrchr(h->host, '@');
300         if (sp != NULL) {
301                 // URL-decode "user:password" string before base64-encoding:
302                 // wget http://test:my%20pass@example.com should send
303                 // Authorization: Basic dGVzdDpteSBwYXNz
304                 // which decodes to "test:my pass".
305                 // Standard wget and curl do this too.
306                 *sp = '\0';
307                 h->user = percent_decode_in_place(h->host, /*strict:*/ 0);
308                 h->host = sp + 1;
309         }
310
311         sp = h->host;
312 }
313
314 static char *gethdr(FILE *fp)
315 {
316         char *s, *hdrval;
317         int c;
318
319         /* *istrunc = 0; */
320
321         /* retrieve header line */
322         c = fgets_and_trim(fp);
323
324         /* end of the headers? */
325         if (G.wget_buf[0] == '\0')
326                 return NULL;
327
328         /* convert the header name to lower case */
329         for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.'; ++s) {
330                 /* tolower for "A-Z", no-op for "0-9a-z-." */
331                 *s |= 0x20;
332         }
333
334         /* verify we are at the end of the header name */
335         if (*s != ':')
336                 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
337
338         /* locate the start of the header value */
339         *s++ = '\0';
340         hdrval = skip_whitespace(s);
341
342         if (c != '\n') {
343                 /* Rats! The buffer isn't big enough to hold the entire header value */
344                 while (c = getc(fp), c != EOF && c != '\n')
345                         continue;
346         }
347
348         return hdrval;
349 }
350
/*
 * Set up an FTP download of target->path from the server at 'lsa'.
 *
 * Opens the control connection, logs in (as "anonymous" when the URL gave
 * no credentials), switches to TYPE I, asks for the file size, enters
 * passive mode, opens the data connection into *dfpp, optionally requests
 * a restart at G.beg_range, and finally sends RETR.
 *
 * Returns the control stream. Dies on any reply it does not accept.
 * Side effects: the port stored in 'lsa' is overwritten with the data
 * port; target->user is split in place at ':'; G.content_len / G.got_clen
 * are set when the server answers SIZE with 213.
 */
351 static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
352 {
353         FILE *sfp;
354         char *str;
355         int port;
356
357         if (!target->user)
358                 target->user = xstrdup("anonymous:busybox@");
359
360         sfp = open_socket(lsa);
            /* no command sent: just wait for the greeting, which must be 220 */
361         if (ftpcmd(NULL, NULL, sfp) != 220)
362                 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
363
364         /*
365          * Splitting username:password pair,
366          * trying to log in
367          */
368         str = strchr(target->user, ':');
369         if (str)
370                 *str++ = '\0'; /* str is now the password, or NULL if there was no ':' */
371         switch (ftpcmd("USER ", target->user, sfp)) {
372         case 230:
373                 break;
374         case 331:
375                 if (ftpcmd("PASS ", str, sfp) == 230)
376                         break;
377                 /* fall through (failed login) */
378         default:
379                 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
380         }
381
            /* reply to TYPE I is not checked */
382         ftpcmd("TYPE I", NULL, sfp);
383
384         /*
385          * Querying file size
386          */
387         if (ftpcmd("SIZE ", target->path, sfp) == 213) {
                    /* the size text starts after the 3-digit code and the space */
388                 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
389                 if (G.content_len < 0 || errno) {
390                         bb_error_msg_and_die("SIZE value is garbage");
391                 }
392                 G.got_clen = 1;
393         }
394
395         /*
396          * Entering passive mode
397          */
398         if (ftpcmd("PASV", NULL, sfp) != 227) {
399  pasv_error:
400                 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
401         }
402         // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
403         // Server's IP is N1.N2.N3.N4 (we ignore it)
404         // Server's port for data connection is P1*256+P2
            // The reply is parsed from its end: cut at the last ')', then
            // peel off the last two comma-separated numbers (P2 first, then P1).
405         str = strrchr(G.wget_buf, ')');
406         if (str) str[0] = '\0';
407         str = strrchr(G.wget_buf, ',');
408         if (!str) goto pasv_error;
409         port = xatou_range(str+1, 0, 255);
410         *str = '\0';
411         str = strrchr(G.wget_buf, ',');
412         if (!str) goto pasv_error;
413         port += xatou_range(str+1, 0, 255) * 256;
            /* reuse lsa for the data connection: same address, new port */
414         set_nport(&lsa->u.sa, htons(port));
415
416         *dfpp = open_socket(lsa);
417
418         if (G.beg_range) {
419                 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
                    /* only when the server accepts the restart (350) is the
                     * remaining length reduced by the part we already have */
420                 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
421                         G.content_len -= G.beg_range;
422         }
423
            /* any reply code above 150 is treated as failure */
424         if (ftpcmd("RETR ", target->path, sfp) > 150)
425                 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
426
427         return sfp;
428 }
429
/*
 * Copy the response body from 'dfp' to G.output_fd.
 *
 * Reads until G.content_len bytes have arrived (when G.got_clen is set)
 * or until EOF. With G.chunked set, the body is read chunk by chunk,
 * each chunk preceded by a hexadecimal size line; a size of 0 ends the
 * transfer. When the status bar or timeout feature is enabled, reads are
 * done in non-blocking mode behind a 1-second poll loop that drives the
 * progress meter and the timeout. Dies on read error or timeout.
 */
430 static void NOINLINE retrieve_file_data(FILE *dfp)
431 {
432 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
433 # if ENABLE_FEATURE_WGET_TIMEOUT
434         unsigned second_cnt;
435 # endif
436         struct pollfd polldata;
437
438         polldata.fd = fileno(dfp);
439         polldata.events = POLLIN | POLLPRI;
440 #endif
441         progress_meter(PROGRESS_START);
442
            /* chunked: enter the loop at the point where a chunk size is read */
443         if (G.chunked)
444                 goto get_clen;
445
446         /* Loops only if chunked */
447         while (1) {
448
449 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
450                 /* Must use nonblocking I/O, otherwise fread will loop
451                  * and *block* until it reads full buffer,
452                  * which messes up progress bar and/or timeout logic.
453                  * Because of nonblocking I/O, we need to dance
454                  * very carefully around EAGAIN. See explanation at
455                  * clearerr() call.
456                  */
457                 ndelay_on(polldata.fd);
458 #endif
                    /* inner loop: one iteration per buffer-full of data */
459                 while (1) {
460                         int n;
461                         unsigned rdsz;
462
                            /* read a full buffer, or only what is left when the
                             * remaining length is known and smaller than the buffer */
463                         rdsz = sizeof(G.wget_buf);
464                         if (G.got_clen) {
465                                 if (G.content_len < (off_t)sizeof(G.wget_buf)) {
466                                         if ((int)G.content_len <= 0)
467                                                 break;
468                                         rdsz = (unsigned)G.content_len;
469                                 }
470                         }
471
472 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
473 # if ENABLE_FEATURE_WGET_TIMEOUT
                            /* the countdown restarts before every read attempt */
474                         second_cnt = G.timeout_seconds;
475 # endif
476                         while (1) {
477                                 if (safe_poll(&polldata, 1, 1000) != 0)
478                                         break; /* error, EOF, or data is available */
479 # if ENABLE_FEATURE_WGET_TIMEOUT
                                    /* second_cnt == 0 means "no timeout": never counted down */
480                                 if (second_cnt != 0 && --second_cnt == 0) {
481                                         progress_meter(PROGRESS_END);
482                                         bb_error_msg_and_die("download timed out");
483                                 }
484 # endif
485                                 /* Needed for "stalled" indicator */
486                                 progress_meter(PROGRESS_BUMP);
487                         }
488
489                         /* fread internally uses read loop, which in our case
490                          * is usually exited when we get EAGAIN.
491                          * In this case, libc sets error marker on the stream.
492                          * Need to clear it before next fread to avoid possible
493                          * rare false positive ferror below. Rare because usually
494                          * fread gets more than zero bytes, and we don't fall
495                          * into if (n <= 0) ...
496                          */
497                         clearerr(dfp);
498                         errno = 0;
499 #endif
500                         n = fread(G.wget_buf, 1, rdsz, dfp);
501                         /* man fread:
502                          * If error occurs, or EOF is reached, the return value
503                          * is a short item count (or zero).
504                          * fread does not distinguish between EOF and error.
505                          */
506                         if (n <= 0) {
507 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
508                                 if (errno == EAGAIN) /* poll lied, there is no data? */
509                                         continue; /* yes */
510 #endif
511                                 if (ferror(dfp))
512                                         bb_perror_msg_and_die(bb_msg_read_error);
513                                 break; /* EOF, not error */
514                         }
515
516                         xwrite(G.output_fd, G.wget_buf, n);
517
518 #if ENABLE_FEATURE_WGET_STATUSBAR
519                         G.transferred += n;
520                         progress_meter(PROGRESS_BUMP);
521 #endif
                            /* count down the remaining length; 0 ends this body/chunk */
522                         if (G.got_clen) {
523                                 G.content_len -= n;
524                                 if (G.content_len == 0)
525                                         break;
526                         }
527                 }
528 #if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
529                 clearerr(dfp);
530                 ndelay_off(polldata.fd); /* else fgets can get very unhappy */
531 #endif
532                 if (!G.chunked)
533                         break;
534
535                 fgets_and_trim(dfp); /* Eat empty line */
536  get_clen:
                    /* next chunk: its size is a hexadecimal number on its own line */
537                 fgets_and_trim(dfp);
538                 G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
539                 /* FIXME: error check? */
540                 if (G.content_len == 0)
541                         break; /* all done! */
542                 G.got_clen = 1;
543         }
544
545         /* Draw full bar and free its resources */
546         G.chunked = 0;  /* makes it show 100% even for chunked download */
547         G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
548         progress_meter(PROGRESS_END);
549 }
550
551 static void download_one_url(const char *url)
552 {
553         bool use_proxy;                 /* Use proxies if env vars are set  */
554         int redir_limit;
555         len_and_sockaddr *lsa;
556         FILE *sfp;                      /* socket to web/ftp server         */
557         FILE *dfp;                      /* socket to ftp server (data)      */
558         char *proxy = NULL;
559         char *fname_out_alloc;
560         struct host_info server;
561         struct host_info target;
562
563         server.allocated = NULL;
564         target.allocated = NULL;
565         server.user = NULL;
566         target.user = NULL;
567
568         parse_url(url, &target);
569
570         /* Use the proxy if necessary */
571         use_proxy = (strcmp(G.proxy_flag, "off") != 0);
572         if (use_proxy) {
573                 proxy = getenv(target.is_ftp ? "ftp_proxy" : "http_proxy");
574                 use_proxy = (proxy && proxy[0]);
575                 if (use_proxy)
576                         parse_url(proxy, &server);
577         }
578         if (!use_proxy) {
579                 server.port = target.port;
580                 if (ENABLE_FEATURE_IPV6) {
581                         //free(server.allocated); - can't be non-NULL
582                         server.host = server.allocated = xstrdup(target.host);
583                 } else {
584                         server.host = target.host;
585                 }
586         }
587
588         if (ENABLE_FEATURE_IPV6)
589                 strip_ipv6_scope_id(target.host);
590
591         /* If there was no -O FILE, guess output filename */
592         fname_out_alloc = NULL;
593         if (!(option_mask32 & WGET_OPT_OUTNAME)) {
594                 G.fname_out = bb_get_last_path_component_nostrip(target.path);
595                 /* handle "wget http://kernel.org//" */
596                 if (G.fname_out[0] == '/' || !G.fname_out[0])
597                         G.fname_out = (char*)"index.html";
598                 /* -P DIR is considered only if there was no -O FILE */
599                 else {
600                         if (G.dir_prefix)
601                                 G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
602                         else {
603                                 /* redirects may free target.path later, need to make a copy */
604                                 G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
605                         }
606                 }
607         }
608 #if ENABLE_FEATURE_WGET_STATUSBAR
609         G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
610 #endif
611
612         /* Determine where to start transfer */
613         G.beg_range = 0;
614         if (option_mask32 & WGET_OPT_CONTINUE) {
615                 G.output_fd = open(G.fname_out, O_WRONLY);
616                 if (G.output_fd >= 0) {
617                         G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
618                 }
619                 /* File doesn't exist. We do not create file here yet.
620                  * We are not sure it exists on remote side */
621         }
622
623         redir_limit = 5;
624  resolve_lsa:
625         lsa = xhost2sockaddr(server.host, server.port);
626         if (!(option_mask32 & WGET_OPT_QUIET)) {
627                 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
628                 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
629                 free(s);
630         }
631  establish_session:
632         /*G.content_len = 0; - redundant, got_clen = 0 is enough */
633         G.got_clen = 0;
634         G.chunked = 0;
635         if (use_proxy || !target.is_ftp) {
636                 /*
637                  *  HTTP session
638                  */
639                 char *str;
640                 int status;
641
642
643                 /* Open socket to http server */
644                 sfp = open_socket(lsa);
645
646                 /* Send HTTP request */
647                 if (use_proxy) {
648                         fprintf(sfp, "GET %stp://%s/%s HTTP/1.1\r\n",
649                                 target.is_ftp ? "f" : "ht", target.host,
650                                 target.path);
651                 } else {
652                         if (option_mask32 & WGET_OPT_POST_DATA)
653                                 fprintf(sfp, "POST /%s HTTP/1.1\r\n", target.path);
654                         else
655                                 fprintf(sfp, "GET /%s HTTP/1.1\r\n", target.path);
656                 }
657
658                 fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
659                         target.host, G.user_agent);
660
661                 /* Ask server to close the connection as soon as we are done
662                  * (IOW: we do not intend to send more requests)
663                  */
664                 fprintf(sfp, "Connection: close\r\n");
665
666 #if ENABLE_FEATURE_WGET_AUTHENTICATION
667                 if (target.user) {
668                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
669                                 base64enc(target.user));
670                 }
671                 if (use_proxy && server.user) {
672                         fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
673                                 base64enc(server.user));
674                 }
675 #endif
676
677                 if (G.beg_range)
678                         fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
679
680 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
681                 if (G.extra_headers)
682                         fputs(G.extra_headers, sfp);
683
684                 if (option_mask32 & WGET_OPT_POST_DATA) {
685                         fprintf(sfp,
686                                 "Content-Type: application/x-www-form-urlencoded\r\n"
687                                 "Content-Length: %u\r\n"
688                                 "\r\n"
689                                 "%s",
690                                 (int) strlen(G.post_data), G.post_data
691                         );
692                 } else
693 #endif
694                 {
695                         fprintf(sfp, "\r\n");
696                 }
697
698                 fflush(sfp);
699
700                 /*
701                  * Retrieve HTTP response line and check for "200" status code.
702                  */
703  read_response:
704                 fgets_and_trim(sfp);
705
706                 str = G.wget_buf;
707                 str = skip_non_whitespace(str);
708                 str = skip_whitespace(str);
709                 // FIXME: no error check
710                 // xatou wouldn't work: "200 OK"
711                 status = atoi(str);
712                 switch (status) {
713                 case 0:
714                 case 100:
715                         while (gethdr(sfp) != NULL)
716                                 /* eat all remaining headers */;
717                         goto read_response;
718                 case 200:
719 /*
720 Response 204 doesn't say "null file", it says "metadata
721 has changed but data didn't":
722
723 "10.2.5 204 No Content
724 The server has fulfilled the request but does not need to return
725 an entity-body, and might want to return updated metainformation.
726 The response MAY include new or updated metainformation in the form
727 of entity-headers, which if present SHOULD be associated with
728 the requested variant.
729
730 If the client is a user agent, it SHOULD NOT change its document
731 view from that which caused the request to be sent. This response
732 is primarily intended to allow input for actions to take place
733 without causing a change to the user agent's active document view,
734 although any new or updated metainformation SHOULD be applied
735 to the document currently in the user agent's active view.
736
737 The 204 response MUST NOT include a message-body, and thus
738 is always terminated by the first empty line after the header fields."
739
740 However, in real world it was observed that some web servers
741 (e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
742 */
743                 case 204:
744                         break;
745                 case 300:  /* redirection */
746                 case 301:
747                 case 302:
748                 case 303:
749                         break;
750                 case 206:
751                         if (G.beg_range)
752                                 break;
753                         /* fall through */
754                 default:
755                         bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
756                 }
757
758                 /*
759                  * Retrieve HTTP headers.
760                  */
761                 while ((str = gethdr(sfp)) != NULL) {
762                         static const char keywords[] ALIGN1 =
763                                 "content-length\0""transfer-encoding\0""location\0";
764                         enum {
765                                 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
766                         };
767                         smalluint key;
768
769                         /* gethdr converted "FOO:" string to lowercase */
770
771                         /* strip trailing whitespace */
772                         char *s = strchrnul(str, '\0') - 1;
773                         while (s >= str && (*s == ' ' || *s == '\t')) {
774                                 *s = '\0';
775                                 s--;
776                         }
777                         key = index_in_strings(keywords, G.wget_buf) + 1;
778                         if (key == KEY_content_length) {
779                                 G.content_len = BB_STRTOOFF(str, NULL, 10);
780                                 if (G.content_len < 0 || errno) {
781                                         bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
782                                 }
783                                 G.got_clen = 1;
784                                 continue;
785                         }
786                         if (key == KEY_transfer_encoding) {
787                                 if (strcmp(str_tolower(str), "chunked") != 0)
788                                         bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
789                                 G.chunked = 1;
790                         }
791                         if (key == KEY_location && status >= 300) {
792                                 if (--redir_limit == 0)
793                                         bb_error_msg_and_die("too many redirections");
794                                 fclose(sfp);
795                                 if (str[0] == '/') {
796                                         free(target.allocated);
797                                         target.path = target.allocated = xstrdup(str+1);
798                                         /* lsa stays the same: it's on the same server */
799                                 } else {
800                                         parse_url(str, &target);
801                                         if (!use_proxy) {
802                                                 free(server.allocated);
803                                                 server.allocated = NULL;
804                                                 server.host = target.host;
805                                                 /* strip_ipv6_scope_id(target.host); - no! */
806                                                 /* we assume remote never gives us IPv6 addr with scope id */
807                                                 server.port = target.port;
808                                                 free(lsa);
809                                                 goto resolve_lsa;
810                                         } /* else: lsa stays the same: we use proxy */
811                                 }
812                                 goto establish_session;
813                         }
814                 }
815 //              if (status >= 300)
816 //                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
817
818                 /* For HTTP, data is pumped over the same connection */
819                 dfp = sfp;
820
821         } else {
822                 /*
823                  *  FTP session
824                  */
825                 sfp = prepare_ftp_session(&dfp, &target, lsa);
826         }
827
828         free(lsa);
829
830         if (!(option_mask32 & WGET_OPT_SPIDER)) {
831                 if (G.output_fd < 0)
832                         G.output_fd = xopen(G.fname_out, G.o_flags);
833                 retrieve_file_data(dfp);
834                 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
835                         xclose(G.output_fd);
836                         G.output_fd = -1;
837                 }
838         }
839
840         if (dfp != sfp) {
841                 /* It's ftp. Close data connection properly */
842                 fclose(dfp);
843                 if (ftpcmd(NULL, NULL, sfp) != 226)
844                         bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
845                 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
846         }
847         fclose(sfp);
848
849         free(server.allocated);
850         free(target.allocated);
851         free(fname_out_alloc);
852 }
853
854 int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
855 int wget_main(int argc UNUSED_PARAM, char **argv)
856 {
857 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
858         static const char wget_longopts[] ALIGN1 =
859                 /* name, has_arg, val */
860                 "continue\0"         No_argument       "c"
861 //FIXME: -s isn't --spider, it's --save-headers!
862                 "spider\0"           No_argument       "s"
863                 "quiet\0"            No_argument       "q"
864                 "output-document\0"  Required_argument "O"
865                 "directory-prefix\0" Required_argument "P"
866                 "proxy\0"            Required_argument "Y"
867                 "user-agent\0"       Required_argument "U"
868 #if ENABLE_FEATURE_WGET_TIMEOUT
869                 "timeout\0"          Required_argument "T"
870 #endif
871                 /* Ignored: */
872                 // "tries\0"            Required_argument "t"
873                 /* Ignored (we always use PASV): */
874                 "passive-ftp\0"      No_argument       "\xff"
875                 "header\0"           Required_argument "\xfe"
876                 "post-data\0"        Required_argument "\xfd"
877                 /* Ignored (we don't do ssl) */
878                 "no-check-certificate\0" No_argument   "\xfc"
879                 ;
880 #endif
881
882 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
883         llist_t *headers_llist = NULL;
884 #endif
885
886         INIT_G();
887
888         IF_FEATURE_WGET_TIMEOUT(G.timeout_seconds = 900;)
889         G.proxy_flag = "on";   /* use proxies if env vars are set */
890         G.user_agent = "Wget"; /* "User-Agent" header field */
891
892 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
893         applet_long_options = wget_longopts;
894 #endif
895         opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
896         getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
897                 &G.fname_out, &G.dir_prefix,
898                 &G.proxy_flag, &G.user_agent,
899                 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
900                 NULL /* -t RETRIES */
901                 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
902                 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
903         );
904         argv += optind;
905
906 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
907         if (headers_llist) {
908                 int size = 1;
909                 char *cp;
910                 llist_t *ll = headers_llist;
911                 while (ll) {
912                         size += strlen(ll->data) + 2;
913                         ll = ll->link;
914                 }
915                 G.extra_headers = cp = xmalloc(size);
916                 while (headers_llist) {
917                         cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
918                 }
919         }
920 #endif
921
922         G.output_fd = -1;
923         G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
924         if (G.fname_out) { /* -O FILE ? */
925                 if (LONE_DASH(G.fname_out)) { /* -O - ? */
926                         G.output_fd = 1;
927                         option_mask32 &= ~WGET_OPT_CONTINUE;
928                 }
929                 /* compat with wget: -O FILE can overwrite */
930                 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
931         }
932
933         while (*argv)
934                 download_one_url(*argv++);
935
936         if (G.output_fd >= 0)
937                 xclose(G.output_fd);
938
939         return EXIT_SUCCESS;
940 }