tftp: optional tftp-hpa compat
[oweals/busybox.git] / coreutils / wc.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wc implementation for busybox
4  *
5  * Copyright (C) 2003  Manuel Novoa III  <mjn3@codepoet.org>
6  *
7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
8  */
9 /* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
10  *
11  * Rewritten to fix a number of problems and do some size optimizations.
12  * Problems in the previous busybox implementation (besides bloat) included:
13  *  1) broken 'wc -c' optimization (read note below)
14  *  2) broken handling of '-' args
15  *  3) no checking of ferror on EOF returns
16  *  4) isprint() wasn't considered when word counting.
17  *
18  * NOTES:
19  *
20  * The previous busybox wc attempted an optimization using stat for the
21  * case of counting chars only.  I omitted that because it was broken.
22  * It didn't take into account the possibility of input coming from a
23  * pipe, or input from a file with file pointer not at the beginning.
24  *
25  * To implement such a speed optimization correctly, not only do you
26  * need the size, but also the file position.  Note also that the
27  * file position may be past the end of file.  Consider the example
28  * (adapted from example in gnu wc.c)
29  *
30  *      echo hello > /tmp/testfile &&
31  *      (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
32  *
33  * for which 'wc -c' should output '0'.
34  */
35 //config:config WC
36 //config:       bool "wc (4.5 kb)"
37 //config:       default y
38 //config:       help
39 //config:       wc is used to print the number of bytes, words, and lines,
40 //config:       in specified files.
41 //config:
42 //config:config FEATURE_WC_LARGE
43 //config:       bool "Support very large counts"
44 //config:       default y
45 //config:       depends on WC
46 //config:       help
47 //config:       Use "unsigned long long" for counter variables.
48
49 //applet:IF_WC(APPLET(wc, BB_DIR_USR_BIN, BB_SUID_DROP))
50
51 //kbuild:lib-$(CONFIG_WC) += wc.o
52
53 /* BB_AUDIT SUSv3 compliant. */
54 /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
55
56 #include "libbb.h"
57 #include "unicode.h"
58
/* Without locale support, replace the ctype classifiers with cheap
 * ASCII-only versions:
 *   isprint(c) - true for 0x20..0x7e; the unsigned cast makes values
 *                below 0x20 wrap to a large number and fail the test.
 *   isspace(c) - true for a plain space only.  wc_main() calls it only
 *                on bytes that already passed isprint_asciionly(); the
 *                control whitespace (\t \n \v \f \r) is handled there
 *                by a separate range check.
 */
59 #if !ENABLE_LOCALE_SUPPORT
60 # undef isprint
61 # undef isspace
62 # define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20))
63 # define isspace(c) ((c) == ' ')
64 #endif
65
/* COUNT_T is the integer type of every counter; COUNT_FMT is the matching
 * printf conversion WITHOUT the leading '%', so that callers can paste a
 * field width in front of it (e.g. " %9"COUNT_FMT).
 * FEATURE_WC_LARGE selects "unsigned long long" over plain "unsigned".
 */
66 #if ENABLE_FEATURE_WC_LARGE
67 # define COUNT_T unsigned long long
68 # define COUNT_FMT "llu"
69 #else
70 # define COUNT_T unsigned
71 # define COUNT_FMT "u"
72 #endif
73
74 /* We support -m even when UNICODE_SUPPORT is off,
75  * we just don't advertise it in help text,
76  * since it is the same as -c in this case.
77  */
78
79 //usage:#define wc_trivial_usage
80 //usage:       "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
81 //usage:
82 //usage:#define wc_full_usage "\n\n"
83 //usage:       "Count lines, words, and bytes for each FILE (or stdin)\n"
84 //usage:     "\n        -c      Count bytes"
85 //usage:        IF_UNICODE_SUPPORT(
86 //usage:     "\n        -m      Count characters"
87 //usage:        )
88 //usage:     "\n        -l      Count newlines"
89 //usage:     "\n        -w      Count words"
90 //usage:     "\n        -L      Print longest line length"
91 //usage:
92 //usage:#define wc_example_usage
93 //usage:       "$ wc /etc/passwd\n"
94 //usage:       "     31      46    1365 /etc/passwd\n"
95
96 /* Order is important if we want to be compatible with
97  * column order in "wc -cmlwL" output:
98  */
/* Each WC_xxx value is used in two ways by wc_main():
 *  - as an index into the counts[] and totals[] arrays, and
 *  - as a bit position (1 << WC_xxx) in the print_type option mask.
 * Output columns are emitted in increasing index order.
 * NOTE(review): this presumably has to match the option order in the
 * getopt32() string "lwmcL" - confirm against libbb's getopt32.
 */
99 enum {
100         WC_LINES    = 0, /* -l */
101         WC_WORDS    = 1, /* -w */
102         WC_UNICHARS = 2, /* -m */
103         WC_BYTES    = 3, /* -c */
104         WC_LENGTH   = 4, /* -L */
105         NUM_WCS     = 5, /* number of counters (array size), not a counter */
106 };
107
/*
 * wc_main - entry point of the wc applet.
 *
 * Parses the -l/-w/-m/-c/-L options, then reads each FILE argument one
 * byte at a time, counting newlines, words, characters, bytes and the
 * longest line length.  When no FILE is given, a single pseudo-argument
 * (bb_msg_standard_input) is processed instead and no name is printed.
 * One row is printed for every input that could be opened; a "total" row
 * follows when more than one argument was processed (arguments that
 * failed to open are included in that count).
 *
 * argc is unused.  argv is the argument vector; note that the slot just
 * before the first non-option argument may be overwritten.
 *
 * The function has no return statement: it ends by calling
 * fflush_stdout_and_exit(status), where status is EXIT_FAILURE if any
 * input could not be opened or hit a read error, EXIT_SUCCESS otherwise.
 */
108 int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
109 int wc_main(int argc UNUSED_PARAM, char **argv)
110 {
111         const char *arg;
            /* "+ 1" skips the leading space: the first column of a row is
             * printed without a separator, later columns use the full string. */
112         const char *start_fmt = " %9"COUNT_FMT + 1;
113         const char *fname_fmt = " %s\n";
            /* pcounts selects which array the OUTPUT code prints:
             * counts[] for a file row, totals[] for the final "total" row. */
114         COUNT_T *pcounts;
115         COUNT_T counts[NUM_WCS];
116         COUNT_T totals[NUM_WCS];
117         int num_files;
118         smallint status = EXIT_SUCCESS;
            /* Bit mask of requested columns: bit (1 << WC_xxx) set means
             * that counter is printed. */
119         unsigned print_type;
120
121         init_unicode();
122
123         print_type = getopt32(argv, "lwmcL");
124
            /* No option given: default to lines, words and bytes. */
125         if (print_type == 0) {
126                 print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES);
127         }
128
129         argv += optind;
130         if (!argv[0]) {
                    /* No FILE arguments: reuse the slot before them to hold
                     * the stdin pseudo-name, and use a format without %s so
                     * that only a newline follows the numbers. */
131                 *--argv = (char *) bb_msg_standard_input;
132                 fname_fmt = "\n";
133         }
134         if (!argv[1]) { /* zero or one filename? */
                    /* (x-1) & x is zero only when x has a single bit set
                     * (print_type is non-zero here).  In that case print the
                     * bare number, without the %9 width padding. */
135                 if (!((print_type-1) & print_type)) /* exactly one option? */
136                         start_fmt = "%"COUNT_FMT;
137         }
138
139         memset(totals, 0, sizeof(totals));
140
141         pcounts = counts;
142
143         num_files = 0;
144         while ((arg = *argv++) != NULL) {
145                 FILE *fp;
146                 const char *s;
147                 unsigned u;
                    /* Display column of the current line, used for -L. */
148                 unsigned linepos;
                    /* 1 while inside a word that has not been counted yet. */
149                 smallint in_word;
150
                    /* Counted before the open attempt, so arguments that
                     * fail to open still contribute to "more than one file". */
151                 ++num_files;
152                 fp = fopen_or_warn_stdin(arg);
153                 if (!fp) {
154                         status = EXIT_FAILURE;
155                         continue;
156                 }
157
158                 memset(counts, 0, sizeof(counts));
159                 linepos = 0;
160                 in_word = 0;
161
                    /* Per-byte state machine.  Each iteration either
                     * "continue"s (byte does not end a word) or falls through
                     * to the word accounting at the bottom of the loop. */
162                 while (1) {
163                         int c;
164                         /* Our -w doesn't match GNU wc exactly... oh well */
165
166                         c = getc(fp);
167                         if (c == EOF) {
                                    /* EOF covers both end of input and read
                                     * errors; report the latter but still
                                     * print what was counted so far. */
168                                 if (ferror(fp)) {
169                                         bb_simple_perror_msg(arg);
170                                         status = EXIT_FAILURE;
171                                 }
172                                 goto DO_EOF;  /* Treat an EOF as '\r'. */
173                         }
174
175                         /* Cater for -c and -m */
176                         ++counts[WC_BYTES];
177                         if (unicode_status != UNICODE_ON /* every byte is a new char */
178                          || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
179                         ) {
180                                 ++counts[WC_UNICHARS];
181                         }
182
183                         if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
184                                 ++linepos;
185                                 if (!isspace(c)) {
                                            /* Word constituent: remember we are
                                             * in a word and skip the accounting. */
186                                         in_word = 1;
187                                         continue;
188                                 }
                                    /* Printable whitespace: falls through to
                                     * the word accounting below. */
189                         } else if ((unsigned)(c - 9) <= 4) {
190                                 /* \t  9
191                                  * \n 10
192                                  * \v 11
193                                  * \f 12
194                                  * \r 13
195                                  */
196                                 if (c == '\t') {
                                            /* Advance to the next multiple of 8. */
197                                         linepos = (linepos | 7) + 1;
198                                 } else {  /* '\n', '\r', '\f', or '\v' */
                                            /* Also entered via goto with c == EOF:
                                             * the pending line length is recorded,
                                             * no newline is counted, and linepos
                                             * is reset. */
199  DO_EOF:
200                                         if (linepos > counts[WC_LENGTH]) {
201                                                 counts[WC_LENGTH] = linepos;
202                                         }
203                                         if (c == '\n') {
204                                                 ++counts[WC_LINES];
205                                         }
                                            /* A vertical tab keeps the current
                                             * column; every other terminator
                                             * restarts at column 0. */
206                                         if (c != '\v') {
207                                                 linepos = 0;
208                                         }
209                                 }
210                         } else {
                                    /* Any other non-printable byte: it neither
                                     * advances the column nor starts or ends
                                     * a word. */
211                                 continue;
212                         }
213
                            /* Reached only for whitespace or EOF: count the
                             * word that just ended, if there was one. */
214                         counts[WC_WORDS] += in_word;
215                         in_word = 0;
216                         if (c == EOF) {
217                                 break;
218                         }
219                 }
220
221                 fclose_if_not_stdin(fp);
222
                    /* The longest-line total must be a maximum, not a sum.
                     * Raise it to the new maximum, then pre-subtract this
                     * file's value so that the generic "totals[u] +=
                     * pcounts[u]" in the output loop below restores the
                     * maximum (unsigned wraparound cancels out). */
223                 if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
224                         totals[WC_LENGTH] = counts[WC_LENGTH];
225                 }
226                 totals[WC_LENGTH] -= counts[WC_LENGTH];
227
                    /* Also entered by goto from after the loop, with
                     * pcounts == totals and arg == "total". */
228  OUTPUT:
229                 /* coreutils wc tries hard to print pretty columns
230                  * (saves results for all files, finds max col len etc...)
231                  * we won't try that hard, it will bloat us too much */
232                 s = start_fmt;
233                 u = 0;
234                 do {
235                         if (print_type & (1 << u)) {
236                                 printf(s, pcounts[u]);
237                                 s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
238                         }
                            /* Accumulate every counter, printed or not. */
239                         totals[u] += pcounts[u];
240                 } while (++u < NUM_WCS);
241                 printf(fname_fmt, arg);
242         }
243
244         /* If more than one file was processed, we want the totals.  To save some
245          * space, we set the pcounts ptr to the totals array.  This has the side
246          * effect of trashing the totals array after outputting it, but that's
247          * irrelevant since we no longer need it. */
248         if (num_files > 1) {
249                 num_files = 0;  /* Make sure we don't get here again. */
250                 arg = "total";
251                 pcounts = totals;
                    /* The loop left argv one past the NULL terminator; step
                     * back so the loop condition reads NULL again and exits
                     * right after the totals row is printed. */
252                 --argv;
253                 goto OUTPUT;
254         }
255
256         fflush_stdout_and_exit(status);
257 }