Trim help text a bit more
[oweals/busybox.git] / coreutils / tr.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * Mini tr implementation for busybox
4  *
5  ** Copyright (c) 1987,1997, Prentice Hall   All rights reserved.
6  *
7  * The name of Prentice Hall may not be used to endorse or promote
8  * products derived from this software without specific prior
9  * written permission.
10  *
11  * Copyright (c) Michiel Huisjes
12  *
13  * This version of tr is adapted from Minix tr and was modified
14  * by Erik Andersen <andersen@codepoet.org> to be used in busybox.
15  *
16  * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
17  */
18 /* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html
19  * TODO: graph, print
20  */
21 #include "libbb.h"
22
enum {
	/* Number of distinct byte values, i.e. the size of the "alphabet". */
	ASCII = 256,
	/* Size of the two string buffers. It must be at least ASCII.
	 * expand() enlarges its buffer whenever fewer than ASCII bytes
	 * are free, so a buffer of exactly ASCII bytes would be
	 * reallocated even for tiny patterns; twice ASCII avoids that.
	 * BUFSIZ is used instead when it is larger still.
	 */
	TR_BUFSIZ = (ASCII * 2 < BUFSIZ) ? BUFSIZ : ASCII * 2,
};
31
/* Fill the translation table: the k-th byte of string1 is mapped to the
 * k-th byte of string2.
 *
 * pvector      table indexed by byte value (as unsigned char)
 * string1/_len bytes to translate from
 * string2/_len bytes to translate to
 *
 * Once string2 is exhausted, the remaining bytes of string1 are mapped
 * to the most recently used byte of string2 (or to '0' if string2_len
 * is zero).
 */
static void map(char *pvector,
		char *string1, unsigned string1_len,
		char *string2, unsigned string2_len)
{
	char fill = '0';
	unsigned src = 0;
	unsigned dst = 0;

	while (src < string1_len) {
		unsigned char from = string1[src++];

		/* Advance through string2 only while it still has bytes left */
		if (dst < string2_len)
			fill = string2[dst++];
		pvector[from] = fill;
	}
}
46
47 /* supported constructs:
48  *   Ranges,  e.g.,  0-9   ==>  0123456789
49  *   Escapes, e.g.,  \a    ==>  Control-G
50  *   Character classes, e.g. [:upper:] ==> A...Z
51  *   Equiv classess, e.g. [=A=] ==> A   (hmmmmmmm?)
52  * not supported:
53  *   \ooo-\ooo - octal ranges
54  *   [x*N] - repeat char x N times
55  *   [x*] - repeat char x until it fills STRING2:
56  * # echo qwe123 | /usr/bin/tr 123456789 '[d]'
57  * qwe[d]
58  * # echo qwe123 | /usr/bin/tr 123456789 '[d*]'
59  * qweddd
60  */
61 static unsigned expand(const char *arg, char **buffer_p)
62 {
63         char *buffer = *buffer_p;
64         unsigned pos = 0;
65         unsigned size = TR_BUFSIZ;
66         unsigned i; /* can't be unsigned char: must be able to hold 256 */
67         unsigned char ac;
68
69         while (*arg) {
70                 if (pos + ASCII > size) {
71                         size += ASCII;
72                         *buffer_p = buffer = xrealloc(buffer, size);
73                 }
74                 if (*arg == '\\') {
75                         arg++;
76                         buffer[pos++] = bb_process_escape_sequence(&arg);
77                         continue;
78                 }
79                 if (arg[1] == '-') { /* "0-9..." */
80                         ac = arg[2];
81                         if (ac == '\0') { /* "0-": copy verbatim */
82                                 buffer[pos++] = *arg++; /* copy '0' */
83                                 continue; /* next iter will copy '-' and stop */
84                         }
85                         i = (unsigned char) *arg;
86                         while (i <= ac) /* ok: i is unsigned _int_ */
87                                 buffer[pos++] = i++;
88                         arg += 3; /* skip 0-9 */
89                         continue;
90                 }
91                 if ((ENABLE_FEATURE_TR_CLASSES || ENABLE_FEATURE_TR_EQUIV)
92                  && *arg == '['
93                 ) {
94                         arg++;
95                         i = (unsigned char) *arg++;
96                         /* "[xyz...". i=x, arg points to y */
97                         if (ENABLE_FEATURE_TR_CLASSES && i == ':') { /* [:class:] */
98 #define CLO ":]\0"
99                                 static const char classes[] ALIGN1 =
100                                         "alpha"CLO "alnum"CLO "digit"CLO
101                                         "lower"CLO "upper"CLO "space"CLO
102                                         "blank"CLO "punct"CLO "cntrl"CLO
103                                         "xdigit"CLO;
104                                 enum {
105                                         CLASS_invalid = 0, /* we increment the retval */
106                                         CLASS_alpha = 1,
107                                         CLASS_alnum = 2,
108                                         CLASS_digit = 3,
109                                         CLASS_lower = 4,
110                                         CLASS_upper = 5,
111                                         CLASS_space = 6,
112                                         CLASS_blank = 7,
113                                         CLASS_punct = 8,
114                                         CLASS_cntrl = 9,
115                                         CLASS_xdigit = 10,
116                                         //CLASS_graph = 11,
117                                         //CLASS_print = 12,
118                                 };
119                                 smalluint j;
120                                 char *tmp;
121
122                                 /* xdigit needs 8, not 7 */
123                                 i = 7 + (arg[0] == 'x');
124                                 tmp = xstrndup(arg, i);
125                                 j = index_in_strings(classes, tmp) + 1;
126                                 free(tmp);
127
128                                 if (j == CLASS_invalid)
129                                         goto skip_bracket;
130
131                                 arg += i;
132                                 if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) {
133                                         for (i = '0'; i <= '9'; i++)
134                                                 buffer[pos++] = i;
135                                 }
136                                 if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) {
137                                         for (i = 'A'; i <= 'Z'; i++)
138                                                 buffer[pos++] = i;
139                                 }
140                                 if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) {
141                                         for (i = 'a'; i <= 'z'; i++)
142                                                 buffer[pos++] = i;
143                                 }
144                                 if (j == CLASS_space || j == CLASS_blank) {
145                                         buffer[pos++] = '\t';
146                                         if (j == CLASS_space) {
147                                                 buffer[pos++] = '\n';
148                                                 buffer[pos++] = '\v';
149                                                 buffer[pos++] = '\f';
150                                                 buffer[pos++] = '\r';
151                                         }
152                                         buffer[pos++] = ' ';
153                                 }
154                                 if (j == CLASS_punct || j == CLASS_cntrl) {
155                                         for (i = '\0'; i < ASCII; i++) {
156                                                 if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i))
157                                                  || (j == CLASS_cntrl && iscntrl(i))
158                                                 ) {
159                                                         buffer[pos++] = i;
160                                                 }
161                                         }
162                                 }
163                                 if (j == CLASS_xdigit) {
164                                         for (i = 'A'; i <= 'F'; i++) {
165                                                 buffer[pos + 6] = i | 0x20;
166                                                 buffer[pos++] = i;
167                                         }
168                                         pos += 6;
169                                 }
170                                 continue;
171                         }
172                         /* "[xyz...", i=x, arg points to y */
173                         if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */
174                                 buffer[pos++] = *arg; /* copy CHAR */
175                                 if (!arg[0] || arg[1] != '=' || arg[2] != ']')
176                                         bb_show_usage();
177                                 arg += 3;       /* skip CHAR=] */
178                                 continue;
179                         }
180                         /* The rest of "[xyz..." cases is treated as normal
181                          * string, "[" has no special meaning here:
182                          * tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z",
183                          * also try tr "[a-z]" "_A-Z+" and you'll see that
184                          * [] is not special here.
185                          */
186  skip_bracket:
187                         arg -= 2; /* points to "[" in "[xyz..." */
188                 }
189                 buffer[pos++] = *arg++;
190         }
191         return pos;
192 }
193
194 /* NB: buffer is guaranteed to be at least TR_BUFSIZE
195  * (which is >= ASCII) big.
196  */
197 static int complement(char *buffer, int buffer_len)
198 {
199         int len;
200         char conv[ASCII];
201         unsigned char ch;
202
203         len = 0;
204         ch = '\0';
205         while (1) {
206                 if (memchr(buffer, ch, buffer_len) == NULL)
207                         conv[len++] = ch;
208                 if (++ch == '\0')
209                         break;
210         }
211         memcpy(buffer, conv, len);
212         return len;
213 }
214
215 int tr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
216 int tr_main(int argc UNUSED_PARAM, char **argv)
217 {
218         int i;
219         smalluint opts;
220         ssize_t read_chars;
221         size_t in_index, out_index;
222         unsigned last = UCHAR_MAX + 1; /* not equal to any char */
223         unsigned char coded, c;
224         char *str1 = xmalloc(TR_BUFSIZ);
225         char *str2 = xmalloc(TR_BUFSIZ);
226         int str2_length;
227         int str1_length;
228         char *vector = xzalloc(ASCII * 3);
229         char *invec  = vector + ASCII;
230         char *outvec = vector + ASCII * 2;
231
232 #define TR_OPT_complement       (3 << 0)
233 #define TR_OPT_delete           (1 << 2)
234 #define TR_OPT_squeeze_reps     (1 << 3)
235
236         for (i = 0; i < ASCII; i++) {
237                 vector[i] = i;
238                 /*invec[i] = outvec[i] = FALSE; - done by xzalloc */
239         }
240
241         /* -C/-c difference is that -C complements "characters",
242          * and -c complements "values" (binary bytes I guess).
243          * In POSIX locale, these are the same.
244          */
245
246         opt_complementary = "-1";
247         opts = getopt32(argv, "+Ccds"); /* '+': stop at first non-option */
248         argv += optind;
249
250         str1_length = expand(*argv++, &str1);
251         str2_length = 0;
252         if (opts & TR_OPT_complement)
253                 str1_length = complement(str1, str1_length);
254         if (*argv) {
255                 if (argv[0][0] == '\0')
256                         bb_error_msg_and_die("STRING2 cannot be empty");
257                 str2_length = expand(*argv, &str2);
258                 map(vector, str1, str1_length,
259                                 str2, str2_length);
260         }
261         for (i = 0; i < str1_length; i++)
262                 invec[(unsigned char)(str1[i])] = TRUE;
263         for (i = 0; i < str2_length; i++)
264                 outvec[(unsigned char)(str2[i])] = TRUE;
265
266         goto start_from;
267
268         /* In this loop, str1 space is reused as input buffer,
269          * str2 - as output one. */
270         for (;;) {
271                 /* If we're out of input, flush output and read more input. */
272                 if ((ssize_t)in_index == read_chars) {
273                         if (out_index) {
274                                 xwrite(STDOUT_FILENO, str2, out_index);
275  start_from:
276                                 out_index = 0;
277                         }
278                         read_chars = safe_read(STDIN_FILENO, str1, TR_BUFSIZ);
279                         if (read_chars <= 0) {
280                                 if (read_chars < 0)
281                                         bb_perror_msg_and_die(bb_msg_read_error);
282                                 break;
283                         }
284                         in_index = 0;
285                 }
286                 c = str1[in_index++];
287                 if ((opts & TR_OPT_delete) && invec[c])
288                         continue;
289                 coded = vector[c];
290                 if ((opts & TR_OPT_squeeze_reps) && last == coded
291                  && (invec[c] || outvec[coded])
292                 ) {
293                         continue;
294                 }
295                 str2[out_index++] = last = coded;
296         }
297
298         return EXIT_SUCCESS;
299 }