From: Denis Vlasenko Date: Sat, 9 Aug 2008 16:15:14 +0000 (-0000) Subject: grep: option to use GNU regex matching instead of POSIX one. X-Git-Tag: 1_12_0~25 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=3fd15e197e21aa313ce56126ee814f0ebc884dee;p=oweals%2Fbusybox.git grep: option to use GNU regex matching instead of POSIX one. This fixes problems with NULs in files being scanned, but costs +800 bytes. The same can be done to sed (TODO). --- diff --git a/Config.in b/Config.in index 5ad35ce20..c2005c78a 100644 --- a/Config.in +++ b/Config.in @@ -21,6 +21,15 @@ config DESKTOP Select this only if you plan to use busybox on full-blown desktop machine with common Linux distro, not on an embedded box. +config EXTRA_COMPAT + bool "Provide compatible behavior for rare corner cases (bigger code)" + default n + help + This option makes grep, sed etc handle rare corner cases + (embedded NUL bytes and such). This makes code bigger and uses + some GNU extensions in libc. You probably only need this option + if you plan to run busybox on desktop. + config FEATURE_ASSUME_UNICODE bool "Assume that 1:1 char/glyph correspondence is not true" default n diff --git a/findutils/grep.c b/findutils/grep.c index 030e62461..f2ed01e74 100644 --- a/findutils/grep.c +++ b/findutils/grep.c @@ -96,6 +96,7 @@ struct globals { int lines_before; int lines_after; char **before_buf; + USE_EXTRA_COMPAT(size_t *before_buf_size;) int last_line_printed; #endif /* globals used internally */ @@ -117,6 +118,7 @@ struct globals { #define lines_before (G.lines_before ) #define lines_after (G.lines_after ) #define before_buf (G.before_buf ) +#define before_buf_size (G.before_buf_size ) #define last_line_printed (G.last_line_printed ) #define pattern_head (G.pattern_head ) #define cur_file (G.cur_file ) @@ -124,14 +126,24 @@ struct globals { typedef struct grep_list_data_t { char *pattern; - regex_t preg; +/* for GNU regex, matched_range must be persistent across grep_file() calls */ +#if !ENABLE_EXTRA_COMPAT + regex_t compiled_regex; + regmatch_t matched_range; +#else + struct re_pattern_buffer compiled_regex; + struct re_registers matched_range; +#endif #define ALLOCATED 1 #define COMPILED 2 int flg_mem_alocated_compiled; } grep_list_data_t; - -static void print_line(const char *line, int linenum, char decoration) +#if !ENABLE_EXTRA_COMPAT +#define print_line(line, line_len, linenum, decoration) \ + print_line(line, linenum, decoration) +#endif +static void print_line(const char *line, size_t line_len, int linenum, char decoration) { #if ENABLE_FEATURE_GREP_CONTEXT /* Happens when we go to next file, immediately hit match @@ -139,8 +151,9 @@ static void print_line(const char *line, int linenum, char decoration) if (linenum < 1) return; /* possibly print the little '--' separator */ - if ((lines_before || lines_after) && did_print_line && - last_line_printed != linenum - 1) { + if ((lines_before || lines_after) && did_print_line + && last_line_printed != linenum - 1 + ) { puts("--"); } /* guard against printing "--" before first line of first file */ @@ -152,17 +165,50 @@ static void print_line(const char *line, int linenum, char decoration) if (PRINT_LINE_NUM) printf("%i%c", linenum, decoration); /* Emulate weird GNU grep behavior with -ov */ - if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) + if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) { +#if !ENABLE_EXTRA_COMPAT puts(line); +#else + fwrite(line, 1, line_len, stdout); + putchar('\n'); +#endif + } } -static int grep_file(FILE *file) +#if ENABLE_EXTRA_COMPAT +/* Unlike getline, this one removes trailing '\n' */ +static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file) { + ssize_t res_sz; char *line; + + res_sz = getline(line_ptr, line_alloc_len, file); + line = *line_ptr; + + if (res_sz > 0) { + if (line[res_sz - 1] == '\n') + line[--res_sz] = '\0'; + } else { + free(line); /* uclibc allocates a buffer even on EOF. WTF? */ + } + return res_sz; +} +#endif + +static int grep_file(FILE *file) +{ smalluint found; int linenum = 0; int nmatches = 0; - regmatch_t regmatch; +#if !ENABLE_EXTRA_COMPAT + char *line; +#else + char *line = NULL; + ssize_t line_len; + size_t line_alloc_len; +#define rm_so start[0] +#define rm_eo end[0] +#endif #if ENABLE_FEATURE_GREP_CONTEXT int print_n_lines_after = 0; int curpos = 0; /* track where we are in the circular 'before' buffer */ @@ -171,7 +217,13 @@ static int grep_file(FILE *file) enum { print_n_lines_after = 0 }; #endif /* ENABLE_FEATURE_GREP_CONTEXT */ - while ((line = xmalloc_fgetline(file)) != NULL) { + while ( +#if !ENABLE_EXTRA_COMPAT + (line = xmalloc_fgetline(file)) != NULL +#else + (line_len = bb_getline(&line, &line_alloc_len, file)) >= 0 +#endif + ) { llist_t *pattern_ptr = pattern_head; grep_list_data_t *gl = gl; /* for gcc */ @@ -184,19 +236,35 @@ static int grep_file(FILE *file) } else { if (!(gl->flg_mem_alocated_compiled & COMPILED)) { gl->flg_mem_alocated_compiled |= COMPILED; - xregcomp(&(gl->preg), gl->pattern, reflags); +#if !ENABLE_EXTRA_COMPAT + xregcomp(&gl->compiled_regex, gl->pattern, reflags); +#else + memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex)); + if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex)) + bb_error_msg_and_die("bad regex '%s'", gl->pattern); +#endif } - regmatch.rm_so = 0; - regmatch.rm_eo = 0; - if (regexec(&(gl->preg), line, 1, ®match, 0) == 0) { +#if !ENABLE_EXTRA_COMPAT + gl->matched_range.rm_so = 0; + gl->matched_range.rm_eo = 0; +#endif + if ( +#if !ENABLE_EXTRA_COMPAT + regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0 +#else + re_search(&gl->compiled_regex, line, line_len, + /*start:*/ 0, /*range:*/ line_len, + &gl->matched_range) >= 0 +#endif + ) { if (!(option_mask32 & OPT_w)) found = 1; else { char c = ' '; - if (regmatch.rm_so) - c = line[regmatch.rm_so - 1]; + if (gl->matched_range.rm_so) + c = line[gl->matched_range.rm_so - 1]; if (!isalnum(c) && c != '_') { - c = line[regmatch.rm_eo]; + c = line[gl->matched_range.rm_eo]; if (!c || (!isalnum(c) && c != '_')) found = 1; } @@ -261,7 +329,7 @@ static int grep_file(FILE *file) /* now print each line in the buffer, clearing them as we go */ while (before_buf[idx] != NULL) { - print_line(before_buf[idx], first_buf_entry_line_num, '-'); + print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-'); free(before_buf[idx]); before_buf[idx] = NULL; idx = (idx + 1) % lines_before; @@ -277,13 +345,15 @@ static int grep_file(FILE *file) /* -Fo just prints the pattern * (unless -v: -Fov doesnt print anything at all) */ if (found) - print_line(gl->pattern, linenum, ':'); + print_line(gl->pattern, strlen(gl->pattern), linenum, ':'); } else { - line[regmatch.rm_eo] = '\0'; - print_line(line + regmatch.rm_so, linenum, ':'); + line[gl->matched_range.rm_eo] = '\0'; + print_line(line + gl->matched_range.rm_so, + gl->matched_range.rm_eo - gl->matched_range.rm_so, + linenum, ':'); } } else { - print_line(line, linenum, ':'); + print_line(line, line_len, linenum, ':'); } } } @@ -291,12 +361,13 @@ static int grep_file(FILE *file) else { /* no match */ /* if we need to print some context lines after the last match, do so */ if (print_n_lines_after) { - print_line(line, linenum, '-'); + print_line(line, strlen(line), linenum, '-'); print_n_lines_after--; } else if (lines_before) { /* Add the line to the circular 'before' buffer */ free(before_buf[curpos]); before_buf[curpos] = line; + USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;) curpos = (curpos + 1) % lines_before; /* avoid free(line) - we took the line */ line = NULL; @@ -304,13 +375,14 @@ static int grep_file(FILE *file) } #endif /* ENABLE_FEATURE_GREP_CONTEXT */ +#if !ENABLE_EXTRA_COMPAT free(line); - +#endif /* Did we print all context after last requested match? */ if ((option_mask32 & OPT_m) && !print_n_lines_after && nmatches == max_matches) break; - } + } /* while (read line) */ /* special-case file post-processing for options where we don't print line * matches, just filenames and possibly match counts */ @@ -428,15 +500,16 @@ int grep_main(int argc, char **argv) lines_after = Copt; if (!(option_mask32 & OPT_B)) /* not overridden */ lines_before = Copt; - //option_mask32 |= OPT_A|OPT_B; /* for parser */ } /* sanity checks */ if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) { option_mask32 &= ~OPT_n; lines_before = 0; lines_after = 0; - } else if (lines_before > 0) - before_buf = xzalloc(lines_before * sizeof(char *)); + } else if (lines_before > 0) { + before_buf = xzalloc(lines_before * sizeof(before_buf[0])); + USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));) + } #else /* with auto sanity checks */ /* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */ @@ -537,7 +610,7 @@ int grep_main(int argc, char **argv) if (gl->flg_mem_alocated_compiled & ALLOCATED) free(gl->pattern); if (gl->flg_mem_alocated_compiled & COMPILED) - regfree(&(gl->preg)); + regfree(&gl->compiled_regex); free(gl); free(pattern_head_ptr); } diff --git a/libbb/get_line_from_file.c b/libbb/get_line_from_file.c index 56761f941..968d7572d 100644 --- a/libbb/get_line_from_file.c +++ b/libbb/get_line_from_file.c @@ -9,6 +9,10 @@ * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. */ +/* for getline() [GNUism] */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif #include "libbb.h" /* This function reads an entire line from a text file, up to a newline @@ -55,7 +59,6 @@ char* FAST_FUNC xmalloc_fgets(FILE *file) return bb_get_chunk_from_file(file, &i); } - /* Get line. Remove trailing \n */ char* FAST_FUNC xmalloc_fgetline(FILE *file) { @@ -68,6 +71,44 @@ char* FAST_FUNC xmalloc_fgetline(FILE *file) return c; } +#if 0 + +/* GNUism getline() should be faster (not tested) than a loop with fgetc */ + +/* Get line, including trailing \n if any */ +char* FAST_FUNC xmalloc_fgets(FILE *file) +{ + char *res_buf = NULL; + size_t res_sz; + + if (getline(&res_buf, &res_sz, file) == -1) { + free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */ + res_buf = NULL; + } +//TODO: trimming to res_sz? + return res_buf; +} +/* Get line. Remove trailing \n */ +char* FAST_FUNC xmalloc_fgetline(FILE *file) +{ + char *res_buf = NULL; + size_t res_sz; + + res_sz = getline(&res_buf, &res_sz, file); + + if ((ssize_t)res_sz != -1) { + if (res_buf[res_sz - 1] == '\n') + res_buf[--res_sz] = '\0'; +//TODO: trimming to res_sz? + } else { + free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */ + res_buf = NULL; + } + return res_buf; +} + +#endif + #if 0 /* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07. * diff --git a/libbb/xregcomp.c b/libbb/xregcomp.c index abfa35ff1..61efb5bc6 100644 --- a/libbb/xregcomp.c +++ b/libbb/xregcomp.c @@ -27,6 +27,6 @@ void FAST_FUNC xregcomp(regex_t *preg, const char *regex, int cflags) { char *errmsg = regcomp_or_errmsg(preg, regex, cflags); if (errmsg) { - bb_error_msg_and_die("xregcomp: %s", errmsg); + bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg); } } diff --git a/testsuite/grep.tests b/testsuite/grep.tests index b2de2af54..8cee1b9ee 100755 --- a/testsuite/grep.tests +++ b/testsuite/grep.tests @@ -62,12 +62,8 @@ testing "grep -s nofile - (stdin and nonexisting file, match)" \ "grep -s domatch nonexistent - ; echo \$?" \ "(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n" -# This doesn't match GNU behaviour (Binary file input matches) -# acts like GNU grep -a -testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" "" -# This doesn't match GNU behaviour (Binary file (standard input) matches) -# acts like GNU grep -a -testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n" +testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" "" +testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n" testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \ "0\n" "\0\n" ""