From d44b07fc904f6a0d31ba025f3e9f423c1e47547e Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Fri, 12 Oct 2018 22:32:41 -0400 Subject: [PATCH] rewrite core of the glob implementation for correctness & optimization this code has been long overdue for a rewrite, but the immediate cause that necessitated it was total failure to see past unreadable path components. for example, A/B/* would fail to match anything, even though it should succeed, when both A and A/B are searchable but only A/B is readable. this problem both was caught in conformance testing, and impacted users. the old glob implementation insisted on searching the listing of each path component for a match, even if the next component was a literal. it also used considerable stack space, up to length of the pattern, per recursion level, and relied on an artificial bound of the pattern length by PATH_MAX, which was incorrect because a pattern can be much longer than PATH_MAX while having matches shorter (for example, with necessarily long bracket expressions, or with redundancy). in the new implementation, each level of recursion starts by consuming the maximal literal (possibly escaped-literal) path prefix remaining in the pattern, and only opening a directory to read when there is a proper glob pattern in the next path component. it then recurses into each matching entry. the top-level glob function provided automatic storage (up to PATH_MAX) for construction of candidate/result strings, and allocates a duplicate of the pattern that can be modified in-place with temporary null-termination to pass to fnmatch. this allocation is not a big deal since glob already has to perform allocation, and has to link free to clean up if it experiences an allocation failure or other error after some results have already been allocated. care is taken to use the d_type field from iterated dirents when possible; stat is called only when there are literal path components past the last proper-glob component, or when needed to disambiguate symlinks for the purpose of GLOB_MARK. one peculiarity with the new implementation is the manner in which the error handling callback will be called. if attempting to match */B/C/D where a directory A exists that is inaccessible, the error reported will be a stat error for A/B/C/D rather than (previous and wrong implementation) an opendir error for A, or (likely on other implementations) a stat error for A/B. such behavior does not seem to be non-conforming, but if it turns out to be undesirable for any reason, backtracking could be done on error to report the first component producing it. also, redundant slashes are no longer normalized, but preserved as they appear in the pattern; this is probably more correct, and falls out naturally from the algorithm used. since trailing slashes (which force all matches to be directories) are preserved as well, the behavior of GLOB_MARK has been adjusted not to append an additional slash to results that already end in slash. --- src/regex/glob.c | 217 ++++++++++++++++++++++++----------------------- 1 file changed, 112 insertions(+), 105 deletions(-) diff --git a/src/regex/glob.c b/src/regex/glob.c index 3e1b034e..751b6966 100644 --- a/src/regex/glob.c +++ b/src/regex/glob.c @@ -1,3 +1,4 @@ +#define _BSD_SOURCE #include #include #include @@ -14,27 +15,6 @@ struct match char name[]; }; -static int is_literal(const char *p, int useesc) -{ - int bracket = 0; - for (; *p; p++) { - switch (*p) { - case '\\': - if (!useesc) break; - case '?': - case '*': - return 0; - case '[': - bracket = 1; - break; - case ']': - if (bracket) return 0; - break; - } - } - return 1; -} - static int append(struct match **tail, const char *name, size_t len, int mark) { struct match *new = malloc(sizeof(struct match) + len + 2); @@ -42,7 +22,7 @@ static int append(struct match **tail, const char *name, size_t len, int mark) (*tail)->next = new; new->next = NULL; memcpy(new->name, name, len+1); - if (mark) { + if (mark && len && name[len-1]!='/') { new->name[len] = '/'; new->name[len+1] = 0; } @@ -50,96 +30,125 @@ static int append(struct match **tail, const char *name, size_t len, int mark) return 0; } -static int match_in_dir(const char *d, const char *p, int flags, int (*errfunc)(const char *path, int err), struct match **tail) +static int do_glob(char *buf, size_t pos, int type, char *pat, int flags, int (*errfunc)(const char *path, int err), struct match **tail) { - DIR *dir; - struct dirent de_buf, *de; - char pat[strlen(p)+1]; - char *p2; - size_t l = strlen(d); - int literal; - int fnm_flags= ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0) - | ((!(flags & GLOB_PERIOD)) ? FNM_PERIOD : 0); - int error; - - if ((p2 = strchr(p, '/'))) { - strcpy(pat, p); - pat[p2-p] = 0; - for (; *p2 == '/'; p2++); - p = pat; + /* If GLOB_MARK is unused, we don't care about type. */ + if (!type && !(flags & GLOB_MARK)) type = DT_REG; + + /* Special-case the remaining pattern being all slashes, in + * which case we can use caller-passed type if it's a dir. */ + if (*pat && type!=DT_DIR) type = 0; + while (pos+1 < PATH_MAX && *pat=='/') buf[pos++] = *pat++; + + /* Consume maximal [escaped-]literal prefix of pattern, copying + * and un-escaping it to the running buffer as we go. */ + ptrdiff_t i=0, j=0; + int in_bracket = 0, overflow = 0; + for (; pat[i]!='*' && pat[i]!='?' && (!in_bracket || pat[i]!=']'); i++) { + if (!pat[i]) { + if (overflow) return 0; + pat += i; + pos += j; + i = j = 0; + break; + } else if (pat[i] == '[') { + in_bracket = 1; + } else if (pat[i] == '/') { + if (overflow) return 0; + in_bracket = 0; + pat += i+1; + i = -1; + pos += j+1; + j = -1; + } else if (pat[i] == '\\' && !(flags & GLOB_NOESCAPE)) { + /* Backslashes inside a bracket are (at least by + * our interpretation) non-special, so if next + * char is ']' we have a complete expression. */ + if (in_bracket && pat[i+1]==']') break; + /* Unpaired final backslash never matches. */ + if (!pat[i+1] || pat[i+1]=='/') return 0; + i++; + } + /* Only store a character if it fits in the buffer, but if + * a potential bracket expression is open, the overflow + * must be remembered and handled later only if the bracket + * is unterminated (and thereby a literal), so as not to + * disallow long bracket expressions with short matches. */ + if (pos+(j+1) < PATH_MAX) { + buf[pos+j++] = pat[i]; + } else if (in_bracket) { + overflow = 1; + } else { + return 0; + } + /* If we consume any new components, the caller-passed type + * or dummy type from above is no longer valid. */ + type = 0; } - literal = is_literal(p, !(flags & GLOB_NOESCAPE)); - if (*d == '/' && !*(d+1)) l = 0; - - /* rely on opendir failing for nondirectory objects */ - dir = opendir(*d ? d : "."); - error = errno; - if (!dir) { - /* this is not an error -- we let opendir call stat for us */ - if (error == ENOTDIR) return 0; - if (error == EACCES && !*p) { - struct stat st; - if (!stat(d, &st) && S_ISDIR(st.st_mode)) { - if (append(tail, d, l, l)) - return GLOB_NOSPACE; - return 0; - } + buf[pos] = 0; + if (!*pat) { + /* If we consumed any components above, or if GLOB_MARK is + * requested and we don't yet know if the match is a dir, + * we must call stat to confirm the file exists and/or + * determine its type. */ + struct stat st; + if ((flags & GLOB_MARK) && type==DT_LNK) type = 0; + if (!type && stat(buf, &st)) { + if (errno!=ENOENT && (errfunc(buf, errno) || (flags & GLOB_ERR))) + return GLOB_ABORTED; + return 0; } - if (errfunc(d, error) || (flags & GLOB_ERR)) - return GLOB_ABORTED; + if (!type && S_ISDIR(st.st_mode)) type = DT_DIR; + if (append(tail, buf, pos, (flags & GLOB_MARK) && type==DT_DIR)) + return GLOB_NOSPACE; return 0; } - if (!*p) { - error = append(tail, d, l, l) ? GLOB_NOSPACE : 0; - closedir(dir); - return error; + char *p2 = strchr(pat, '/'); + DIR *dir = opendir(pos ? buf : "."); + if (!dir) { + if (errfunc(buf, errno) || (flags & GLOB_ERR)) + return GLOB_ABORTED; + return 0; } - while (!(error = readdir_r(dir, &de_buf, &de)) && de) { - char namebuf[l+de->d_reclen+2], *name = namebuf; - if (!literal && fnmatch(p, de->d_name, fnm_flags)) + int old_errno = errno; + struct dirent *de; + while (errno=0, de=readdir(dir)) { + /* Quickly skip non-directories when there's pattern left. */ + if (p2 && de->d_type && de->d_type!=DT_DIR && de->d_type!=DT_LNK) continue; - if (literal && strcmp(p, de->d_name)) - continue; - if (p2 && de->d_type && !S_ISDIR(de->d_type<<12) && !S_ISLNK(de->d_type<<12)) + + size_t l = strlen(de->d_name); + if (l >= PATH_MAX-pos) continue; + + if (p2) *p2 = 0; + + int fnm_flags= ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0) + | ((!(flags & GLOB_PERIOD)) ? FNM_PERIOD : 0); + + if (fnmatch(pat, de->d_name, fnm_flags)) continue; + /* With GLOB_PERIOD, don't allow matching . or .. unless * fnmatch would match them with FNM_PERIOD rules in effect. */ if (p2 && (flags & GLOB_PERIOD) && de->d_name[0]=='.' && (!de->d_name[1] || de->d_name[1]=='.' && !de->d_name[2]) - && fnmatch(p, de->d_name, fnm_flags | FNM_PERIOD)) + && fnmatch(pat, de->d_name, fnm_flags | FNM_PERIOD)) continue; - if (*d) { - memcpy(name, d, l); - name[l] = '/'; - strcpy(name+l+1, de->d_name); - } else { - name = de->d_name; - } - if (p2) { - if ((error = match_in_dir(name, p2, flags, errfunc, tail))) { - closedir(dir); - return error; - } - } else { - int mark = 0; - if (flags & GLOB_MARK) { - if (de->d_type && !S_ISLNK(de->d_type<<12)) - mark = S_ISDIR(de->d_type<<12); - else { - struct stat st; - stat(name, &st); - mark = S_ISDIR(st.st_mode); - } - } - if (append(tail, name, l+de->d_reclen+1, mark)) { - closedir(dir); - return GLOB_NOSPACE; - } + + memcpy(buf+pos, de->d_name, l+1); + if (p2) *p2 = '/'; + int r = do_glob(buf, pos+l, de->d_type, p2 ? p2 : "", flags, errfunc, tail); + if (r) { + closedir(dir); + return r; } } + int readerr = errno; + if (p2) *p2 = '/'; closedir(dir); - if (error && (errfunc(d, error) || (flags & GLOB_ERR))) + if (readerr && (errfunc(buf, errno) || (flags & GLOB_ERR))) return GLOB_ABORTED; + errno = old_errno; return 0; } @@ -164,19 +173,12 @@ static int sort(const void *a, const void *b) int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, int err), glob_t *restrict g) { - const char *p=pat, *d; struct match head = { .next = NULL }, *tail = &head; size_t cnt, i; size_t offs = (flags & GLOB_DOOFFS) ? g->gl_offs : 0; int error = 0; + char buf[PATH_MAX]; - if (*p == '/') { - for (; *p == '/'; p++); - d = "/"; - } else { - d = ""; - } - if (!errfunc) errfunc = ignore_err; if (!(flags & GLOB_APPEND)) { @@ -185,9 +187,14 @@ int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, i g->gl_pathv = NULL; } - if (strnlen(p, PATH_MAX+1) > PATH_MAX) return GLOB_NOSPACE; + if (*pat) { + char *p = strdup(pat); + if (!p) return GLOB_NOSPACE; + buf[0] = 0; + error = do_glob(buf, 0, 0, p, flags, errfunc, &tail); + free(p); + } - if (*pat) error = match_in_dir(d, p, flags, errfunc, &tail); if (error == GLOB_NOSPACE) { freelist(&head); return error; -- 2.25.1