2 * COMPONENT_NAME: austext
12 * (C) COPYRIGHT International Business Machines Corp. 1992,1995
14 * Licensed Materials - Property of IBM
15 * US Government Users Restricted Rights - Use, duplication or
16 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
18 /************************* BMSTRSTR.C ****************************
19 * $TOG: bmstrstr.c /main/6 1998/04/17 11:25:23 mgreess $
20 * Original module named fastsearch.c
21 * and included colocation string search functions.
22 * Modification of Boyer-Moore-Horspool algorithm,
23 * Sec 10.5.2 of Information Retrieval, Frakes and Baeza-Yates, editors.
24 * Provides a generalized boyer-moore
25 * strstr() function. The table used in the BMH algorithm is
26 * generated in a separate function to improve efficiency when
27 * looking for the same substring pattern in multiple text strings.
28 * The 'length' arguments can be passed if known, or passed as
29 * strlen(xxx) if not known. HOWEVER the string arrays MUST be at
30 * least 1 char larger then strlen() says ('cause we insert a \0).
31 * This whole thing has been coded for SPEED!
34 * Revision 2.2 1995/10/26 15:37:42 miker
37 * Revision 2.1 1995/09/22 19:10:39 miker
38 * Freeze DtSearch 0.1, AusText 2.1.8
40 * Revision 1.2 1995/08/31 22:12:05 miker
41 * Minor changes for DtSearch.
46 #include <sys/param.h>
49 #define __strcmp strcmp
52 /*********#define TEST_BMSTRSTR**************/
54 /****************************************/
58 /****************************************/
59 /* Builds the table for search substring 'pattern'
60 * used by the BMH core search algorithm.
61 * 'patlen' is the string length of 'pattern'.
62 * The caller defines and passes 'bmhtable',
63 * an array of long integers (size_t) of size MAX_BMHTAB.
64 * This function initializes bmhtable for later search call.
67 unsigned char *pattern,
73 for (k = 0; k < MAX_BMHTAB; k++)
76 for (k = 0; k < patlen; k++)
77 bmhtable[pattern[k]] = (patlen - k);
79 } /* bmhtable_build() */
82 /****************************************/
86 /****************************************/
87 /* Performs 'core' BMH search after bmhtable is built.
88 * Returns ptr to first occurrence of pattern in text or NULL.
89 * WARNING! IF EITHER txtlen OR patlen <= 0, THIS FUNCTION WILL CRASH!!!
90 * Pattern and patlen MUST BE identical with those used to build
97 unsigned char *pattern,
101 register unsigned char
102 lastchar = pattern[patlen - 1];
103 register unsigned char
105 register unsigned char
107 register unsigned char
114 /* Terminate pattern with a char we KNOW is not in text.
115 * Note that this requires string to have room for \0 at end.
117 savechar = pattern[patlen];
118 pattern[patlen] = '\0';
120 last = text + txtlen;
121 for (cp = text + patlen - 1; cp < last; cp += bmhtable[textchar]) {
123 * Check if last character matches. If it doesn't, no need
124 * to check any further.
126 if ((textchar = *cp) != lastchar)
130 if (!__strcmp ((char *) (cp + 1 - patlen), (char *) pattern))
131 result = cp + 1 - patlen;
136 pattern[patlen] = savechar; /* restore last char */
137 return (char *) result;
141 /****************************************/
145 /****************************************/
146 /* Search in text [1..txtlen] for pattern [1..patlen].
147 * Returns ptr to first occurrence of pattern, or NULL.
152 unsigned char *pattern,
155 size_t bmhtable[MAX_BMHTAB];
157 bmhtable_build (pattern, patlen, bmhtable);
158 return bmhcore (text, txtlen, pattern, patlen, bmhtable);
162 #ifdef TEST_BMSTRSTR /* for test only */
163 #include <sys/stat.h>
164 /****************************************/
168 /****************************************/
169 /* tests bmstrstr() against standard strstr() on a specified file */
175 char fname[BUFSIZ+1];
176 char pattern[MAXPATHLEN+1];
177 char *readbuf = NULL;
181 printf ("\nEnter a filename (Ctrl-C quits) > ");
184 fgets (fname, sizeof(fname), stdin);
185 if (strlen(fname) && fname[strlen(fname)-1] == '\n')
186 fname[strlen(fname)-1] = '\0';
188 if ((f = fopen (fname, "r")) == NULL) {
189 printf ("Can't open '%s': %s\n", fname, strerror (errno));
193 fstat (fileno (f), &statbuf);
194 if (fsize > statbuf.st_size) {
198 fsize = statbuf.st_size;
200 readbuf = malloc (fsize + 64L);
202 fread (readbuf, fsize, 1L, f);
205 printf ("Enter a search pattern > ");
208 fgets (pattern, sizeof(pattern), stdin);
209 if (strlen(pattern) && pattern[strlen(pattern)-1] == '\n')
210 pattern[strlen(pattern)-1] = '\0';
212 ptr = bmstrstr (readbuf, fsize, pattern, strlen (pattern));
214 puts ("bmstrstr: Pattern not found.");
216 printf ("bmstrstr: Pattern found at offset %ld.\n", ptr - readbuf);
218 ptr = strstr (readbuf, pattern);
220 puts ("strstr: Pattern not found.");
222 printf ("strstr: Pattern found at offset %ld.\n", ptr - readbuf);
225 } /* main() test program */
229 /************************* BMSTRSTR.C ****************************/