2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these libraries and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
24 * COMPONENT_NAME: austext
26 * FUNCTIONS: euro_lstrupr
45 * (C) COPYRIGHT International Business Machines Corp. 1995,1996
47 * Licensed Materials - Property of IBM
48 * US Government Users Restricted Rights - Use, duplication or
49 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
51 /******************** LANG.C ********************
52 * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
54 * Includes load_language(), unload_language(), and functions and data for
55 * parsing and stemming European languages in DtSearch/AusText.
56 * Incorporates p/o socrates.c, p/o proctext.c, parser.c
57 * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
58 * Related to similar semantic modules repackaged into semantic.c.
59 * Paice suffix removal algorithm from C. Paice, 1990,
60 * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
63 * Revision 2.13 1996/03/25 18:55:26 miker
64 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
66 * Revision 2.12 1996/03/25 17:00:19 miker
67 * Cleanup compiler warning.
69 * Revision 2.11 1996/03/13 22:58:13 miker
70 * Changed char to UCHAR several places.
72 * Revision 2.10 1996/03/05 16:49:58 miker
73 * Move COMMENT_CHARS to SearchP.h.
75 * Revision 2.9 1996/03/05 16:31:20 miker
76 * Added test of PA_MSGS for yacc-based boolean queries.
77 * Made comment chars in linguistic files independent of locale.
78 * Changed several char ptrs to unsigned char so parser will
79 * work when compiled under default signed char compilers.
80 * Simplified several statements with LHS *var++ for same reason.
82 * Revision 2.8 1996/02/05 16:16:05 miker
85 * Revision 2.7 1996/02/05 16:10:54 miker
86 * load_paice_suffixes: discard .sfx lines beginning with all numeric
87 * first token for compatibility with older file formats.
89 * Revision 2.6 1996/02/01 19:11:43 miker
90 * AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
91 * Moved charmaps to new module langmap.c. Removed hard coded
92 * paice stemmer values--now dynamic from .sfx file.
94 * Revision 2.5 1995/10/26 14:55:28 miker
97 * Revision 2.4 1995/10/19 20:54:36 miker
98 * Increased msg buf sizes to accommodate larger database file names.
100 * Revision 2.3 1995/10/06 14:39:45 miker
101 * Bug fix: coredump loading multiple databases
104 * Revision 2.2 1995/10/03 21:39:10 miker
105 * Changed teskey_parser, paice_stemmer, and null_stemmer
106 * to return number of words parsed/stemmed, not just boolean.
108 * Revision 2.1 1995/09/22 21:00:19 miker
109 * Freeze DtSearch 0.1, AusText 2.1.8
111 * Revision 1.3 1995/09/19 22:08:28 miker
112 * Added support for loading and parsing Japanese language DtSrLaJPN.
114 * Revision 1.2 1995/09/05 21:34:52 miker
115 * Fixed bug: search engine wouldn't parse words of exactly
118 * Revision 1.1 1995/08/31 21:03:44 miker
127 #include <sys/stat.h>
129 #define X_INCLUDE_STRING_H
130 #define XOS_USE_NO_LOCKING
131 #include <X11/Xos_r.h>
#define PROGNAME	"LANG"
#define EXT_SUFFIX	".sfx"	/* standard paice suffix file format */
#define OUTBUFSZ	6140
#define SFX_DELIMS	" \t\n"	/* token delimiters in .sfx files */

/* TRUE if character c is flagged as a VOWEL in the paice charmap.
 * The argument is parenthesized so that the UCHAR cast applies to
 * the whole expression passed in, not just to its first operand.
 */
#define IS_VOWEL(c)	((paice_charmap [(UCHAR)(c)] & VOWEL) != 0)
141 /************************************************/
145 /************************************************/
146 /* List of Paice suffix removal rules from .sfx files */
147 typedef struct prule_t {
148 struct prule_t *link; /* Ptr to next list node */
149 UCHAR *suffix; /* Applicable suffix string, backwards */
150 UCHAR suflen; /* Length of suffix */
151 char must_be_intact; /* Optional '*'. Rule only applies
153 UCHAR remove_count; /* Number of suffix chars to remove */
154 UCHAR aplen; /* Length of apndstr */
155 UCHAR *apndstr; /* Optional append string */
156 char is_last_rule; /* '$' terminate or '>' continue algorithm */
/* Prototypes for helpers whose definitions are not in the visible
 * part of this file. */
159 char *ensure_end_slash (char *pathstr);
160 void unload_jpn_language (DBLK *dblk);
162 /************************************************/
166 /************************************************/
/* Trace switches.  When nonzero, the routines in this file that test
 * them write diagnostic output to aa_stderr. */
167 int debugging_loadlang = FALSE;
168 int debugging_loadword = FALSE;
169 int debugging_search_wordtree = FALSE;
170 int debugging_teskey = FALSE;
171 int debugging_paice = FALSE;
/* Working state of the paice stemmer: paice_charmap is the table
 * consulted by IS_VOWEL(), and paicebuf holds the stem that
 * is_matching_rule() compares suffixes against. */
172 static int *paice_charmap;
173 static UCHAR paicebuf [DtSrMAXWIDTH_HWORD + 2];
/* Tested by is_matching_rule(): rules flagged must_be_intact are
 * skipped when this is zero (presumably cleared once a rule has
 * changed the word -- TODO confirm where it is reset). */
175 static int word_is_intact;
177 /* Language strings correspond to DtSrLa.. constants. */
/* Indexed by dblk->dbrec.or_language when default stoplist and
 * suffix file names are generated.
 * NOTE(review): most initializer lines were not visible in the
 * excerpt that was reviewed; confirm there is one entry per
 * language number. */
178 static char *lang_fnames[] = {
180 "eng", /* 1 ('eng2' same files as 'eng') */
186 "jpn", /* 7 ('jpn2' same files as 'jpn') */
191 /************************************************/
195 /************************************************/
196 /* Returns language name string given language number */
197 static char *language_name (DtSrINT16 langno)
199 static char *language_names[] = {
200 "English-ASCII", /* 0 = DtSrLaENG */
201 "English-Latin1", /* 1 = DtSrLaENG2 */
202 "Spanish", /* 2 = DtSrLaESP */
203 "French", /* 3 = DtSrLaFRA */
204 "Italian", /* 4 = DtSrLaITA */
205 "German", /* 5 = DtSrLaDEU */
206 "Japanese-comp" /* 6 = DtSrLaJPN */
207 "Japanese-.knj" /* 7 = DtSrLaJPN2 */
212 else if (langno > DtSrLaLAST)
213 return "(Custom Language)";
215 return language_names [langno];
216 } /* language_name() */
219 /************************************************/
221 /* search_wordtree */
223 /************************************************/
225 * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
226 * Searches a word list in a binary WORDTREE.
227 * Passed wordstring is presumed to be a clean,
228 * uppercase word token string terminated by \0.
229 * Variables are static for speeeeed.
230 * Returns TRUE if successful search, else FALSE.
231 * See also search_wordtree_jpn() in jpn.c
233 static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
235 static int direction;
236 static WORDTREE *node;
238 if (debugging_search_wordtree)
239 fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
241 /* MAIN SEARCH LOOP: binary tree search */
242 for (node = wordtree; node != NULL; ) {
243 if ((direction = strcmp ((char *) wordstring, node->word)) == 0) {
244 if (debugging_search_wordtree)
245 fprintf (aa_stderr, " HIT!\n");
248 /* Descend left or right depending on word */
249 if (debugging_search_wordtree)
250 fprintf (aa_stderr, " %c '%s'\n",
251 (direction < 0) ? 'L' : 'R', (char *) node->word);
257 if (debugging_search_wordtree)
258 fprintf (aa_stderr, " MISS.\n");
260 } /* search_wordtree() */
263 /************************************************/
267 /************************************************/
269 * Teskey_parser() is derived from the former Socrates() in socrates.c.
270 * Returns next teskey-parsed word token from a character stream.
271 * Called from (1) dtsrindex, where readchar_ftext() cofunction
272 * reads the .fzk file document 'stream', or (2) search engine
273 * query parsers, where readchar_string() cofunction 'reads'
274 * from the query string.
275 * (The word hiliting parser does not directly call teskey_parser; it has
276 * its own simplified equivalent to the parsing algorithms herein.)
278 * First call passes args in PARG structure. This resets end of
279 * text block (ETX) flag, resets 'offset' counter to zero, etc.
280 * Subsequent calls should pass NULL, and parser returns
281 * next token in block, until reader cofunction reads ETX,
282 * ie special ETX char ('\0'). Subsequent calls to parser
283 * return NULL meaning "no tokens left in current stream".
284 * Reader cofunctions tolerate repeated calls after
285 * the first ETX, still returning '\0'.
287 * This parser presumes all incoming text is unformatted.
288 * Since parser accesses streams a char at a time it does
289 * not require periodic line feeds or anything else.
291 * Parser also returns offset information: number of bytes
292 * since beginning of text block.
294 * Variables are static for speeeeeeed.
296 * OUTPUT FORMAT: NULL or a static C string containing a single
297 * parsed word token. Word buffer reused at next call.
298 * Each word is translated as follows:
299 * All alphas TO UPPERCASE.
300 * Teskey algorithm used to find word boundaries.
301 * Always keeps include-list words.
302 * Throws away stoplist words, very short words, and very long words.
303 * All intervening nonconcordables discarded.
305 * There is a slight mod to the published Teskey algorithm.
306 * Words can begin with optionally concordable chars
307 * but not end with them. For example if '-' is optionally
308 * concordable, '-foo-' will be parsed into '-foo'.
/* teskey_parser -- hands back one parsed word token per call.
 *
 * parg: parser arguments for the first call on a text block.  The
 *	visible setup code copies the word size limits and the charmap
 *	from the dblk, takes offsetp and the PA_HILITING and PA_MSGS
 *	flags from parg, and selects readchar_string() or
 *	readchar_ftext() as the character reader (cofunction).
 *
 * Returns outbuf, a static buffer that is reused by the next call,
 * after storing the token offset through offsetp.  Per the description
 * preceding this function, NULL is returned once the text block is
 * exhausted.
 *
 * Characters are classified through charmap as NON_CONCORD,
 * CONCORDABLE, or (otherwise) optionally concordable, and a three
 * state machine (BETW_WORDS, IN_WORD, TOO_LONG) assembles candidate
 * tokens.  Concordable chars are replaced by the low byte of their
 * charmap entry (the uppercase conversion).  A candidate found in the
 * include list is reported ahead of the minimum length and stop list
 * tests; candidates failing those tests are skipped by jumping to
 * READ_ANOTHER_WORD.
 *
 * NOTE(review): the user messages below format parg->string, yet the
 * description preceding this function says calls after the first pass
 * a NULL parg.  The guards around those messages are not visible in
 * the excerpt that was reviewed -- confirm parg cannot be NULL there.
 * NOTE(review): the messages are built with sprintf into fixed size
 * buffers; the lengths of dblk->label and (for the %s conversions)
 * parg->string are not bounded by anything visible here.
 */
310 char *teskey_parser (PARG *parg)
312 static READCFP cofunction;
313 static void *cofunction_arg;
314 static DBLK *dblk = NULL;
315 static UCHAR *outbuf = NULL;
316 static size_t outbufsz = 0;
317 static UCHAR *endmaxword; /* end largest possible output word */
318 static UCHAR *outp; /* next loc in outbuf */
320 static int minwordsz, maxwordsz;
322 static enum {BETW_WORDS, IN_WORD, TOO_LONG}
324 static long *offsetp, readcount, candidate_offset;
325 static int is_hiliting;
328 /* If first call for current text block... */
331 minwordsz = dblk->dbrec.or_minwordsz;
332 maxwordsz = dblk->dbrec.or_maxwordsz;
333 charmap = dblk->charmap;
334 offsetp = parg->offsetp;
335 is_hiliting = (parg->flags & PA_HILITING);
336 add_msgs = (parg->flags & PA_MSGS);
337 if (charmap == NULL) {
338 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 4,
339 "%s dblk not initialized.\n"),
345 cofunction_arg = parg->string;
346 cofunction = (READCFP) readchar_string;
348 else if (parg->ftext) {
349 cofunction_arg = parg;
350 cofunction = (READCFP) readchar_ftext;
353 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 5,
354 "%s Program Error: parg contains neither file nor string.\n"),
359 if (outbufsz <= maxwordsz) {
362 outbufsz = maxwordsz + 8;
363 outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
365 endmaxword = outbuf + maxwordsz;
366 if (debugging_teskey)
368 "teskey: start of text block, maxwsz=%d outbufsz=%lu\n",
369 maxwordsz, (unsigned long) outbufsz);
373 /* CANDIDATE WORD LOOP: Read text chars into outbuf.
374 * Exit loop when outbuf contains one candidate token or at ETX.
378 tpstate = BETW_WORDS;
379 while ((*outp = cofunction (cofunction_arg))) {
381 cofunction_arg = NULL;
383 /*------------- BETW_WORDS State ------------
384 * Reader is between word tokens.
386 if (tpstate == BETW_WORDS) {
388 * Discard nonconcordable chars between words.
390 if ((charmap[*outp] & NON_CONCORD) != 0)
393 * Fully concordable char is definite start of new word.
394 * Convert to uppercase and go get next char.
396 if ((charmap[*outp] & CONCORDABLE) != 0) {
397 *outp = charmap[*outp] & 0x00ff;
399 candidate_offset = readcount;
404 * Must be optionally concordable. It can only
405 * start a new word if next char is concordable.
406 * If so, convert a fully concordable char
407 * to uppercase and go get next char.
408 * Otherwise discard just like non_concord.
411 if ((*outp = cofunction(NULL)))
413 if ((charmap[*outp] & CONCORDABLE) != 0) {
414 *outp = charmap[*outp] & 0x00ff;
416 candidate_offset = readcount - 1;
424 } /* endif BETW_WORDS */
427 /*------------- IN_WORD State ------------
428 * Reader is in middle of a word.
429 * Convert all concordables to uppercase and append.
430 * Terminate word at first non_concord.
431 * Non_concords treatment depends on next char.
433 else if (tpstate == IN_WORD) {
434 if ((charmap[*outp] & CONCORDABLE) != 0) {
435 if (outp < endmaxword) {
436 *outp = charmap[*outp] & 0x00ff;
441 if (debugging_teskey)
443 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
444 candidate_offset-1, outbuf);
446 char msgbuf [DtSrMAXWIDTH_HWORD + 100];
447 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 8,
448 "%s '%.*s...' is larger\n"
449 "than the maximum word size of database '%s'.") ,
450 PROGNAME"449", maxwordsz,
451 parg->string, dblk->label);
452 DtSearchAddMessage (msgbuf);
460 if ((charmap[*outp] & NON_CONCORD) != 0) {
464 /* Must be opt_concord... */
466 if ((*outp = cofunction(NULL)))
468 if ((charmap[*outp] & CONCORDABLE) != 0) {
469 if (outp < endmaxword) {
470 *outp = charmap[*outp] & 0x00ff; /* uppercase */
475 if (debugging_teskey)
477 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
478 candidate_offset-1, outbuf);
484 else { /* next char NOT concordable...*/
488 } /* endif IN_WORD */
491 /*------------- TOO_LONG State ------------
492 * Reader is in middle of a word that exceeds max word size.
493 * Discard all concordables and opt_concords until we
494 * can get between words again with a clear non_concord.
496 else if (tpstate == TOO_LONG) {
497 if ((charmap[*outp] & NON_CONCORD) != 0) {
499 tpstate = BETW_WORDS;
504 /*------------- UNKNOWN State ------------*/
506 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 10,
507 "%s Program Error: Unknown parser state.\n"),
511 } /* end read loop for next CANDIDATE WORD */
513 /*---------- TEST FOR ETX -------------*/
514 if (outbuf[0] == 0) {
515 if (debugging_teskey)
516 fprintf (aa_stderr, "teskey: etx\n");
519 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 12,
520 "%s '%.120s' is not a valid word in database '%s'.") ,
521 PROGNAME"506", parg->string, dblk->label);
522 DtSearchAddMessage (msgbuf);
527 wordlen = strlen ((char *) outbuf);
528 candidate_offset--; /* token offset is one less than number of reads */
529 if (debugging_teskey)
530 fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
531 candidate_offset, outbuf);
534 if (debugging_teskey)
535 fprintf (aa_stderr, ", (hiliting, skip tree searches)");
539 /*--------- INCLUDE LIST ----------
540 * Search before testing for stoplist or minimum word length.
542 if (dblk->inclist != NULL) {
543 if (search_wordtree (dblk->inclist, outbuf)) {
544 if (debugging_teskey)
545 fprintf (aa_stderr, ", (INCLUDE LIST)");
550 /*--------- TOO SHORT -----------*/
551 if (wordlen < minwordsz) {
552 if (debugging_teskey)
553 fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
556 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 17,
557 "%s '%s' is less than the\n"
558 "minimum word size of database '%s'.") ,
559 PROGNAME"543", parg->string, dblk->label);
560 DtSearchAddMessage (msgbuf);
563 goto READ_ANOTHER_WORD;
566 /*----------- STOP LIST -------------*/
567 if (dblk->stoplist != NULL) {
568 if (search_wordtree (dblk->stoplist, outbuf)) {
569 if (debugging_teskey)
570 fprintf (aa_stderr, ", (STOP LIST)\n");
573 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 19,
574 "%s The word '%s' is not indexed in database '%s'.") ,
575 PROGNAME"558", parg->string, dblk->label);
576 DtSearchAddMessage (msgbuf);
579 goto READ_ANOTHER_WORD;
584 /* Word is correctly parsed and passes all dblk filters. */
585 if (debugging_teskey)
586 fprintf (aa_stderr, ", ...good word\n");
588 *offsetp = candidate_offset;
589 return (char *) outbuf;
590 } /* teskey_parser() */
593 /************************************************/
597 /************************************************/
598 /* Verifies passed word token is teskey-concordable
599 * in code page of passed charmap. Used in validating
600 * word files. Returns TRUE if all chars concordable
601 * or optionally concordable, else returns FALSE.
603 int is_concordable (char *word, int *charmap)
606 for (cptr = (UCHAR *)word; *cptr != 0; cptr++)
607 if ((charmap[*cptr] & NON_CONCORD) != 0)
610 } /* is_concordable() */
613 /************************************************/
617 /************************************************/
618 /* Called by load_stop_list(), load_include_list(), etc,
619 * to read an appropriate word list file into binary tree structures.
621 * INPUT FILE FORMAT: One word per line, all chars teskey concordable.
622 * Preferred order is frequency of occurrence in the corpus
623 * to make searches efficient. Otherwise the words should at least
624 * be in random order or an order that will approximate a binary search.
625 * If first char is any of COMMENT_CHARS, line is ignored as comments.
626 * Ascii spaces, tabs, or newline delimits the first word token--
627 * anything else on the line is ignored as comments.
628 * Optionally characters in word token will be checked for teskey
631 * RETURNS 0 if file successfully loaded, returns 1 if file missing,
632 * returns 2 and messages in global msglist if file has fatal errors.
/* load_wordtree body -- reads a word list file into a binary tree.
 * (The function header was not visible in the excerpt that was
 * reviewed; callers in this file pass the address of the tree head,
 * the dblk, the file name, and a flag.)
 *
 * Visible behavior:
 *  - fopen failure with ENOENT is not treated as an error; any other
 *    fopen failure adds a message through DtSearchAddMessage.
 *  - Lines whose first char is one of COMMENT_CHARS are skipped, as
 *    are lines without a token.  The first token (delimited by space,
 *    tab, or newline) is converted with dblk->lstrupr.
 *  - Tokens failing is_concordable() are reported with file name and
 *    line number.
 *  - Node and word text share one austext_malloc block: the word is
 *    copied to the address just past the WORDTREE node.
 *  - The node is placed by strcmp order; a duplicate word adds a
 *    message but is not fatal.
 *
 * NOTE(review): the allocation size uses i, presumably strlen of the
 * token; that assignment is not visible here -- confirm.
 * NOTE(review): whether the node allocated for a duplicate word is
 * freed, and whether the file is closed on every path, cannot be seen
 * in the reviewed excerpt -- confirm.
 */
646 char sprintbuf [_POSIX_PATH_MAX + 1024];
649 WORDTREE **this_link;
650 _Xstrtokparams strtok_buf;
652 if (debugging_loadlang)
653 fprintf (aa_stderr, PROGNAME"1071 "
654 "load_wordtree: db=%s fname='%s'\n",
655 NULLORSTR(dblk->name), NULLORSTR(fname));
657 if ((fileid = fopen (fname, "rt")) == NULL) {
658 /* Not being able to find the file is not an error.
659 * We indicate that with the return code.
660 * But any other error (like permissions) is fatal.
662 if (errno == ENOENT) {
663 if (debugging_loadlang)
664 fputs (" ...file not found.\n", aa_stderr);
669 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
670 PROGNAME"362", fname, strerror(errno));
671 DtSearchAddMessage (sprintbuf);
676 /*--------- Main Read Loop ----------*/
678 while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
681 * Ignore comment lines beginning with punctuation char.
682 * Ignore empty lines (strtok returns NULL, no tokens).
683 * Otherwise first or only word on line is the desired word.
685 if (strchr (COMMENT_CHARS, readbuf[0]))
687 if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
689 dblk->lstrupr (token, dblk);
691 if (debugging_loadword)
692 fprintf (aa_stderr, " WORD: '%s' ", token);
694 /* If requested confirm all chars are teskey-concordable. */
696 if (!is_concordable (token, dblk->charmap)) {
697 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 400,
698 "%s: %s, line %ld: Invalid chars in word '%s'."),
699 PROGNAME"400", fname, linecount, token);
700 DtSearchAddMessage (sprintbuf);
705 /* Unless we've already detected some errors,
706 * allocate a new node and load its data fields.
711 new = austext_malloc (sizeof(WORDTREE) + i + 4,
712 PROGNAME"104", NULL);
716 new->word = (void *) (new + 1);
717 strcpy (new->word, token);
719 /* Descend binary tree and insert in correct alphabetical place */
720 is_duplicate = FALSE;
721 for (this_link = treetop; *this_link != NULL; ) {
722 i = strcmp (new->word, (*this_link)->word);
724 /* test for duplicate word */
726 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 423,
727 "%s Word '%s' in '%s' is a duplicate."),
728 PROGNAME"423", token, fname);
729 DtSearchAddMessage (sprintbuf);
730 /* duplicates aren't fatal, just ignore the word */
732 break; /* no point in continuing descent */
735 /* Descend tree to find correct insertion point */
736 if (debugging_loadword)
737 fputc(((i < 0)? 'L' : 'R'), aa_stderr);
738 this_link = (WORDTREE **) ((i < 0) ?
739 &(*this_link)->llink : &(*this_link)->rlink);
740 } /* end forloop to find tree insertion point */
742 /* Don't link anything if error found while descending tree */
744 if (debugging_loadword)
745 fputs (" duplicate!\n", aa_stderr);
750 /* Insert new node at current location in tree */
752 if (debugging_loadword)
753 fputs(" .\n", aa_stderr);
754 } /* end of read loop */
759 if (debugging_loadlang)
761 PROGNAME"1186 load word file '%s' failed.\n", fname);
765 if (debugging_loadlang)
767 PROGNAME"1193 load word file '%s' successful.\n", fname);
770 } /* load_wordtree() */
773 /************************************************/
777 /************************************************/
778 /* Formerly free_bintree() in msgutil.c.
779 * Frees storage for all nodes in a WORDTREE and
780 * sets its top-of-list pointer to NULL.
781 * Works only for node structures where all memory
782 * was allocated in a single call to malloc().
783 * Uses link inversion traversal (eg, Data Structure Techniques,
784 * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
785 * at preorder visit, and node is freed at postorder visit.
/* free_wordtree -- releases the tree whose root is *wordtree_head.
 *
 * wordtree_head: address of the top-of-list pointer; it is set to
 *	NULL when the traversal reaches its end (prev == NULL).
 *
 * During the walk the word pointer of each node is overwritten with
 * 0 or 1 and serves as the traversal TAG (see the TAG comments), so
 * the word pointers are no longer usable once this routine starts.
 * The description preceding this function states that this works
 * only when each node was allocated by a single call to malloc().
 *
 * NOTE(review): the link inversion steps and the statement that
 * frees each node are not visible in the excerpt that was reviewed,
 * so they are not described here.  Presumably an empty tree returns
 * at once after the NULL test -- confirm.
 */
787 static void free_wordtree (WORDTREE ** wordtree_head)
790 WORDTREE *prev = NULL;
791 WORDTREE *pres = *wordtree_head;
793 if (*wordtree_head == NULL)
797 pres->word = (void *) 0; /* preorder visit: TAG = 0 */
808 pres->word = (void *) 1; /* TAG = 1 */
816 if (prev == NULL) { /* end of algorithm? */
817 *wordtree_head = NULL;
820 if (prev->word == (void *) 0) { /* go up left leg */
826 else { /* go up right leg */
828 prev->word = (void *) 0; /* restore TAG = 0 */
831 goto POSTORDER_VISIT;
833 } /* free_wordtree() */
836 /************************************************/
838 /* load_include_list */
840 /************************************************/
841 /* Builds include list by reading include file
842 * into a binary tree structure.
843 * Unlike stoplists, include-lists are optional.
844 * Also unlike stoplists, there are no language default include-lists.
845 * 'dblist' may be NULL.
846 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
/* load_include_list -- loads the optional include list of a database.
 *
 * dblk:   database block; inclist is reset to NULL on entry and
 *	   fname_inc may be replaced as described below.
 * dblist: list of dblks searched for an already loaded copy of the
 *	   same file; may be NULL.
 *
 * Visible behavior:
 *  - If fname_inc is NULL and the database name is empty, fname_inc
 *    becomes an empty string literal and there is no include list.
 *  - If fname_inc is NULL otherwise, a name is built from the dblk
 *    path (an empty path is substituted when path is NULL), the
 *    database name, and EXT_INCLIST.
 *  - If another dblk in dblist has the same fname_inc, its tree is
 *    shared and LF_DUP_INC is set in lang_flags.
 *  - Otherwise the file is read with load_wordtree().  After it
 *    returns, a generated file name is reset to an empty string with
 *    no list, while a caller supplied name leads to an ENOENT
 *    message; the conditions selecting these paths are not visible
 *    in the excerpt that was reviewed.
 *
 * NOTE(review): fname_inc may end up pointing at either a string
 * literal or malloc storage, and the generated buffer appears to be
 * dropped when the name is reset -- confirm how it is freed.
 * NOTE(review): sprintbuf holds only 512 bytes although the message
 * includes fname_inc; load_stop_list sizes the equivalent buffer as
 * _POSIX_PATH_MAX + 512 -- confirm a long caller supplied name
 * cannot overflow it.
 */
848 static int load_include_list (DBLK *dblk, DBLK *dblist)
851 int filename_was_null = (dblk->fname_inc == NULL);
853 char sprintbuf [512];
855 dblk->inclist = NULL; /* just to be sure */
857 if (debugging_loadlang)
859 PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
860 NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
861 language_name(dblk->dbrec.or_language));
863 /* If file name not provided, generate one based on
864 * dblk's path, database name, and default extension.
866 if (filename_was_null) {
867 if (dblk->name[0] == 0) {
868 dblk->fname_inc = "";
869 dblk->inclist = NULL;
870 if (debugging_loadlang)
871 fprintf (aa_stderr, PROGNAME"1339 "
872 "No inclist because neither fname nor dbname provided.\n");
875 if (dblk->path == NULL)
876 dblk->path = strdup("");
877 dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
878 PROGNAME"1187", NULL);
879 strcpy (dblk->fname_inc, dblk->path);
880 ensure_end_slash (dblk->fname_inc);
881 strcat (dblk->fname_inc, dblk->name);
882 strcat (dblk->fname_inc, EXT_INCLIST);
884 if (debugging_loadlang)
886 PROGNAME"1350 Include list file name = '%s'.\n",
889 /* Don't reload the same file if it's already
890 * been loaded into a previous dblk in a list.
891 * Code works just fine if dblist == NULL.
893 for (db = dblist; db != NULL; db = db->link) {
894 if (db == dblk || db->fname_inc == NULL)
896 if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
897 dblk->inclist = db->inclist;
898 dblk->lang_flags |= LF_DUP_INC;
899 if (debugging_loadlang)
900 fprintf (aa_stderr, PROGNAME"1363 "
901 "Using previously loaded inclist, db='%s'.\n",
907 /* Include list is optional so missing file is
908 * not an error unless caller named a specific file.
910 i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
916 if (filename_was_null) {
917 dblk->fname_inc = "";
918 dblk->inclist = NULL;
923 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
924 PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
925 DtSearchAddMessage (sprintbuf);
932 } /* load_include_list() */
935 /************************************************/
939 /************************************************/
940 /* Builds stoplist by reading stoplist file into a
941 * binary tree structure. File name can be
942 * (1) passed in dblk.fname_stp,
943 * (2) generated from dblk path, name, and '.stp',
944 * (3) default for dblk path, language, and '.stp'.
945 * 'dblist' may be NULL.
946 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
948 static int load_stop_list (DBLK *dblk, DBLK *dblist)
952 char sprintbuf [_POSIX_PATH_MAX + 512];
955 dblk->stoplist = NULL; /* just to be sure */
957 if (debugging_loadlang)
959 PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
960 NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
961 language_name(dblk->dbrec.or_language));
963 /* If file name not provided, generate one based on
964 * dblk's path, database name, and default extension.
965 * And if that doesn't work, generate one based on
966 * dblk's path, language, and default extension.
968 if (dblk->fname_stp == NULL) {
969 if (dblk->path == NULL)
970 dblk->path = strdup("");
971 dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
972 PROGNAME"919", NULL);
974 strcpy (dblk->fname_stp, dblk->path);
975 ensure_end_slash (dblk->fname_stp);
976 strcat (dblk->fname_stp, dblk->name);
977 strcat (dblk->fname_stp, EXT_STOPLIST);
979 stat (dblk->fname_stp, &statbuf);
980 if (errno == ENOENT) {
981 strcpy (dblk->fname_stp, dblk->path);
982 ensure_end_slash (dblk->fname_stp);
983 strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
984 strcat (dblk->fname_stp, EXT_STOPLIST);
987 if (debugging_loadlang)
989 PROGNAME"1448 Stoplist file name = '%s'.\n",
992 /* Don't reload the same file if it's already
993 * been loaded into a previous dblk in a list.
994 * Code works just fine if dblist == NULL.
996 for (db = dblist; db != NULL; db = db->link) {
997 if (db == dblk || db->fname_stp == NULL)
999 if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
1000 dblk->stoplist = db->stoplist;
1001 dblk->lang_flags |= LF_DUP_STP;
1002 if (debugging_loadlang)
1003 fprintf (aa_stderr, PROGNAME"1460 "
1004 "Using previously loaded stoplist, db='%s'.\n",
1010 /* Stop lists are mandatory--a missing stoplist is fatal. */
1011 i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
1014 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
1015 PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
1016 DtSearchAddMessage (sprintbuf);
1019 } /* load_stop_list() */
1022 /************************************************/
1024 /* free_paice_rules */
1026 /************************************************/
1027 /* Frees all allocated storage for a set of paice rules, typically
1028 * loaded at dblk.stem_extra. Called by REINIT routines and
1029 * by load_paice_suffixes() when cleaning up after an error.
/* free_paice_rules -- releases a paice rules table.
 *
 * rules_table_ptr: address of the pointer to the table.  The table is
 *	walked as 256 list heads (one per single byte char), empty heads
 *	are skipped, and *rules_table_ptr is set to NULL at the end.
 *
 * NOTE(review): the statements that free each rule, its strings, and
 * the table itself are not visible in the excerpt that was reviewed,
 * so they are not described here.  Presumably a NULL table returns at
 * once after the first test -- confirm.
 */
1031 static void free_paice_rules (PRULE ***rules_table_ptr)
1035 PRULE **rules_table;
1037 if (*rules_table_ptr == NULL)
1039 rules_table = *rules_table_ptr;
1040 for (i=0; i<256; i++) {
1041 if (rules_table[i] == NULL)
1054 *rules_table_ptr = NULL;
1056 } /* free_paice_rules() */
1059 /************************************************/
1061 /* load_paice_suffixes */
1063 /************************************************/
1064 /* Loads European language paice stemmer suffix rules
1065 * into dblk.stem_extra as an array of ptrs to linked lists.
1066 * Like stop lists, sfx files can be
1067 * (1) passed in dblk.fname_sfx,
1068 * (2) generated from dblk path, dbname, and '.sfx',
1069 * (3) generated from dblk path, language, and '.sfx'.
1070 * Internal tables will be reused if file previously loaded.
1071 * Only uses single byte character sets (ascii, iso-latin-1).
1072 * Uses strtok(). dblk->charmap must already be loaded.
1073 * Will continue to parse entire file even if errors are found.
1074 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
/* load_paice_suffixes -- loads paice stemmer rules from a .sfx file.
 *
 * dblk:   database block; stem_extra is reset to NULL on entry and
 *	   receives the rules table when loading succeeds.
 * dblist: list of dblks searched for an already loaded copy of the
 *	   same file (shared via LF_DUP_SFX); may be NULL.
 *
 * Visible behavior:
 *  - A NULL fname_sfx is replaced by a name built from dblk path,
 *    database name, and EXT_SUFFIX, falling back to the language
 *    name from lang_fnames.
 *  - The rules table is 256 list heads, zero filled.  Each accepted
 *    rule is appended to the END of the list selected by the first
 *    char of its suffix, so rules keep their file order.
 *  - Per line: suffix, optional intact marker, remove count,
 *    optional append string, then the stop or continue symbol.
 *    Malformed rules add a message naming file, line, and suffix.
 *  - After the read loop free_paice_rules() is called on the table
 *    (presumably only when errors were counted -- the test is not
 *    visible), otherwise the table is stored in dblk->stem_extra.
 *
 * NOTE(review): in the all numeric first token test the loop
 * condition checks the pointer cptr itself, which never becomes
 * NULL, instead of the char it points to.  A first token made only
 * of numerals is therefore scanned past its terminating \0.  The
 * condition was very likely meant to be  *cptr != 0 .
 * NOTE(review): errno is tested right after stat() without checking
 * the return value of stat(), so a stale ENOENT from an earlier call
 * can wrongly select the language default file name.
 * NOTE(review): when fopen fails, fname_sfx is set to NULL, which
 * drops the only pointer to a name buffer allocated above -- confirm
 * whether that buffer should be freed first.
 * NOTE(review): remove_count is atoi() output cast to UCHAR, so
 * values above 255 or non numeric tokens are silently truncated or
 * read as 0.
 */
1076 static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
1080 PRULE *prule, **prule_link;
1081 PRULE **rules_table;
1082 struct stat statbuf;
1083 UCHAR *cptr, *token;
1084 char readbuf [_POSIX_PATH_MAX + 1024];
1085 char msgbuf [_POSIX_PATH_MAX + 1024];
1086 UCHAR *suffix, *apndstr;
1087 int must_be_intact, is_last_rule;
1089 int lineno, errcount;
1090 _Xstrtokparams strtok_buf;
1092 dblk->stem_extra = NULL; /* just to be sure */
1095 if (debugging_loadlang)
1097 PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
1098 NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
1099 language_name(dblk->dbrec.or_language));
1101 /* If file name not provided, generate one based on
1102 * dblk's path, database name, and default extension.
1103 * And if that doesn't work, generate one based on
1104 * dblk's path, language, and default extension.
1106 if (dblk->fname_sfx == NULL) {
1107 if (dblk->path == NULL)
1108 dblk->path = strdup("");
1109 dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
1110 PROGNAME"1113", NULL);
1112 strcpy (dblk->fname_sfx, dblk->path);
1113 ensure_end_slash (dblk->fname_sfx);
1114 strcat (dblk->fname_sfx, dblk->name);
1115 strcat (dblk->fname_sfx, EXT_SUFFIX);
1117 stat (dblk->fname_sfx, &statbuf);
1118 if (errno == ENOENT) {
1119 strcpy (dblk->fname_sfx, dblk->path);
1120 ensure_end_slash (dblk->fname_sfx);
1121 strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
1122 strcat (dblk->fname_sfx, EXT_SUFFIX);
1125 if (debugging_loadlang)
1127 PROGNAME"1740 Paice suffix file name = '%s'.\n",
1130 /* Don't reload the same file if it's already
1131 * been loaded into a previous dblk in a list,
1132 * but flag it so it won't be freed at unload_language/REINIT.
1133 * Code works just fine if dblist == NULL.
1135 for (db = dblist; db != NULL; db = db->link) {
1136 if (db == dblk || db->fname_sfx == NULL)
1138 if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
1139 dblk->stem_extra = db->stem_extra;
1140 dblk->lang_flags |= LF_DUP_SFX;
1141 if (debugging_loadlang)
1142 fprintf (aa_stderr, PROGNAME"1145 "
1143 "Using previously loaded suffixes, db='%s'.\n",
1149 fp = fopen (dblk->fname_sfx, "rt");
1152 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
1153 PROGNAME"181", dblk->fname_sfx, strerror(errno));
1154 DtSearchAddMessage (msgbuf);
1155 dblk->fname_sfx = NULL;
1159 /* Rules table will eventually be loaded at dblk.stem_extra.
1160 * It consists of 256 PRULE ptrs,
1161 * one for each possible single byte char.
1162 * Each ptr is the head of a rules list for that char.
1164 rules_table = austext_malloc (256 * sizeof(PRULE*),
1165 PROGNAME"199", &ausapi_msglist);
1166 memset (rules_table, 0, 256 * sizeof(PRULE*));
1170 /*------- Main Read Loop -------*/
1171 while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
1174 /* Ignore comment lines */
1175 if (strchr (COMMENT_CHARS, readbuf[0]))
1178 /* TOKEN #1: suffix string, backwards, all uppercase.
1179 * If missing, ignore 'empty' line.
1180 * If the first token is all numeric, ignore line
1181 * (for compatibility with older versions of file).
1183 if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
1186 for (cptr = suffix; cptr; cptr++)
1187 if ((dblk->charmap[*cptr] & NUMERAL) == 0)
1192 /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
1193 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
1195 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 51,
1196 "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
1197 PROGNAME"898", dblk->fname_sfx, lineno, suffix);
1198 DtSearchAddMessage (msgbuf);
1202 must_be_intact = FALSE;
1203 if (token[0] == '*') {
1204 must_be_intact = TRUE;
1205 /* Read next token... */
1206 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1210 /* TOKEN #3: remove-count */
1211 remove_count = (UCHAR) atoi ((char *) token);
1213 /* OPTIONAL TOKEN #4: if next token is NOT a continue
1214 * symbol ('>' or '$'), then it's an append string.
1217 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1219 if (token[0] != '$' && token[0] != '>') {
1221 /* Read next token... */
1222 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1226 /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
1227 is_last_rule = (token[0] == '$');
1229 if (debugging_loadword) {
1231 " SFX: intact?=%d stop?=%d remv=%d '%s'",
1232 (int) must_be_intact,
1237 fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
1239 fputc ('\n', aa_stderr);
1242 /* Good suffix. If we haven't had any errors yet,
1243 * add it to rules list for the first char of the suffix.
1247 prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
1248 memset (prule, 0, sizeof(PRULE));
1249 prule->suffix = (UCHAR *) strdup ((char*)suffix);
1250 prule->suflen = strlen ((char*)suffix);
1251 prule->must_be_intact = must_be_intact;
1252 prule->remove_count = remove_count;
1253 prule->is_last_rule = is_last_rule;
1255 prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
1256 prule->aplen = strlen ((char*)apndstr);
1259 prule_link = &rules_table[suffix[0]];
1261 prule_link = &(*prule_link)->link;
1262 *prule_link = prule;
1264 } /* end Main Read Loop */
1268 free_paice_rules (&rules_table);
1271 dblk->stem_extra = rules_table;
1273 /* Update last table entry */
1274 if (debugging_loadlang) {
1276 PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
1281 } /* load_paice_suffixes() */
1284 /************************************************/
1286 /* is_matching_rule */
1288 /************************************************/
1289 /* Subroutine of paice_stemmer().
1290 * Returns TRUE if passed rule can be applied to stem in paicebuf.
1291 * Else returns FALSE.
1293 static int is_matching_rule (PRULE *rule)
1298 if (debugging_paice)
1299 fprintf (aa_stderr, " test rule '%s':\t", rule->suffix);
1301 /* Skip rule if we've made at least one previous change
1302 * but the current rule requires an intact word.
1304 if (rule->must_be_intact && !word_is_intact) {
1305 if (debugging_paice)
1306 fputs ("word not intact...\n", aa_stderr);
1310 /* Do a backward strcmp on the suffix.
1311 * Skip rule if it doesn't match current paicebuf's ending chars.
1314 ptr = paicebuf + paicelen - 1;
1315 for (i = 0; i < j; i++) {
1316 if (*((rule->suffix) + i) != *ptr) {
1317 if (debugging_paice)
1318 fputs ("no match...\n", aa_stderr);
1324 if (debugging_paice)
1325 fputs ("match", aa_stderr);
1327 /* Set i = paicebuf length after removing and appending suffixes.
1328 * Used to algorithmically test remaining stem length
1329 * after tentative application of rule.
1331 i = paicelen - (rule->remove_count - rule->aplen);
1334 if (debugging_paice)
1335 fputs (", but stem too short...\n", aa_stderr);
1340 if (IS_VOWEL (paicebuf[0])) {
1341 if (debugging_paice)
1342 fputs (", and short vowel stem valid.\n", aa_stderr);
1346 if (debugging_paice)
1347 fputs (", but consonant stem too short...\n", aa_stderr);
1352 /* Remaining stem is at least 3 chars.
1353 * If it contains a vowel anywhere, it's valid.
1354 * (A 'Y' after the first char counts as a vowel).
1355 * Otherwise it's not.
1357 for (j=0; j<i; j++) {
1358 if (IS_VOWEL (paicebuf[j])) {
1360 if (debugging_paice)
1361 fputs (", and remaining stem valid.\n", aa_stderr);
1364 if (j > 0 && paicebuf[j] == 'Y')
1368 if (debugging_paice)
1369 fputs (", but remaining stem all consonants.\n", aa_stderr);
1371 } /* is_matching_rule() */
1374 /************************************************/
1378 /************************************************/
1379 /* Given a word token (ALREADY UPPERCASE) in a single byte
1380 * language such as the output of teskey_parser,
1381 * generates 'stem' by repeated suffix removal.
1382 * Returns stem token in a static buffer valid
1383 * until next call to paice_stemmer or null_stemmer.
1384 * Returned stem might be the original unmodified word.
1385 * Returned stem might also be empty string.
1386 * Returned stem is *never* NULL, even if wordin == NULL.
1387 * Input buffer will not be modified; does not use strtok.
1388 * All variables are static for speeeeeeed.
/* paice_stemmer: stems 'wordin' by repeatedly applying the Paice suffix
 * rules found at dblk->stem_extra.  All work is done in the file-scope
 * buffer paicebuf, which is also what gets returned.
 */
1390 static char *paice_stemmer (char *wordin, DBLK *dblk)
1393 PRULE *rule, **rules_table;
/* The rules table is whatever was stored at dblk->stem_extra;
 * NULL there means no suffix rules are available for this database.
 */
1400 if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
1401 fprintf (aa_stderr, catgets (dtsearch_catd, MS_lang, 31,
1402 "%s Stemmer suffixes file never loaded.\n"),
1407 /* The max length of a stem is bufsz - 2:
1408 * one for the terminating \0 and one for the
1409 * prefix ^O that identifies a stem. (But this
1410 * stemmer doesn't actually insert the ^O now.)
1412 strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD);
1413 paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0;
1414 paice_charmap = dblk->charmap;
1415 word_is_intact = TRUE;
1417 for (;;) { /*-------- Main Stemming Loop ---------*/
/* Each pass re-measures the stem; its last byte (finalc) is the index
 * into rules_table that selects the rules list to try.
 */
/* NOTE(review): if paicebuf is empty, paicelen - 1 is -1 and the read
 * of finalc lands one byte before paicebuf -- confirm that an empty
 * word cannot reach this point.
 */
1419 paicelen = strlen ((char*)paicebuf);
1420 finalc = *(paicebuf + paicelen - 1);
1421 if (debugging_paice) {
1423 "paice: '%s', rules list '%c' for database '%s'\n",
1424 paicebuf, finalc, dblk->name);
1428 /* Look for a matching rule */
1429 if ((rule = rules_table [finalc]) == NULL) {
1430 if (debugging_paice)
1431 fputs (" list is null, stop.\n", aa_stderr);
1435 if (is_matching_rule (rule))
1440 if (debugging_paice)
1441 fprintf (aa_stderr, " rules list '%c' is exhausted, stop.\n",
1446 /* Apply rule that matched */
1447 if (debugging_paice)
1448 fputs (" apply rule: ", aa_stderr);
/* A matching rule that removes nothing ends the stemming. */
1449 if (rule->remove_count == 0) {
1450 if (debugging_paice)
1451 fputs ("remove_count = 0, stop.\n", aa_stderr);
/* Cut remove_count chars off the end of the stem, then append the
 * rule's replacement string.
 */
/* NOTE(review): strcat() is unbounded; this assumes paicebuf has room
 * for apndstr after the truncation -- confirm against the buffer size.
 */
1455 paicebuf [paicelen - rule->remove_count] = 0;
1457 strcat ((char*)paicebuf, (char*)rule->apndstr);
1458 paicelen = strlen ((char*)paicebuf);
1459 word_is_intact = FALSE; /* we've removed at least 1 suffix */
1460 if (debugging_paice)
1461 fprintf (aa_stderr, "--> '%s'", paicebuf);
1463 /* Terminate algorithm if rule says so.
1464 * Otherwise continue removing suffixes
1465 * from this partially stemmed word.
1467 if (rule->is_last_rule) {
1468 if (debugging_paice)
1469 fputs (", stop flag is set, stop.\n", aa_stderr);
1472 if (debugging_paice)
1473 fputc ('\n', aa_stderr);
1475 } /* end Main Stemming Loop */
1477 if (debugging_paice) {
1478 fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
1481 return (char *) paicebuf;
1482 } /* paice_stemmer() */
1485 /************************************************/
1489 /************************************************/
1490 /* Stemmer that just copies and returns passed word.
1491 * In effect, the passed word IS its own stem.
1492 * Output buffer valid until next call to null_stemmer
/* null_stemmer: identity stemmer.  Copies at most DtSrMAXWIDTH_HWORD
 * bytes of 'word' into the file-scope buffer paicebuf and returns that
 * buffer.  paice_stemmer() writes the same buffer, so a later call to
 * either stemmer overwrites the result.
 */
1495 char *null_stemmer (char *word, DBLK *dblk)
/* strncpy() does not terminate the destination when 'word' is
 * DtSrMAXWIDTH_HWORD bytes or longer, hence the explicit terminator
 * stored in the last copied position.
 */
1501 strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD);
1502 paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
1503 return (char *) paicebuf;
1504 } /* null_stemmer() */
1507 /************************************************/
1511 /************************************************/
1512 /* Converts passed string to uppercase in place.
1513 * Classic strupr() function using teskey charmaps.
1515 static char *euro_lstrupr (char *string, DBLK *dblk)
1517 static int *charmap;
1519 charmap = dblk->charmap;
1520 for (s=(UCHAR *)string; *s; s++)
1521 *s = charmap[*s] & 0xff;
1526 /************************************************/
1530 /************************************************/
1531 /* Just returns passed string. Used where uppercase
1532 * conversions are not required for a language.
1534 char *null_lstrupr (char *s, DBLK *d)
1538 /************************************************/
1542 /************************************************/
1543 /* Loads a dblk with a specific language's
1544 * structures and function pointers.
1545 * Does not reload structures previously loaded in
1546 * other dblks on dblist if derived from identical files.
1547 * But always loads structures if passed dblist is NULL.
1548 * Presumes dblk already partially initialized with mandatory fields:
1549 * name, path, language.
1550 * May also be preinitialized with optional fields:
1551 * minwordsz, maxwordsz.
1552 * Returns TRUE if all successful.
1553 * Otherwise returns FALSE with err msgs on ausapi_msglist.
/* load_language: installs the charmap, parser, stemmer and lstrupr
 * function pointers in 'dblk' according to dblk->dbrec.or_language,
 * then loads that language's word lists.
 */
1555 int load_language (DBLK *dblk, DBLK *dblist)
1558 int language = dblk->dbrec.or_language;
1560 if (debugging_loadlang)
1562 "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
1563 (int)dblk->dbrec.or_language,
1564 language_name (dblk->dbrec.or_language),
1565 NULLORSTR(dblk->name));
1568 * Note: Load list functions must be called
1569 * AFTER charmap and lstrupr are loaded.
/* Single byte language setup: DtSrLaENG uses ascii_charmap, every other
 * language handled here uses latin_charmap.  Parsing, stemming and
 * uppercasing go through teskey_parser, paice_stemmer and euro_lstrupr.
 */
1578 dblk->charmap = (language == DtSrLaENG)?
1579 ascii_charmap : latin_charmap;
1580 dblk->parser = teskey_parser;
1581 dblk->stemmer = paice_stemmer;
1582 dblk->lstrupr = euro_lstrupr;
/* Word size limits are only filled in when still 0, so values preset
 * by the caller are kept.  DtSrLaDEU gets the larger MAXWIDTH_LWORD
 * based maximum.
 */
1583 if (dblk->dbrec.or_maxwordsz == 0)
1584 dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
1585 MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
1586 if (dblk->dbrec.or_minwordsz == 0)
1587 dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
/* The list loaders run last because they need the charmap and
 * lstrupr installed above.
 */
1589 if (!load_stop_list (dblk, dblist))
1591 if (!load_include_list (dblk, dblist))
1593 if (!load_paice_suffixes (dblk, dblist))
/* Japanese loading is delegated entirely to load_jpn_language(). */
1601 return load_jpn_language (dblk, dblist);
1604 /* Try loading a custom 'user' language.
1605 * If he failed to provide a loader function,
1606 * the dummy custom loader will tell him so.
1607 * If he provided one but it can't load this language,
1608 * it should return its own error msgs.
1610 return load_custom_language (dblk, dblist);
1612 } /* end switch (language) */
1615 } /* load_language() */
1618 /************************************************/
1620 /* unload_language */
1622 /************************************************/
1623 /* Frees storage for structures allocated by load_language().
1624 * Called when engine REINITs due to change in site config file
1626 * Duplicate wordtrees are not unloaded because they
1627 * will have already been unloaded in a previous dblk.
/* unload_language: releases the per-language structures hanging off
 * 'dblk', selected by dblk->dbrec.or_language.
 */
1629 void unload_language (DBLK *dblk)
1631 switch (dblk->dbrec.or_language) {
/* charmap is only cleared, never freed -- presumably because
 * load_language() points it at a shared table (ascii_charmap or
 * latin_charmap); confirm before changing.
 */
1638 dblk->charmap = NULL;
/* Stop list: freed only when LF_DUP_STP is clear.  A set flag
 * presumably marks a list shared with another dblk that frees it.
 */
1639 if ((dblk->lang_flags & LF_DUP_STP) == 0)
1640 free_wordtree (&dblk->stoplist);
1642 dblk->stoplist = NULL;
1643 dblk->lang_flags &= ~LF_DUP_STP;
/* Include list: same treatment, keyed on LF_DUP_INC. */
1645 if ((dblk->lang_flags & LF_DUP_INC) == 0)
1646 free_wordtree (&dblk->inclist);
1648 dblk->inclist = NULL;
1649 dblk->lang_flags &= ~LF_DUP_INC;
/* Paice suffix rules live at stem_extra: same treatment, keyed on
 * LF_DUP_SFX, but released with free_paice_rules().
 */
1651 if ((dblk->lang_flags & LF_DUP_SFX) == 0)
1652 free_paice_rules ((PRULE***)&dblk->stem_extra);
1654 dblk->stem_extra = NULL;
1655 dblk->lang_flags &= ~LF_DUP_SFX;
/* Japanese and custom languages are released by their own unloaders. */
1661 unload_jpn_language (dblk);
1665 unload_custom_language (dblk);
1669 } /* unload_language() */
1670 /******************** LANG.C ********************/