2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these libraries and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
24 * COMPONENT_NAME: austext
26 * FUNCTIONS: euro_lstrupr
45 * (C) COPYRIGHT International Business Machines Corp. 1995,1996
47 * Licensed Materials - Property of IBM
48 * US Government Users Restricted Rights - Use, duplication or
49 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
51 /******************** LANG.C ********************
52 * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
54 * Includes load_language(), unload_language(), and functions and data for
55 * parsing and stemming European languages in DtSearch/AusText.
56 * Incorporates p/o socrates.c, p/o proctext.c, parser.c
57 * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
58 * Related to similar semantic modules repackaged into semantic.c.
59 * Paice suffix removal algorithm from C. Paice, 1990,
60 * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
63 * Revision 2.13 1996/03/25 18:55:26 miker
64 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
66 * Revision 2.12 1996/03/25 17:00:19 miker
67 * Cleanup compiler warning.
69 * Revision 2.11 1996/03/13 22:58:13 miker
70 * Changed char to UCHAR several places.
72 * Revision 2.10 1996/03/05 16:49:58 miker
73 * Move COMMENT_CHARS to SearchP.h.
75 * Revision 2.9 1996/03/05 16:31:20 miker
76 * Added test of PA_MSGS for yacc-based boolean queries.
77 * Made comment chars in linguistic files independent of locale.
78 * Changed several char ptrs to unsigned char so parser will
79 * work when compiled under default signed char compilers.
80 * Simplified several statements with LHS *var++ for same reason.
82 * Revision 2.8 1996/02/05 16:16:05 miker
85 * Revision 2.7 1996/02/05 16:10:54 miker
86 * load_paice_suffixes: discard .sfx lines beginning with all numeric
87 * first token for compatibility with older file formats.
89 * Revision 2.6 1996/02/01 19:11:43 miker
90 * AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
91 * Moved charmaps to new module langmap.c. Removed hard coded
92 * paice stemmer values--now dynamic from .sfx file.
94 * Revision 2.5 1995/10/26 14:55:28 miker
97 * Revision 2.4 1995/10/19 20:54:36 miker
98 * Increased msg buf sizes to accommodate larger database file names.
100 * Revision 2.3 1995/10/06 14:39:45 miker
101 * Bug fix: coredump loading multiple databases
104 * Revision 2.2 1995/10/03 21:39:10 miker
105 * Changed teskey_parser, paice_stemmer, and null_stemmer
106 * to return number of words parsed/stemmed, not just boolean.
108 * Revision 2.1 1995/09/22 21:00:19 miker
109 * Freeze DtSearch 0.1, AusText 2.1.8
111 * Revision 1.3 1995/09/19 22:08:28 miker
112 * Added support for loading and parsing Japanese language DtSrLaJPN.
114 * Revision 1.2 1995/09/05 21:34:52 miker
115 * Fixed bug: search engine wouldn't parse words of exactly
118 * Revision 1.1 1995/08/31 21:03:44 miker
127 #include <sys/stat.h>
129 #define X_INCLUDE_STRING_H
130 #define XOS_USE_NO_LOCKING
131 #include <X11/Xos_r.h>
133 #define PROGNAME "LANG"
134 #define EXT_SUFFIX ".sfx" /* standard paice suffix file format */
135 #define OUTBUFSZ 6140
136 #define SFX_DELIMS " \t\n"
139 #define IS_VOWEL(c) ((paice_charmap [(UCHAR)c] & VOWEL) != 0)
141 /************************************************/
145 /************************************************/
146 /* List of Paice suffix removal rules from .sfx files */
/* One Paice suffix-removal rule, as parsed from a single .sfx line by
 * load_paice_suffixes(). Rules are chained through 'link' into the list
 * headed at rules_table[suffix[0]], i.e. the list is selected by the
 * first byte of the reversed suffix string.
 */
147 typedef struct prule_t {
148 struct prule_t *link; /* Ptr to next list node */
149 UCHAR *suffix; /* Applicable suffix string, backwards (strdup'ed copy) */
150 UCHAR suflen; /* Length of suffix (strlen result narrowed to UCHAR) */
151 char must_be_intact; /* Optional '*'. Rule only applies
153 UCHAR remove_count; /* Number of suffix chars to remove */
154 UCHAR aplen; /* Length of apndstr (strlen result narrowed to UCHAR) */
155 UCHAR *apndstr; /* Optional append string (strdup'ed copy) */
156 char is_last_rule; /* '$' terminate or '>' continue algorithm */
160 /************************************************/
164 /************************************************/
/* Trace switches. Where they are tested in this file, a nonzero value
 * makes the routine print diagnostics to aa_stderr. They have external
 * linkage, so other modules can presumably set them -- confirm.
 */
165 int debugging_loadlang = FALSE;
166 int debugging_loadword = FALSE;
167 int debugging_search_wordtree = FALSE;
168 int debugging_teskey = FALSE;
169 int debugging_paice = FALSE;
/* Character attribute map consulted by the IS_VOWEL() macro. */
170 static int *paice_charmap;
/* Work buffer holding the stem examined by the paice stemmer routines. */
171 static UCHAR paicebuf [DtSrMAXWIDTH_HWORD + 2];
/* Presumably records whether the word in paicebuf is still unmodified
 * (cf. PRULE.must_be_intact) -- TODO confirm against the stemmer code.
 */
173 static int word_is_intact;
175 /* Language strings correspond to DtSrLa.. constants. */
/* File-name stems for the language-default linguistic files. The table is
 * indexed by dblk->dbrec.or_language when load_stop_list() and
 * load_paice_suffixes() build their fallback file names.
 * NOTE(review): those lookups do not range-check the language number,
 * while language_name() allows for values above DtSrLaLAST (custom
 * languages) -- confirm such values can never reach the lookups.
 */
176 static char *lang_fnames[] = {
178 "eng", /* 1 ('eng2' same files as 'eng') */
184 "jpn", /* 7 ('jpn2' same files as 'jpn') */
189 /************************************************/
193 /************************************************/
194 /* Returns language name string given language number */
/* Maps a DtSrLa.. language number to a printable name; in this file it is
 * used by the debugging_loadlang traces. Values above DtSrLaLAST yield
 * "(Custom Language)", otherwise the result is language_names[langno].
 * NOTE(review): two definitions of language_names appear below; presumably
 * they are alternatives selected by a preprocessor conditional -- confirm.
 */
195 static char *language_name (DtSrINT16 langno)
197 static char *language_names[] = {
198 "English-ASCII", /* 0 = DtSrLaENG */
199 "English-Latin1", /* 1 = DtSrLaENG2 */
200 "Spanish", /* 2 = DtSrLaESP */
201 "French", /* 3 = DtSrLaFRA */
202 "Italian", /* 4 = DtSrLaITA */
203 "German", /* 5 = DtSrLaDEU */
/* NOTE(review): no comma follows "Japanese-comp", so the compiler
 * concatenates it with "Japanese-.knj" into one element. The table then
 * has only 7 entries and index 7 (DtSrLaJPN2) is out of bounds.
 */
204 "Japanese-comp" /* 6 = DtSrLaJPN */
205 "Japanese-.knj" /* 7 = DtSrLaJPN2 */
/* Message-catalog variant of the table; entries are assigned on each call. */
208 static char *language_names[10];
210 language_names[0] = catgets(dtsearch_catd, MS_lang, 50, "English-ASCII");
211 language_names[1] = catgets(dtsearch_catd, MS_lang, 51, "English-Latin1");
/* NOTE(review): every assignment below stores into index 1. Entries 2..7
 * are therefore never set and remain NULL (static storage), and index 1
 * finally holds the Japanese-.knj string, so callers that print the result
 * with %s may receive NULL or the wrong name. Message number 54 is also
 * reused for four different default strings. The intended indices are
 * presumably 2 through 7 -- confirm the message numbers against the
 * message catalog source before changing them.
 */
212 language_names[1] = catgets(dtsearch_catd, MS_lang, 52, "Spanish");
213 language_names[1] = catgets(dtsearch_catd, MS_lang, 53, "French");
214 language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "Italian");
215 language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "German");
216 language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "Japanese-comp");
217 language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "Japanese-.knj");
222 else if (langno > DtSrLaLAST)
223 return "(Custom Language)";
225 return language_names [langno];
226 } /* language_name() */
229 /************************************************/
231 /* search_wordtree */
233 /************************************************/
235 * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
236 * Searches a word list in a binary WORDTREE.
237 * Passed wordstring is presumed to be a clean,
238 * uppercase word token string terminated by \0.
239 * Variables are static for speeeeed.
240 * Returns TRUE if successful search, else FALSE.
241 * See also search_wordtree_jpn() in jpn.c
/* Binary-tree lookup of 'wordstring' in 'wordtree'.
 *   wordtree   - root node; with a NULL root the search loop never runs.
 *   wordstring - NUL-terminated word, compared with strcmp() against
 *                node->word at each node visited.
 * When debugging_search_wordtree is set, the path taken ('L' or 'R' per
 * node, then HIT or MISS) is traced to aa_stderr.
 * The loop variables are static locals, i.e. shared by all calls.
 */
243 static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
245 static int direction;
246 static WORDTREE *node;
249 if (debugging_search_wordtree)
250 fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
252 /* MAIN SEARCH LOOP: binary tree search */
253 for (node = wordtree; node != NULL; ) {
254 if ((direction = strcmp ((char *) wordstring, node->word)) == 0) {
255 if (debugging_search_wordtree)
256 fprintf (aa_stderr, " HIT!\n");
259 /* Descend left or right depending on word */
260 if (debugging_search_wordtree)
261 fprintf (aa_stderr, " %c '%s'\n",
262 (direction < 0) ? 'L' : 'R', node->word);
268 if (debugging_search_wordtree)
269 fprintf (aa_stderr, " MISS.\n");
271 } /* search_wordtree() */
274 /************************************************/
278 /************************************************/
280 * Teskey_parser() is derived from the former Socrates() in socrates.c.
281 * Returns next teskey-parsed word token from a character stream.
282 * Called from (1) dtsrindex, where readchar_ftext() cofunction
283 * reads the .fzk file document 'stream', or (2) search engine
284 * query parsers, where readchar_string() cofunction 'reads'
285 * from the query string.
286 * (The word hiliting parser does not directly call teskey_parser; it has
287 * its own simplified equivalent to the parsing algorithms herein.)
289 * First call passes args in PARG structure. This resets end of
290 * text block (ETX) flag, resets 'offset' counter to zero, etc.
291 * Subsequent calls should pass NULL, and parser returns
292 * next token in block, until reader cofunction reads ETX,
293 * ie special ETX char ('\0'). Subsequent calls to parser
294 * return NULL meaning "no tokens left in current stream".
295 * Reader cofunctions tolerate repeated calls after
296 * the first ETX, still returning '\0'.
298 * This parser presumes all incoming text is unformatted.
299 * Since parser accesses streams a char at a time it does
300 * not require periodic line feeds or anything else.
302 * Parser also returns offset information: number of bytes
303 * since beginning of text block.
305 * Variables are static for speeeeeeed.
307 * OUTPUT FORMAT: NULL or a static C string containing a single
308 * parsed word token. Word buffer reused at next call.
309 * Each word is translated as follows:
310 * All alphas TO UPPERCASE.
311 * Teskey algorithm used to find word boundaries.
312 * Always keeps include-list words.
313 * Throws away stoplist words, very short words, and very long words.
314 * All intervening nonconcordables discarded.
316 * There is a slight mod to the published Teskey algorithm.
317 * Words can begin with optionally concordable chars
318 * but not end with them. For example if '-' is optionally
319 * concordable, '-foo-' will be parsed into '-foo'.
/* Parameters and results as used in this function:
 *   parg - on the first call for a text block it supplies the offsetp
 *          output pointer, the PA_HILITING and PA_MSGS flags, and either
 *          parg->string or parg->ftext as the character source.
 *   Returns the static output buffer holding the accepted token, after
 *   storing the token's offset through offsetp.
 * All parser state is kept in static locals, so the state of one text
 * block is shared by every caller.
 */
321 char *teskey_parser (PARG *parg)
323 static READCFP cofunction;
324 static void *cofunction_arg;
325 static DBLK *dblk = NULL;
326 static UCHAR *outbuf = NULL;
327 static size_t outbufsz = 0;
328 static UCHAR *endmaxword; /* end largest possible output word */
329 static UCHAR *outp; /* next loc in outbuf */
330 static char *begw; /* beginning of a word in the input buffer */
331 static char *endw; /* end of a word in the input buffer */
333 static int minwordsz, maxwordsz;
335 static enum {BETW_WORDS, IN_WORD, TOO_LONG}
337 static long *offsetp, readcount, candidate_offset;
338 static int is_hiliting;
341 /* If first call for current text block... */
344 minwordsz = dblk->dbrec.or_minwordsz;
345 maxwordsz = dblk->dbrec.or_maxwordsz;
346 charmap = dblk->charmap;
347 offsetp = parg->offsetp;
348 is_hiliting = (parg->flags & PA_HILITING);
349 add_msgs = (parg->flags & PA_MSGS);
350 if (charmap == NULL) {
351 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 4,
352 "%s dblk not initialized.\n"),
358 cofunction_arg = parg->string;
359 cofunction = (READCFP) readchar_string;
361 else if (parg->ftext) {
362 cofunction_arg = parg;
363 cofunction = (READCFP) readchar_ftext;
366 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 5,
367 "%s Program Error: parg contains neither file nor string.\n"),
372 if (outbufsz <= maxwordsz) {
375 outbufsz = maxwordsz + 8;
376 outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
378 endmaxword = outbuf + maxwordsz;
/* NOTE(review): the trace below formats maxwordsz (int) and outbufsz
 * (size_t) with %ld; the conversions do not match the argument types.
 */
379 if (debugging_teskey)
381 "teskey: start of text block, maxwsz=%ld outbufsz=%ld\n",
382 maxwordsz, outbufsz);
386 /* CANDIDATE WORD LOOP: Read text chars into outbuf.
387 * Exit loop when outbuf contains one candidate token or at ETX.
391 tpstate = BETW_WORDS;
392 while (*outp = cofunction (cofunction_arg)) {
394 cofunction_arg = NULL;
396 /*------------- BETW_WORDS State ------------
397 * Reader is between word tokens.
399 if (tpstate == BETW_WORDS) {
401 * Discard nonconcordable chars between words.
403 if ((charmap[*outp] & NON_CONCORD) != 0)
406 * Fully concordable char is definite start of new word.
407 * Convert to uppercase and go get next char.
409 if ((charmap[*outp] & CONCORDABLE) != 0) {
410 *outp = charmap[*outp] & 0x00ff;
412 candidate_offset = readcount;
417 * Must be optionally concordable. It can only
418 * start a new word if next char is concordable.
419 * If so, convert a fully concordable char
420 * to uppercase and go get next char.
421 * Otherwise discard just like non_concord.
424 if (*outp = cofunction(NULL))
426 if ((charmap[*outp] & CONCORDABLE) != 0) {
427 *outp = charmap[*outp] & 0x00ff;
429 candidate_offset = readcount - 1;
437 } /* endif BETW_WORDS */
440 /*------------- IN_WORD State ------------
441 * Reader is in middle of a word.
442 * Convert all concordables to uppercase and append.
443 * Terminate word at first non_concord.
444 * Non_concords treatment depends on next char.
446 else if (tpstate == IN_WORD) {
447 if ((charmap[*outp] & CONCORDABLE) != 0) {
448 if (outp < endmaxword) {
449 *outp = charmap[*outp] & 0x00ff;
454 if (debugging_teskey)
456 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
457 candidate_offset-1, outbuf);
459 char msgbuf [DtSrMAXWIDTH_HWORD + 100];
460 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 8,
461 "%s '%.*s...' is larger\n"
462 "than the maximum word size of database '%s'.") ,
463 PROGNAME"449", maxwordsz,
464 parg->string, dblk->label);
465 DtSearchAddMessage (msgbuf);
473 if ((charmap[*outp] & NON_CONCORD) != 0) {
477 /* Must be opt_concord... */
479 if (*outp = cofunction(NULL))
481 if ((charmap[*outp] & CONCORDABLE) != 0) {
482 if (outp < endmaxword) {
483 *outp = charmap[*outp] & 0x00ff; /* uppercase */
488 if (debugging_teskey)
490 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
491 candidate_offset-1, outbuf);
497 else { /* next char NOT concordable...*/
501 } /* endif IN_WORD */
504 /*------------- TOO_LONG State ------------
505 * Reader is in middle of a word that exceeds max word size.
506 * Discard all concordables and opt_concords until we
507 * can get between words again with a clear non_concord.
509 else if (tpstate == TOO_LONG) {
510 if ((charmap[*outp] & NON_CONCORD) != 0) {
512 tpstate = BETW_WORDS;
517 /*------------- UNKNOWN State ------------*/
519 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 10,
520 "%s Program Error: Unknown parser state.\n"),
524 } /* end read loop for next CANDIDATE WORD */
526 /*---------- TEST FOR ETX -------------*/
527 if (outbuf[0] == 0) {
528 if (debugging_teskey)
529 fprintf (aa_stderr, "teskey: etx\n");
/* NOTE(review): parg is dereferenced by the user messages in this function,
 * but the header comment says continuation calls pass parg == NULL --
 * confirm that the message path (add_msgs) is only taken while parg is
 * valid.
 */
532 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 12,
533 "%s '%.120s' is not a valid word in database '%s'.") ,
534 PROGNAME"506", parg->string, dblk->label);
535 DtSearchAddMessage (msgbuf);
540 wordlen = strlen ((char *) outbuf);
541 candidate_offset--; /* token offset is one less than number of reads */
542 if (debugging_teskey)
543 fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
544 candidate_offset, outbuf);
547 if (debugging_teskey)
548 fprintf (aa_stderr, ", (hiliting, skip tree searches)");
552 /*--------- INCLUDE LIST ----------
553 * Search before testing for stoplist or minimum word length.
555 if (dblk->inclist != NULL) {
556 if (search_wordtree (dblk->inclist, outbuf)) {
557 if (debugging_teskey)
558 fprintf (aa_stderr, ", (INCLUDE LIST)");
563 /*--------- TOO SHORT -----------*/
564 if (wordlen < minwordsz) {
565 if (debugging_teskey)
566 fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
/* NOTE(review): unlike the '%.120s' used for the ETX message, this message
 * and the stop-list message below copy parg->string with an unbounded
 * '%s' -- confirm msgbuf is large enough for any query string.
 */
569 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 17,
570 "%s '%s' is less than the\n"
571 "minimum word size of database '%s'.") ,
572 PROGNAME"543", parg->string, dblk->label);
573 DtSearchAddMessage (msgbuf);
576 goto READ_ANOTHER_WORD;
579 /*----------- STOP LIST -------------*/
580 if (dblk->stoplist != NULL) {
581 if (search_wordtree (dblk->stoplist, outbuf)) {
582 if (debugging_teskey)
583 fprintf (aa_stderr, ", (STOP LIST)\n");
586 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 19,
587 "%s The word '%s' is not indexed in database '%s'.") ,
588 PROGNAME"558", parg->string, dblk->label);
589 DtSearchAddMessage (msgbuf);
592 goto READ_ANOTHER_WORD;
597 /* Word is correctly parsed and passes all dblk filters. */
598 if (debugging_teskey)
599 fprintf (aa_stderr, ", ...good word\n");
601 *offsetp = candidate_offset;
602 return (char *) outbuf;
603 } /* teskey_parser() */
606 /************************************************/
610 /************************************************/
611 /* Verifies passed word token is teskey-concordable
612 * in code page of passed charmap. Used in validating
613 * word files. Returns TRUE if all chars concordable
614 * or optionally concordable, else returns FALSE.
 * word    - NUL-terminated token; each byte is read as an unsigned char
 *           and used as an index into charmap.
 * charmap - per-byte attribute table; a byte whose entry has the
 *           NON_CONCORD bit set makes the word non-concordable.
616 int is_concordable (char *word, int *charmap)
619 for (cptr = (UCHAR *)word; *cptr != 0; cptr++)
620 if ((charmap[*cptr] & NON_CONCORD) != 0)
623 } /* is_concordable() */
626 /************************************************/
630 /************************************************/
631 /* Called by load_stop_list(), load_include_list(), etc,
632 * to read an appropriate word list file into binary tree structures.
634 * INPUT FILE FORMAT: One word per line, all chars teskey concordable.
635 * Preferred order is frequency of occurrence in the corpus
636 * to make searches efficient. Otherwise the words should at least
637 * be in random order or an order that will approximate a binary search.
638 * If first char is any of COMMENT_CHARS, line is ignored as comments.
639 * Ascii spaces, tabs, or newline delimits the first word token--
640 * anything else on the line is ignored as comments.
641 * Optionally characters in word token will be checked for teskey
644 * RETURNS 0 if file successfully loaded, returns 1 if file missing,
645 * returns 2 and messages in global msglist if file has fatal errors.
 * Callers in this file pass the address of the tree root (&dblk->inclist
 * or &dblk->stoplist), the dblk, the file name, and TRUE; the last
 * argument presumably requests the is_concordable() check -- confirm.
 * Each token is uppercased through dblk->lstrupr() before insertion.
 * Every node is obtained with a single austext_malloc() call and the
 * word text is stored directly behind the WORDTREE header (new + 1),
 * which is the layout free_wordtree() depends on.
660 char sprintbuf [_POSIX_PATH_MAX + 1024];
663 WORDTREE **this_link;
664 _Xstrtokparams strtok_buf;
666 if (debugging_loadlang)
667 fprintf (aa_stderr, PROGNAME"1071 "
668 "load_wordtree: db=%s fname='%s'\n",
669 NULLORSTR(dblk->name), NULLORSTR(fname));
671 if ((fileid = fopen (fname, "rt")) == NULL) {
672 /* Not being able to find the file is not an error.
673 * We indicate that with the return code.
674 * But any other error (like permissions) is fatal.
676 if (errno == ENOENT) {
677 if (debugging_loadlang)
678 fputs (" ...file not found.\n", aa_stderr);
683 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
684 PROGNAME"362", fname, strerror(errno));
685 DtSearchAddMessage (sprintbuf);
690 /*--------- Main Read Loop ----------*/
692 while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
695 * Ignore comment lines beginning with punctuation char.
696 * Ignore empty lines (strtok returns NULL, no tokens).
697 * Otherwise first or only word on line is the desired word.
699 if (strchr (COMMENT_CHARS, readbuf[0]))
701 if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
703 dblk->lstrupr (token, dblk);
705 if (debugging_loadword)
706 fprintf (aa_stderr, " WORD: '%s' ", token);
708 /* If requested confirm all chars are teskey-concordable. */
710 if (!is_concordable (token, dblk->charmap)) {
711 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 400,
712 "%s: %s, line %ld: Invalid chars in word '%s'."),
713 PROGNAME"400", fname, linecount, token);
714 DtSearchAddMessage (sprintbuf);
719 /* Unless we've already detected some errors,
720 * allocate a new node and load its data fields.
725 new = austext_malloc (sizeof(WORDTREE) + i + 4,
726 PROGNAME"104", NULL);
730 new->word = (void *) (new + 1);
731 strcpy (new->word, token);
733 /* Descend binary tree and insert in correct alphabetical place */
734 is_duplicate = FALSE;
735 for (this_link = treetop; *this_link != NULL; ) {
736 i = strcmp (new->word, (*this_link)->word);
738 /* test for duplicate word */
740 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 423,
741 "%s Word '%s' in '%s' is a duplicate."),
742 PROGNAME"423", token, fname);
743 DtSearchAddMessage (sprintbuf);
744 /* duplicates aren't fatal, just ignore the word */
746 break; /* no point in continuing descent */
749 /* Descend tree to find correct insertion point */
750 if (debugging_loadword)
751 fputc(((i < 0)? 'L' : 'R'), aa_stderr);
752 this_link = (WORDTREE **) ((i < 0) ?
753 &(*this_link)->llink : &(*this_link)->rlink);
754 } /* end forloop to find tree insertion point */
756 /* Don't link anything if error found while descending tree */
758 if (debugging_loadword)
759 fputs (" duplicate!\n", aa_stderr);
764 /* Insert new node at current location in tree */
766 if (debugging_loadword)
767 fputs(" .\n", aa_stderr);
768 } /* end of read loop */
773 if (debugging_loadlang)
775 PROGNAME"1186 load word file '%s' failed.\n", fname);
779 if (debugging_loadlang)
781 PROGNAME"1193 load word file '%s' successful.\n", fname);
784 } /* load_wordtree() */
787 /************************************************/
791 /************************************************/
792 /* Formerly free_bintree() in msgutil.c.
793 * Frees storage for all nodes in a WORDTREE and
794 * sets its top-of-list pointer to NULL.
795 * Works only for node structures where all memory
796 * was allocated in a single call to malloc().
797 * Uses link inversion traversal (eg, Data Structure Techniques,
798 * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
799 * at preorder visit, and node is freed at postorder visit.
 * During the traversal the 'word' member of each node is overwritten
 * with the TAG value (0 or 1). This loses nothing because the word
 * text lives inside the node's own allocation (see load_wordtree()).
 * wordtree_head - address of the tree's root pointer. The function
 *                 tests for a NULL root first, and stores NULL in the
 *                 root pointer when the traversal has finished.
801 static void free_wordtree (WORDTREE ** wordtree_head)
804 WORDTREE *prev = NULL;
805 WORDTREE *pres = *wordtree_head;
807 if (*wordtree_head == NULL)
811 pres->word = (void *) 0; /* preorder visit: TAG = 0 */
822 pres->word = (void *) 1; /* TAG = 1 */
830 if (prev == NULL) { /* end of algorithm? */
831 *wordtree_head = NULL;
834 if (prev->word == (void *) 0) { /* go up left leg */
840 else { /* go up right leg */
842 prev->word = (void *) 0; /* restore TAG = 0 */
845 goto POSTORDER_VISIT;
847 } /* free_wordtree() */
850 /************************************************/
852 /* load_include_list */
854 /************************************************/
855 /* Builds include list by reading include file
856 * into a binary tree structure.
857 * Unlike stoplists, include-lists are optional.
858 * Also unlike stoplists, there are no language default include-lists.
859 * 'dblist' may be NULL.
860 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 * NOTE(review): dblk->fname_inc can end up pointing at a string literal
 * (""), at austext_malloc() storage, or at the caller's own string, so
 * code that releases it must know which case applies.
 * NOTE(review): sprintbuf is only 512 bytes here, whereas
 * load_stop_list() sizes its buffer as _POSIX_PATH_MAX + 512 for the
 * same kind of message containing a file name -- confirm a long
 * fname_inc cannot overflow it.
862 static int load_include_list (DBLK *dblk, DBLK *dblist)
865 int filename_was_null = (dblk->fname_inc == NULL);
867 char sprintbuf [512];
869 dblk->inclist = NULL; /* just to be sure */
871 if (debugging_loadlang)
873 PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
874 NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
875 language_name(dblk->dbrec.or_language));
877 /* If file name not provided, generate one based on
878 * dblk's path, database name, and default extension.
 * NOTE(review): the allocation leaves 36 bytes beyond the path for
 * the slash, dblk->name, EXT_INCLIST and the terminator -- confirm
 * the maximum length of dblk->name.
880 if (filename_was_null) {
881 if (dblk->name[0] == 0) {
882 dblk->fname_inc = "";
883 dblk->inclist = NULL;
884 if (debugging_loadlang)
885 fprintf (aa_stderr, PROGNAME"1339 "
886 "No inclist because neither fname nor dbname provided.\n");
889 if (dblk->path == NULL)
890 dblk->path = strdup("");
891 dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
892 PROGNAME"1187", NULL);
893 strcpy (dblk->fname_inc, dblk->path);
894 ensure_end_slash (dblk->fname_inc);
895 strcat (dblk->fname_inc, dblk->name);
896 strcat (dblk->fname_inc, EXT_INCLIST);
898 if (debugging_loadlang)
900 PROGNAME"1350 Include list file name = '%s'.\n",
903 /* Don't reload the same file if it's already
904 * been loaded into a previous dblk in a list.
905 * Code works just fine if dblist == NULL.
907 for (db = dblist; db != NULL; db = db->link) {
908 if (db == dblk || db->fname_inc == NULL)
910 if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
911 dblk->inclist = db->inclist;
912 dblk->lang_flags |= LF_DUP_INC;
913 if (debugging_loadlang)
914 fprintf (aa_stderr, PROGNAME"1363 "
915 "Using previously loaded inclist, db='%s'.\n",
921 /* Include list is optional so missing file is
922 * not an error unless caller named a specific file.
 * NOTE(review): verify that the name buffer allocated above is
 * released before fname_inc is replaced by "" below.
924 i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
930 if (filename_was_null) {
931 dblk->fname_inc = "";
932 dblk->inclist = NULL;
937 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
938 PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
939 DtSearchAddMessage (sprintbuf);
946 } /* load_include_list() */
949 /************************************************/
953 /************************************************/
954 /* Builds stoplist by reading stoplist file into a
955 * binary tree structure. File name can be
956 * (1) passed in dblk.fname_stp,
957 * (2) generated from dblk path, name, and '.stp',
958 * (3) default for dblk path, language, and '.stp'.
959 * 'dblist' may be NULL.
960 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 * If another dblk in dblist already uses the same file name, its tree
 * is shared and LF_DUP_STP is set in dblk->lang_flags.
962 static int load_stop_list (DBLK *dblk, DBLK *dblist)
966 char sprintbuf [_POSIX_PATH_MAX + 512];
969 dblk->stoplist = NULL; /* just to be sure */
971 if (debugging_loadlang)
973 PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
974 NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
975 language_name(dblk->dbrec.or_language));
977 /* If file name not provided, generate one based on
978 * dblk's path, database name, and default extension.
979 * And if that doesn't work, generate one based on
980 * dblk's path, language, and default extension.
 * NOTE(review): errno is examined after stat() without checking the
 * return value of stat(). errno is only meaningful when stat()
 * returns -1, so a stale ENOENT left by an earlier call can trigger
 * the language-default fallback even though the file exists.
 * NOTE(review): lang_fnames[] is indexed by or_language without a
 * range check -- confirm custom language numbers cannot reach here.
982 if (dblk->fname_stp == NULL) {
983 if (dblk->path == NULL)
984 dblk->path = strdup("");
985 dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
986 PROGNAME"919", NULL);
988 strcpy (dblk->fname_stp, dblk->path);
989 ensure_end_slash (dblk->fname_stp);
990 strcat (dblk->fname_stp, dblk->name);
991 strcat (dblk->fname_stp, EXT_STOPLIST);
993 stat (dblk->fname_stp, &statbuf);
994 if (errno == ENOENT) {
995 strcpy (dblk->fname_stp, dblk->path);
996 ensure_end_slash (dblk->fname_stp);
997 strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
998 strcat (dblk->fname_stp, EXT_STOPLIST);
1001 if (debugging_loadlang)
1003 PROGNAME"1448 Stoplist file name = '%s'.\n",
1006 /* Don't reload the same file if it's already
1007 * been loaded into a previous dblk in a list.
1008 * Code works just fine if dblist == NULL.
1010 for (db = dblist; db != NULL; db = db->link) {
1011 if (db == dblk || db->fname_stp == NULL)
1013 if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
1014 dblk->stoplist = db->stoplist;
1015 dblk->lang_flags |= LF_DUP_STP;
1016 if (debugging_loadlang)
1017 fprintf (aa_stderr, PROGNAME"1460 "
1018 "Using previously loaded stoplist, db='%s'.\n",
1024 /* Stop lists are mandatory--a missing stoplist is fatal. */
1025 i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
1028 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
1029 PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
1030 DtSearchAddMessage (sprintbuf);
1033 } /* load_stop_list() */
1036 /************************************************/
1038 /* free_paice_rules */
1040 /************************************************/
1041 /* Frees all allocated storage for a set of paice rules, typically
1042 * loaded at dblk.stem_extra. Called by REINIT routines and
1043 * by load_paice_suffixes() when cleaning up after an error.
 * rules_table_ptr - address of the pointer to the 256-entry table of
 *                   rule-list heads. The function tests for a NULL
 *                   table first, walks all 256 slots, and finally
 *                   stores NULL in the caller's table pointer.
1045 static void free_paice_rules (PRULE ***rules_table_ptr)
1049 PRULE **rules_table;
1051 if (*rules_table_ptr == NULL)
1053 rules_table = *rules_table_ptr;
1054 for (i=0; i<256; i++) {
1055 if (rules_table[i] == NULL)
1068 *rules_table_ptr = NULL;
1070 } /* free_paice_rules() */
1073 /************************************************/
1075 /* load_paice_suffixes */
1077 /************************************************/
1078 /* Loads European language paice stemmer suffix rules
1079 * into dblk.stem_extra as an array of ptrs to linked lists.
1080 * Like stop lists, sfx files can be
1081 * (1) passed in dblk.fname_sfx,
1082 * (2) generated from dblk path, dbname, and '.sfx',
1083 * (3) generated from dblk path, language, and '.sfx'.
1084 * Internal tables will be reused if file previously loaded.
1085 * Only uses single byte character sets (ascii, iso-latin-1).
1086 * Uses _XStrtok(). dblk->charmap must already be loaded.
1087 * Will continue to parse entire file even if errors are found.
1088 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 * Rule line layout, as parsed below (tokens separated by SFX_DELIMS):
 *   suffix  ['*']  remove-count  [append-string]  '$' or '>'
 * NOTE(review): when fopen() fails, fname_sfx is reset to NULL; if the
 * name was generated by this function, verify that the austext_malloc()
 * buffer is not leaked.
1090 static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
1095 PRULE *prule, **prule_link;
1096 PRULE **rules_table;
1097 struct stat statbuf;
1098 UCHAR *cptr, *token;
1099 char readbuf [_POSIX_PATH_MAX + 1024];
1100 char msgbuf [_POSIX_PATH_MAX + 1024];
1101 UCHAR *suffix, *apndstr;
1102 int must_be_intact, is_last_rule;
1104 int lineno, errcount;
1105 _Xstrtokparams strtok_buf;
1107 dblk->stem_extra = NULL; /* just to be sure */
1110 if (debugging_loadlang)
1112 PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
1113 NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
1114 language_name(dblk->dbrec.or_language));
1116 /* If file name not provided, generate one based on
1117 * dblk's path, database name, and default extension.
1118 * And if that doesn't work, generate one based on
1119 * dblk's path, language, and default extension.
 * NOTE(review): errno is examined after stat() without checking the
 * return value of stat(). errno is only meaningful when stat()
 * returns -1, so a stale ENOENT left by an earlier call can trigger
 * the language-default fallback even though the file exists.
 * NOTE(review): lang_fnames[] is indexed by or_language without a
 * range check -- confirm custom language numbers cannot reach here.
1121 if (dblk->fname_sfx == NULL) {
1122 if (dblk->path == NULL)
1123 dblk->path = strdup("");
1124 dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
1125 PROGNAME"1113", NULL);
1127 strcpy (dblk->fname_sfx, dblk->path);
1128 ensure_end_slash (dblk->fname_sfx);
1129 strcat (dblk->fname_sfx, dblk->name);
1130 strcat (dblk->fname_sfx, EXT_SUFFIX);
1132 stat (dblk->fname_sfx, &statbuf);
1133 if (errno == ENOENT) {
1134 strcpy (dblk->fname_sfx, dblk->path);
1135 ensure_end_slash (dblk->fname_sfx);
1136 strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
1137 strcat (dblk->fname_sfx, EXT_SUFFIX);
1140 if (debugging_loadlang)
1142 PROGNAME"1740 Paice suffix file name = '%s'.\n",
1145 /* Don't reload the same file if it's already
1146 * been loaded into a previous dblk in a list,
1147 * but flag it so it won't be freed at unload_language/REINIT.
1148 * Code works just fine if dblist == NULL.
1150 for (db = dblist; db != NULL; db = db->link) {
1151 if (db == dblk || db->fname_sfx == NULL)
1153 if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
1154 dblk->stem_extra = db->stem_extra;
1155 dblk->lang_flags |= LF_DUP_SFX;
1156 if (debugging_loadlang)
1157 fprintf (aa_stderr, PROGNAME"1145 "
1158 "Using previously loaded suffixes, db='%s'.\n",
1164 fp = fopen (dblk->fname_sfx, "rt");
1167 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
1168 PROGNAME"181", dblk->fname_sfx, strerror(errno));
1169 DtSearchAddMessage (msgbuf);
1170 dblk->fname_sfx = NULL;
1174 /* Rules table will eventually be loaded at dblk.stem_extra.
1175 * It consists of 256 PRULE ptrs,
1176 * one for each possible single byte char.
1177 * Each ptr is the head of a rules list for that char.
1179 rules_table = austext_malloc (256 * sizeof(PRULE*),
1180 PROGNAME"199", &ausapi_msglist);
1181 memset (rules_table, 0, 256 * sizeof(PRULE*));
1185 /*------- Main Read Loop -------*/
1186 while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
1189 /* Ignore comment lines */
1190 if (strchr (COMMENT_CHARS, readbuf[0]))
1193 /* TOKEN #1: suffix string, backwards, all uppercase.
1194 * If missing, ignore 'empty' line.
1195 * If the first token is all numeric, ignore line
1196 * (for compatibility with older versions of file).
 * NOTE(review): the scan loop below tests the pointer 'cptr', which
 * is never NULL, instead of the character it points at. The loop
 * only stops at the terminating NUL because the charmap entry for
 * byte 0 presumably lacks the NUMERAL bit -- confirm, or test the
 * dereferenced character explicitly.
1198 if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
1201 for (cptr = suffix; cptr; cptr++)
1202 if ((dblk->charmap[*cptr] & NUMERAL) == 0)
1207 /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
1208 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
1210 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 51,
1211 "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
1212 PROGNAME"898", dblk->fname_sfx, lineno, suffix);
1213 DtSearchAddMessage (msgbuf);
1217 must_be_intact = FALSE;
1218 if (token[0] == '*') {
1219 must_be_intact = TRUE;
1220 /* Read next token... */
1221 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1225 /* TOKEN #3: remove-count */
/* NOTE(review): atoi() gives no error indication and the result is
 * narrowed to UCHAR, so a malformed or out-of-range count is accepted
 * silently.
 */
1226 remove_count = (UCHAR) atoi ((char *) token);
1228 /* OPTIONAL TOKEN #4: if next token is NOT a continue
1229 * symbol ('>' or '$'), then it's an append string.
1232 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1234 if (token[0] != '$' && token[0] != '>') {
1236 /* Read next token... */
1237 if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1241 /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
1242 is_last_rule = (token[0] == '$');
1244 if (debugging_loadword) {
1246 " SFX: intact?=%d stop?=%d remv=%d '%s'",
1247 (int) must_be_intact,
1252 fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
1254 fputc ('\n', aa_stderr);
1257 /* Good suffix. If we haven't had any errors yet,
1258 * add it to rules list for the first char of the suffix.
1262 prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
1263 memset (prule, 0, sizeof(PRULE));
1264 prule->suffix = (UCHAR *) strdup ((char*)suffix);
1265 prule->suflen = strlen ((char*)suffix);
1266 prule->must_be_intact = must_be_intact;
1267 prule->remove_count = remove_count;
1268 prule->is_last_rule = is_last_rule;
1270 prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
1271 prule->aplen = strlen ((char*)apndstr);
1274 prule_link = &rules_table[suffix[0]];
1276 prule_link = &(*prule_link)->link;
1277 *prule_link = prule;
1279 } /* end Main Read Loop */
1283 free_paice_rules (&rules_table);
1286 dblk->stem_extra = rules_table;
1288 /* Update last table entry */
1289 if (debugging_loadlang) {
1291 PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
1296 } /* load_paice_suffixes() */
1299 /************************************************/
1301 /* is_matching_rule */
1303 /************************************************/
1304 /* Subroutine of paice_stemmer().
1305 * Returns TRUE if passed rule can be applied to stem in paicebuf.
1306 * Else returns FALSE.
1308 static int is_matching_rule (PRULE *rule)
1313 if (debugging_paice)
1314 fprintf (aa_stderr, " test rule '%s':\t", rule->suffix);
1316 /* Skip rule if we've made at least one previous change
1317 * but the current rule requires an intact word.
1319 if (rule->must_be_intact && !word_is_intact) {
1320 if (debugging_paice)
1321 fputs ("word not intact...\n", aa_stderr);
1325 /* Do a backward strcmp on the suffix.
1326 * Skip rule if it doesn't match current paicebuf's ending chars.
1329 ptr = paicebuf + paicelen - 1;
1330 for (i = 0; i < j; i++) {
1331 if (*((rule->suffix) + i) != *ptr) {
1332 if (debugging_paice)
1333 fputs ("no match...\n", aa_stderr);
1339 if (debugging_paice)
1340 fputs ("match", aa_stderr);
1342 /* Set i = paicebuf length after removing and appending suffixes.
1343 * Used to algorithmically test remaining stem length
1344 * after tentative application of rule.
1346 i = paicelen - (rule->remove_count - rule->aplen);
1349 if (debugging_paice)
1350 fputs (", but stem too short...\n", aa_stderr);
1355 if (IS_VOWEL (paicebuf[0])) {
1356 if (debugging_paice)
1357 fputs (", and short vowel stem valid.\n", aa_stderr);
1361 if (debugging_paice)
1362 fputs (", but consonant stem too short...\n", aa_stderr);
1367 /* Remaining stem is at least 3 chars.
1368 * If it contains a vowel anywhere, it's valid.
1369 * (A 'Y' after the first char counts as a vowel).
1370 * Otherwise it's not.
1372 for (j=0; j<i; j++) {
1373 if (IS_VOWEL (paicebuf[j])) {
1375 if (debugging_paice)
1376 fputs (", and remaining stem valid.\n", aa_stderr);
1379 if (j > 0 && paicebuf[j] == 'Y')
1383 if (debugging_paice)
1384 fputs (", but remaining stem all consonants.\n", aa_stderr);
1386 } /* is_matching_rule() */
1389 /************************************************/
1393 /************************************************/
1394 /* Given a word token (ALREADY UPPERCASE) in a single byte
1395 * language such as the output of teskey_parser,
1396 * generates 'stem' by repeated suffix removal.
1397 * Returns stem token in a static buffer valid
1398 * until next call to paice_stemmer or null_stemmer.
1399 * Returned stem might be the original unmodified word.
1400 * Returned stem might also be empty string.
1401 * Returned stem is *never* NULL, even if wordin == NULL.
1402 * Input buffer will not be modified; does not use strtok.
1403 * All variables are static for speeeeeeed.
1405 static char *paice_stemmer (char *wordin, DBLK *dblk)
1408 PRULE *rule, **rules_table;
1415 if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
1416 fprintf (aa_stderr, catgets (dtsearch_catd, MS_lang, 31,
1417 "%s Stemmer suffixes file never loaded.\n"),
1422 /* The max length of a stem is bufsz - 2:
1423 * one for the terminating \0 and one for the
1424 * prefix ^O that identifies a stem. (But this
1425 * stemmer doesn't actually insert the ^O now.)
1427 strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD);
1428 paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0;
1429 paice_charmap = dblk->charmap;
1430 word_is_intact = TRUE;
1432 for (;;) { /*-------- Main Stemming Loop ---------*/
1434 paicelen = strlen ((char*)paicebuf);
1435 finalc = *(paicebuf + paicelen - 1);
1436 if (debugging_paice) {
1438 "paice: '%s', rules list '%c' for database '%s'\n",
1439 paicebuf, finalc, dblk->name);
1443 /* Look for a matching rule */
1444 if ((rule = rules_table [finalc]) == NULL) {
1445 if (debugging_paice)
1446 fputs (" list is null, stop.\n", aa_stderr);
1450 if (is_matching_rule (rule))
1455 if (debugging_paice)
1456 fprintf (aa_stderr, " rules list '%c' is exhausted, stop.\n",
1461 /* Apply rule that matched */
1462 if (debugging_paice)
1463 fputs (" apply rule: ", aa_stderr);
1464 if (rule->remove_count == 0) {
1465 if (debugging_paice)
1466 fputs ("remove_count = 0, stop.\n", aa_stderr);
1470 paicebuf [paicelen - rule->remove_count] = 0;
1472 strcat ((char*)paicebuf, (char*)rule->apndstr);
1473 paicelen = strlen ((char*)paicebuf);
1474 word_is_intact = FALSE; /* we've removed at least 1 suffix */
1475 if (debugging_paice)
1476 fprintf (aa_stderr, "--> '%s'", paicebuf);
1478 /* Terminate algorithm if rule says so.
1479 * Otherwise continue removing suffixes
1480 * from this partially stemmed word.
1482 if (rule->is_last_rule) {
1483 if (debugging_paice)
1484 fputs (", stop flag is set, stop.\n", aa_stderr);
1487 if (debugging_paice)
1488 fputc ('\n', aa_stderr);
1490 } /* end Main Stemming Loop */
1492 if (debugging_paice) {
1493 fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
1496 return (char *) paicebuf;
1497 } /* paice_stemmer() */
1500 /************************************************/
1504 /************************************************/
1505 /* Stemmer that just copies and returns passed word.
1506 * In effect, the passed word IS its own stem.
1507 * Output buffer valid until next call to null_stemmer
1510 char *null_stemmer (char *word, DBLK *dblk)
1516 strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD);
1517 paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
1518 return (char *) paicebuf;
1519 } /* null_stemmer() */
1522 /************************************************/
1526 /************************************************/
1527 /* Converts passed string to uppercase in place.
1528 * Classic strupr() function using teskey charmaps.
1530 static char *euro_lstrupr (char *string, DBLK *dblk)
1532 static int *charmap;
1534 charmap = dblk->charmap;
1535 for (s=(UCHAR *)string; *s; s++)
1536 *s = charmap[*s] & 0xff;
1541 /************************************************/
1545 /************************************************/
1546 /* Just returns passed string. Used where uppercase
1547 * conversions are not required for a language.
1549 char *null_lstrupr (char *s, DBLK *d)
1553 /************************************************/
1557 /************************************************/
1558 /* Loads a dblk with a specific language's
1559 * structures and function pointers.
1560 * Does not reload structures previously loaded in
1561 * other dblks on dblist if derived from identical files.
1562 * But always loads structures if passed dblist is NULL.
1563 * Presumes dblk already partially initialized with mandatory fields:
1564 * name, path, language.
1565 * May also be preinitialized with optional fields:
1566 * minwordsz, maxwordsz.
1567 * Returns TRUE if all successful.
1568 * Otherwise returns FALSE with err msgs on ausapi_msglist.
/* load_language: installs the language specific charmap, parser, stemmer
 * and uppercase function in dblk, fills in default word size limits, and
 * loads the stop list, include list and Paice suffix rules for it.
 * dblist is handed on unchanged to each loader.
 * NOTE(review): several short lines of this function (the switch header,
 * its case labels, and the statements taken when a loader fails) are not
 * present in this listing; confirm them against the original source.
 */
1570 int load_language (DBLK *dblk, DBLK *dblist)
/* Cached language code; it selects the charmap and the default
 * maximum word size below.
 */
1575 int language = dblk->dbrec.or_language;
/* Optional trace naming the language number, language name and dblk. */
1577 if (debugging_loadlang)
1579 "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
1580 (int)dblk->dbrec.or_language,
1581 language_name (dblk->dbrec.or_language),
1582 NULLORSTR(dblk->name));
1585 * Note: Load list functions must be called
1586 * AFTER charmap and lstrupr are loaded.
/* English gets the ASCII charmap; any other language reaching this
 * code gets the Latin charmap.
 */
1595 dblk->charmap = (language == DtSrLaENG)?
1596 ascii_charmap : latin_charmap;
/* Function hooks: teskey parser, Paice stemmer, charmap based upper-caser. */
1597 dblk->parser = teskey_parser;
1598 dblk->stemmer = paice_stemmer;
1599 dblk->lstrupr = euro_lstrupr;
/* Word size limits are defaulted only while still zero, so values
 * preinitialized by the caller are kept.  German gets the long word
 * maximum, every other language the short word maximum.
 */
1600 if (dblk->dbrec.or_maxwordsz == 0)
1601 dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
1602 MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
1603 if (dblk->dbrec.or_minwordsz == 0)
1604 dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
/* Loaders run in this order: stop list, include list, suffix rules.
 * Each is tested for failure; the failure action itself is not
 * visible in this listing.
 */
1606 if (!load_stop_list (dblk, dblist))
1608 if (!load_include_list (dblk, dblist))
1610 if (!load_paice_suffixes (dblk, dblist))
/* Japanese loading is delegated entirely to load_jpn_language(). */
1618 return load_jpn_language (dblk, dblist);
1621 /* Try loading a custom 'user' language.
1622 * If he failed to provide a loader function,
1623 * the dummy custom loader will tell him so.
1624 * If he provided one but it can't load this language,
1625 * it should return its own error msgs.
1627 return load_custom_language (dblk, dblist);
1629 } /* end switch (language) */
1632 } /* load_language() */
1635 /************************************************/
1637 /* unload_language */
1639 /************************************************/
1640 /* Frees storage for structures allocated by load_language().
1641 * Called when engine REINITs due to change in site config file
1643 * Duplicate wordtrees are not unloaded because they
1644 * will have already been unloaded in a previous dblk.
/* unload_language: releases the language structures attached to dblk,
 * dispatching on the dblk's language code.
 * NOTE(review): the case labels and some short lines of this function
 * are not present in this listing; confirm them against the original.
 */
1646 void unload_language (DBLK *dblk)
1648 switch (dblk->dbrec.or_language) {
/* The charmap pointer is only cleared here; it is not freed. */
1655 dblk->charmap = NULL;
/* Stop list: freed only when LF_DUP_STP is clear, i.e. when this dblk
 * is not flagged as using a list already loaded for another dblk.
 * The pointer is cleared and the flag reset afterwards.
 */
1656 if ((dblk->lang_flags & LF_DUP_STP) == 0)
1657 free_wordtree (&dblk->stoplist);
1659 dblk->stoplist = NULL;
1660 dblk->lang_flags &= ~LF_DUP_STP;
/* Include list: same treatment, keyed on LF_DUP_INC. */
1662 if ((dblk->lang_flags & LF_DUP_INC) == 0)
1663 free_wordtree (&dblk->inclist);
1665 dblk->inclist = NULL;
1666 dblk->lang_flags &= ~LF_DUP_INC;
/* Paice suffix rules table kept at stem_extra: same treatment,
 * keyed on LF_DUP_SFX and released with free_paice_rules().
 */
1668 if ((dblk->lang_flags & LF_DUP_SFX) == 0)
1669 free_paice_rules ((PRULE***)&dblk->stem_extra);
1671 dblk->stem_extra = NULL;
1672 dblk->lang_flags &= ~LF_DUP_SFX;
/* Japanese structures are released by unload_jpn_language(). */
1678 unload_jpn_language (dblk);
/* Custom 'user' language structures are released by
 * unload_custom_language().
 */
1682 unload_custom_language (dblk);
1686 } /* unload_language() */
1687 /******************** LANG.C ********************/