cde/lib/DtSearch/lang.c

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these librararies and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /*
  24  *   COMPONENT_NAME: austext
  25  *
  26  *   FUNCTIONS: euro_lstrupr
  27  *              free_wordtree
  28  *              is_concordable
  29  *              language_name
  30  *              load_include_list
  31  *              load_language
  32  *              load_paice_suffixes
  33  *              load_stop_list
  34  *              load_wordtree
  35  *              null_lstrupr
  36  *              null_stemmer
  37  *              paice_stemmer
  38  *              search_wordtree
  39  *              teskey_parser
  40  *              unload_language
  41  *
  42  *   ORIGINS: 27
  43  *
  44  *
  45  *   (C) COPYRIGHT International Business Machines Corp. 1995,1996
  46  *   All Rights Reserved
  47  *   Licensed Materials - Property of IBM
  48  *   US Government Users Restricted Rights - Use, duplication or
  49  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  50  */
  51 /******************** LANG.C ********************
  52  * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
  53  * July 1995.
  54  * Includes load_language(), unload_language(), and functions and data for
  55  * parsing and stemming European languages in DtSearch/AusText.
  56  * Incorporates p/o socrates.c, p/o proctext.c, parser.c
  57  * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
  58  * Related to similar semantic modules repackaged into semantic.c.
  59  * Paice suffix removal algorithm from C. Paice, 1990,
  60  * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
  61  *
  62  * $Log$
  63  * Revision 2.13  1996/03/25  18:55:26  miker
  64  * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  65  *
  66  * Revision 2.12  1996/03/25  17:00:19  miker
  67  * Cleanup compiler warning.
  68  *
  69  * Revision 2.11  1996/03/13  22:58:13  miker
  70  * Changed char to UCHAR several places.
  71  *
  72  * Revision 2.10  1996/03/05  16:49:58  miker
  73  * Move COMMENT_CHARS to SearchP.h.
  74  *
  75  * Revision 2.9  1996/03/05  16:31:20  miker
  76  * Added test of PA_MSGS for yacc-based boolean queries.
  77  * Made comment chars in linguistic files independent of locale.
  78  * Changed several char ptrs to unsigned char so parser will
  79  * work when compiled under default signed char compilers.
  80  * Simplified several statements with LHS *var++ for same reason.
  81  *
  82  * Revision 2.8  1996/02/05  16:16:05  miker
  83  * Restore prolog.
  84  *
  85  * Revision 2.7  1996/02/05  16:10:54  miker
  86  * load_paice_suffixes: discard .sfx lines beginning with all numeric
  87  * first token for compatibility with older file formats.
  88  *
  89  * Revision 2.6  1996/02/01  19:11:43  miker
  90  * AusText 2.1.11, DtSearch 0.3:  Major rewrite for new parsers.
  91  * Moved charmaps to new module langmap.c.  Removed hard coded
  92  * paice stemmer values--now dynamic from .sfx file.
  93  *
  94  * Revision 2.5  1995/10/26  14:55:28  miker
  95  * Added prolog.
  96  *
  97  * Revision 2.4  1995/10/19  20:54:36  miker
  98  * Increased msg buf sizes to accommodate larger database file names.
  99  *
 100  * Revision 2.3  1995/10/06  14:39:45  miker
 101  * Bug fix: coredump loading multiple databases
 102  * on Solaris.
 103  *
 104  * Revision 2.2  1995/10/03  21:39:10  miker
 105  * Changed teskey_parser, paice_stemmer, and null_stemmer
 106  * to return number of words parsed/stemmed, not just boolean.
 107  *
 108  * Revision 2.1  1995/09/22  21:00:19  miker
 109  * Freeze DtSearch 0.1, AusText 2.1.8
 110  *
 111  * Revision 1.3  1995/09/19  22:08:28  miker
 112  * Added support for loading and parsing Japanese language DtSrLaJPN.
 113  *
 114  * Revision 1.2  1995/09/05  21:34:52  miker
 115  * Fixed bug: search engine wouldn't parse words of exactly
 116  * 3 or 15 chars.
 117  *
 118  * Revision 1.1  1995/08/31  21:03:44  miker
 119  * Initial revision
 120  */
 121 #include "SearchP.h"
 122
 123 #include <limits.h>
 124 #include <stdlib.h>
 125 #include <string.h>
 126 #include <errno.h>
 127 #include <sys/stat.h>
 128
 129 #define X_INCLUDE_STRING_H
 130 #define XOS_USE_NO_LOCKING
 131 #include <X11/Xos_r.h>
 132
 133 #define PROGNAME        "LANG"
 134 #define EXT_SUFFIX      ".sfx"  /* standard paice suffix file format */
 135 #define OUTBUFSZ        6140
 136 #define SFX_DELIMS      " \t\n"
 137 #define MS_misc         1
 138 #define MS_lang         15
 139 #define IS_VOWEL(c)     ((paice_charmap [(UCHAR)c] & VOWEL) != 0)
 140
 141 /************************************************/
 142 /*                                              */
 143 /*                    PRULE                     */
 144 /*                                              */
 145 /************************************************/
 146 /* List of Paice suffix removal rules from .sfx files */
 147 typedef struct prule_t {
 148     struct prule_t   *link;     /* Ptr to next list node */
 149     UCHAR   *suffix;            /* Applicable suffix string, backwards */
 150     UCHAR   suflen;             /* Length of suffix */
 151     char    must_be_intact;     /* Optional '*'.  Rule only applies
 152                                  * to intact words */
 153     UCHAR   remove_count;       /* Number of suffix chars to remove */
 154     UCHAR   aplen;              /* Length of apndstr */
 155     UCHAR   *apndstr;           /* Optional append string */
 156     char    is_last_rule;       /* '$' terminate or '>' continue algorithm */
 157     }   PRULE;
 158
 159
 160 /************************************************/
 161 /*                                              */
 162 /*                    GLOBALS                   */
 163 /*                                              */
 164 /************************************************/
 165 int             debugging_loadlang =            FALSE;
 166 int             debugging_loadword =            FALSE;
 167 int             debugging_search_wordtree =     FALSE;
 168 int             debugging_teskey =              FALSE;
 169 int             debugging_paice =               FALSE;
 170 static int      *paice_charmap;
 171 static UCHAR    paicebuf [DtSrMAXWIDTH_HWORD + 2];
 172 static int      paicelen;
 173 static int      word_is_intact;
 174
 175 /* Language strings correspond to DtSrLa.. constants.  */
 176 static char     *lang_fnames[] = {
 177                         "eng",          /*  0 */
 178                         "eng",          /*  1  ('eng2' same files as 'eng') */
 179                         "esp",          /*  2 */
 180                         "fra",          /*  3 */
 181                         "ita",          /*  4 */
 182                         "deu",          /*  5 */
 183                         "jpn",          /*  6 */
 184                         "jpn",          /*  7  ('jpn2' same files as 'jpn' */
 185                         NULL
 186                 };
 187
 188
 189 /************************************************/
 190 /*                                              */
 191 /*                language_name                 */
 192 /*                                              */
 193 /************************************************/
 194 /* Returns language name string given language number */
 195 static char     *language_name (DtSrINT16 langno)
 196 {
 197     static char *language_names[] = {
 198                 "English-ASCII",        /*  0 = DtSrLaENG */
 199                 "English-Latin1",       /*  1 = DtSrLaENG2 */
 200                 "Spanish",              /*  2 = DtSrLaESP */
 201                 "French",               /*  3 = DtSrLaFRA */
 202                 "Italian",              /*  4 = DtSrLaITA */
 203                 "German",               /*  5 = DtSrLaDEU */
 204                 "Japanese-comp"         /*  6 = DtSrLaJPN */
 205                 "Japanese-.knj"         /*  7 = DtSrLaJPN2 */
 206                 };
 207 #if TRACY
 208     static char *language_names[10];
 209
 210     language_names[0] = catgets(dtsearch_catd, MS_lang, 50, "English-ASCII");
 211     language_names[1] = catgets(dtsearch_catd, MS_lang, 51, "English-Latin1");
 212     language_names[1] = catgets(dtsearch_catd, MS_lang, 52, "Spanish");
 213     language_names[1] = catgets(dtsearch_catd, MS_lang, 53, "French");
 214     language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "Italian");
 215     language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "German");
 216     language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "Japanese-comp");
 217     language_names[1] = catgets(dtsearch_catd, MS_lang, 54, "Japanese-.knj");
 218
 219 #endif
 220     if (langno < 0)
 221         return "INVALID!";
 222     else if (langno > DtSrLaLAST)
 223         return "(Custom Language)";
 224     else
 225         return language_names [langno];
 226 } /* language_name() */
 227
 228
 229 /************************************************/
 230 /*                                              */
 231 /*               search_wordtree                */
 232 /*                                              */
 233 /************************************************/
 234 /* Sept 1991.
 235  * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
 236  * Searches a word list in a binary WORDTREE.
 237  * Passed wordstring is presumed to be a clean,
 238  * uppercase word token string terminated by \0.
 239  * Variables are static for speeeeed.
 240  * Returns TRUE if successful search, else FALSE.
 241  * See also search_wordtree_jpn() in jpn.c
 242  */
 243 static int      search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
 244 {
 245     static int          direction;
 246     static WORDTREE     *node;
 247     static char         *cptr;
 248
 249     if (debugging_search_wordtree)
 250         fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
 251             wordstring);
 252     /* MAIN SEARCH LOOP: binary tree search */
 253     for (node = wordtree;  node != NULL;  ) {
 254         if ((direction = strcmp ((char *) wordstring, node->word)) == 0) {
 255             if (debugging_search_wordtree)
 256                 fprintf (aa_stderr, "  HIT!\n");
 257             return TRUE;
 258         }
 259         /* Descend left or right depending on word */
 260         if (debugging_search_wordtree)
 261             fprintf (aa_stderr, "  %c '%s'\n",
 262                 (direction < 0) ? 'L' : 'R', node->word);
 263         if (direction < 0)
 264             node = node->llink;
 265         else
 266             node = node->rlink;
 267     }
 268     if (debugging_search_wordtree)
 269         fprintf (aa_stderr, "  MISS.\n");
 270     return FALSE;
 271 }  /* search_wordtree() */
 272
 273
 274 /************************************************/
 275 /*                                              */
 276 /*                 teskey_parser                */
 277 /*                                              */
 278 /************************************************/
 279 /* 1989.
 280  * Teskey_parser() is derived from the former Socrates() in socrates.c.
 281  * Returns next teskey-parsed word token from a character stream.
 282  * Called from (1) dtsrindex, where readchar_ftext() cofunction
 283  * reads the .fzk file document 'stream', or (2) search engine
 284  * query parsers, where readchar_string() cofunction 'reads'
 285  * from the query string.
 286  * (The word hiliting parser does not directly call teskey_parser; it has
 287  * its own simplified equivalent to the parsing algorithms herein.)
 288  *
 289  * First call passes args in PARG structure.  This resets end of
 290  * text block (ETX) flag, resets 'offset' counter to zero, etc.
 291  * Subsequent calls should pass NULL, and parser returns
 292  * next token in block, until reader cofunction reads ETX,
 293  * ie special ETX char ('\0').  Subsequent calls to parser
 294  * return NULL meaning "no tokens left in current stream".
 295  * Reader cofunctions tolerate repeated calls after
 296  * the first ETX, still returning '\0'.
 297  *
 298  * This parser presumes all incoming text is unformatted.
 299  * Since parser accesses streams a char at a time it does
 300  * not require periodic line feeds or anything else.
 301  *
 302  * Parser also returns offset information: number of bytes
 303  * since beginning of text block.
 304  *
 305  * Variables are static for speeeeeeed.
 306  *
 307  * OUTPUT FORMAT:  NULL or a static C string containing a single
 308  * parsed word token.  Word buffer reused at next call.
 309  * Each word is translated as follows:
 310  *      All alphas TO UPPERCASE.
 311  *      Teskey algorithm used to find word boundaries.
 312  *      Always keeps include-list words.
 313  *      Throws away stoplist words, very short words, and very long words.
 314  *      All intervening nonconcordables discarded.
 315  *
 316  * There is a slight mod to the published Teskey algorithm.
 317  * Words can begin with optionally concordable chars
 318  * but not end with them.  For example if '-' is optionally
 319  * concordable, '-foo-' will be parsed into '-foo'.
 320  */
 321 char    *teskey_parser (PARG *parg)
 322 {
 323     static READCFP      cofunction;
 324     static void         *cofunction_arg;
 325     static DBLK         *dblk =         NULL;
 326     static UCHAR        *outbuf =       NULL;
 327     static size_t       outbufsz =      0;
 328     static UCHAR        *endmaxword;    /* end largest possible output word */
 329     static UCHAR        *outp;          /* next loc in outbuf */
 330     static char         *begw;  /* beginning of a word in the input buffer */
 331     static char         *endw;  /* end of a word in the input buffer */
 332     static int          *charmap;
 333     static int          minwordsz, maxwordsz;
 334     static int          wordlen;
 335     static enum {BETW_WORDS, IN_WORD, TOO_LONG}
 336                         tpstate;
 337     static long         *offsetp, readcount, candidate_offset;
 338     static int          is_hiliting;
 339     static int          add_msgs;
 340
 341     /* If first call for current text block... */
 342     if (parg) {
 343         dblk = parg->dblk;
 344         minwordsz = dblk->dbrec.or_minwordsz;
 345         maxwordsz = dblk->dbrec.or_maxwordsz;
 346         charmap = dblk->charmap;
 347         offsetp = parg->offsetp;
 348         is_hiliting = (parg->flags & PA_HILITING);
 349         add_msgs = (parg->flags & PA_MSGS);
 350         if (charmap == NULL) {
 351             fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 4,
 352                 "%s dblk not initialized.\n"),
 353                 PROGNAME"801");
 354             DtSearchExit (55);
 355         }
 356
 357         if (parg->string) {
 358             cofunction_arg = parg->string;
 359             cofunction = (READCFP) readchar_string;
 360         }
 361         else if (parg->ftext) {
 362             cofunction_arg = parg;
 363             cofunction = (READCFP) readchar_ftext;
 364         }
 365         else {
 366             fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 5,
 367                 "%s Program Error: parg contains neither file nor string.\n"),
 368                 PROGNAME"327");
 369             DtSearchExit (27);
 370         }
 371
 372         if (outbufsz <= maxwordsz) {
 373             if (outbuf)
 374                 free (outbuf);
 375             outbufsz = maxwordsz + 8;
 376             outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
 377         }
 378         endmaxword = outbuf + maxwordsz;
 379         if (debugging_teskey)
 380             fprintf (aa_stderr,
 381                 "teskey: start of text block, maxwsz=%ld outbufsz=%ld\n",
 382                 maxwordsz, outbufsz);
 383         readcount = 0L;
 384     }
 385
 386     /* CANDIDATE WORD LOOP:  Read text chars into outbuf.
 387      * Exit loop when outbuf contains one candidate token or at ETX.
 388      */
 389 READ_ANOTHER_WORD:
 390     outp = outbuf;
 391     tpstate = BETW_WORDS;
 392     while (*outp = cofunction (cofunction_arg)) {
 393         readcount++;
 394         cofunction_arg = NULL;
 395
 396         /*------------- BETW_WORDS State ------------
 397          * Reader is between word tokens.
 398          */
 399         if (tpstate == BETW_WORDS) {
 400             /*
 401              * Discard nonconcordable chars between words.
 402              */
 403             if ((charmap[*outp] & NON_CONCORD) != 0)
 404                 continue;
 405             /*
 406              * Fully concordable char is definite start of new word.
 407              * Convert to uppercase and go get next char.
 408              */
 409             if ((charmap[*outp] & CONCORDABLE) != 0) {
 410                 *outp = charmap[*outp] & 0x00ff;
 411                 outp++;
 412                 candidate_offset = readcount;
 413                 tpstate = IN_WORD;
 414                 continue;
 415             }
 416             /*
 417              * Must be optionally concordable.  It can only
 418              * start a new word if next char is concordable.
 419              * If so, convert a fully concordable char
 420              * to uppercase and go get next char.
 421              * Otherwise discard just like non_concord.
 422              */
 423             outp++;
 424             if (*outp = cofunction(NULL))
 425                 readcount++;
 426             if ((charmap[*outp] & CONCORDABLE) != 0) {
 427                 *outp = charmap[*outp] & 0x00ff;
 428                 outp++;
 429                 candidate_offset = readcount - 1;
 430                 tpstate = IN_WORD;
 431                 continue;
 432             }
 433             else {
 434                 outp--;
 435                 continue;
 436             }
 437         } /* endif BETW_WORDS */
 438
 439
 440         /*------------- IN_WORD State ------------
 441          * Reader is in middle of a word.
 442          * Convert all concordables to uppercase and append.
 443          * Terminate word at first non_concord.
 444          * Non_concords treatment depends on next char.
 445          */
 446         else if (tpstate == IN_WORD) {
 447             if ((charmap[*outp] & CONCORDABLE) != 0) {
 448                 if (outp < endmaxword) {
 449                     *outp = charmap[*outp] & 0x00ff;
 450                     outp++;
 451                 }
 452                 else {
 453                     tpstate = TOO_LONG;
 454                     if (debugging_teskey)
 455                         fprintf (aa_stderr,
 456                                 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
 457                                 candidate_offset-1, outbuf);
 458                     if (add_msgs) {
 459                         char    msgbuf [DtSrMAXWIDTH_HWORD + 100];
 460                         sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 8,
 461                             "%s '%.*s...' is larger\n"
 462                             "than the maximum word size of database '%s'.") ,
 463                             PROGNAME"449", maxwordsz,
 464                             parg->string, dblk->label);
 465                         DtSearchAddMessage (msgbuf);
 466                         return NULL;
 467                     }
 468                     outbuf[0] = 0;
 469                     outp = outbuf;
 470                 }
 471                 continue;
 472             }
 473             if ((charmap[*outp] & NON_CONCORD) != 0) {
 474                 *outp = '\0';
 475                 break;
 476             }
 477             /* Must be opt_concord... */
 478             outp++;
 479             if (*outp = cofunction(NULL))
 480                 readcount++;
 481             if ((charmap[*outp] & CONCORDABLE) != 0) {
 482                 if (outp < endmaxword) {
 483                     *outp = charmap[*outp] & 0x00ff;    /* uppercase */
 484                     outp++;
 485                 }
 486                 else {
 487                     tpstate = TOO_LONG;
 488                     if (debugging_teskey)
 489                         fprintf (aa_stderr,
 490                                 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
 491                                 candidate_offset-1, outbuf);
 492                     outbuf[0] = 0;
 493                     outp = outbuf;
 494                 }
 495                 continue;
 496             }
 497             else {      /* next char NOT concordable...*/
 498                 *(--outp) = '\0';
 499                 break;
 500             }
 501         } /* endif IN_WORD */
 502
 503
 504         /*------------- TOO_LONG State ------------
 505          * Reader is in middle of a word that exceeds max word size.
 506          * Discard all concordables and opt_concords until we
 507          * can get between words again with a clear non_concord.
 508          */
 509         else if (tpstate == TOO_LONG) {
 510             if ((charmap[*outp] & NON_CONCORD) != 0) {
 511                 outp = outbuf;
 512                 tpstate = BETW_WORDS;
 513             }
 514             continue;
 515         }
 516
 517         /*------------- UNKNOWN State ------------*/
 518         else {
 519             fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 10,
 520                 "%s Program Error: Unknown parser state.\n"),
 521                 PROGNAME"306");
 522             DtSearchExit (26);
 523         }
 524     } /* end read loop for next CANDIDATE WORD */
 525
 526     /*---------- TEST FOR ETX -------------*/
 527     if (outbuf[0] == 0) {
 528         if (debugging_teskey)
 529             fprintf (aa_stderr, "teskey: etx\n");
 530         if (add_msgs) {
 531             char        msgbuf [200];
 532             sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 12,
 533                 "%s '%.120s' is not a valid word in database '%s'.") ,
 534                 PROGNAME"506", parg->string, dblk->label);
 535             DtSearchAddMessage (msgbuf);
 536         }
 537         return NULL;
 538     }
 539
 540     wordlen = strlen ((char *) outbuf);
 541     candidate_offset--; /* token offset is one less than number of reads */
 542     if (debugging_teskey)
 543         fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
 544             candidate_offset, outbuf);
 545
 546     if (is_hiliting) {
 547         if (debugging_teskey)
 548             fprintf (aa_stderr, ", (hiliting, skip tree searches)");
 549         goto GOOD_WORD;
 550     }
 551
 552     /*--------- INCLUDE LIST ----------
 553      * Search before testing for stoplist or minimum word length.
 554      */
 555     if (dblk->inclist != NULL) {
 556         if (search_wordtree (dblk->inclist, outbuf)) {
 557             if (debugging_teskey)
 558                 fprintf (aa_stderr, ", (INCLUDE LIST)");
 559             goto GOOD_WORD;
 560         }
 561     }
 562
 563     /*--------- TOO SHORT -----------*/
 564     if (wordlen < minwordsz) {
 565         if (debugging_teskey)
 566             fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
 567         if (add_msgs) {
 568             char        msgbuf [200];
 569             sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 17,
 570                 "%s '%s' is less than the\n"
 571                 "minimum word size of database '%s'.") ,
 572                 PROGNAME"543", parg->string, dblk->label);
 573             DtSearchAddMessage (msgbuf);
 574             return NULL;
 575         }
 576         goto READ_ANOTHER_WORD;
 577     }
 578
 579     /*----------- STOP LIST -------------*/
 580     if (dblk->stoplist != NULL) {
 581         if (search_wordtree (dblk->stoplist, outbuf)) {
 582             if (debugging_teskey)
 583                 fprintf (aa_stderr, ", (STOP LIST)\n");
 584             if (add_msgs) {
 585                 char    msgbuf [200];
 586                 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 19,
 587                     "%s The word '%s' is not indexed in database '%s'.") ,
 588                     PROGNAME"558", parg->string, dblk->label);
 589                 DtSearchAddMessage (msgbuf);
 590                 return NULL;
 591             }
 592             goto READ_ANOTHER_WORD;
 593         }
 594     }
 595
 596 GOOD_WORD:
 597     /* Word is correctly parsed and passes all dblk filters. */
 598     if (debugging_teskey)
 599         fprintf (aa_stderr, ", ...good word\n");
 600     if (offsetp)
 601         *offsetp = candidate_offset;
 602     return (char *) outbuf;
 603 } /* teskey_parser() */
 604
 605
 606 /************************************************/
 607 /*                                              */
 608 /*                 is_concordable               */
 609 /*                                              */
 610 /************************************************/
 611 /* Verifies passed word token is teskey-concordable
 612  * in code page of passed charmap.  Used in validating
 613  * word files.  Returns TRUE if all chars concordable
 614  * or optionally concordable, else returns FALSE.
 615  */
 616 int     is_concordable (char *word, int *charmap)
 617 {
 618     UCHAR       *cptr;
 619     for (cptr = (UCHAR *)word;  *cptr != 0;  cptr++)
 620         if ((charmap[*cptr] & NON_CONCORD) != 0)
 621             break;
 622     return (*cptr == 0);
 623 } /* is_concordable() */
 624
 625
 626 /************************************************/
 627 /*                                              */
 628 /*                 load_wordtree                */
 629 /*                                              */
 630 /************************************************/
 631 /* Called by load_stop_list(), load_include_list(), etc,
 632  * to read an appropriate word list file into binary tree structures.
 633  *
 634  * INPUT FILE FORMAT:  One word per line, all chars teskey concordable.
 635  * Preferred order is frequency of occurrence in the corpus
 636  * to make searches efficient.  Otherwise the words should at least
 637  * be in random order or an order that will approximate a binary search.
 638  * If first char is any of COMMENT_CHARS, line is ignored as comments.
 639  * Ascii spaces, tabs, or newline delimits the first word token--
 640  * anything else on the line is ignored as comments.
 641  * Optionally characters in word token will be checked for teskey
 642  * concordability.
 643  *
 644  * RETURNS 0 if file successfully loaded, returns 1 if file missing,
 645  * returns 2 and messages in global msglist if file has fatal errors.
 646  */
 647 int     load_wordtree (
 648                     WORDTREE    **treetop,
 649                     DBLK        *dblk,
 650                     char        *fname,
 651                     int         do_teskey_test)
 652 {
 653     int         i;
 654     int         errcount;
 655     int         is_duplicate;
 656     long        linecount = 0;
 657     char        *token;
 658     char        *cptr;
 659     char        readbuf [256];
 660     char        sprintbuf [_POSIX_PATH_MAX + 1024];
 661     FILE        *fileid;
 662     WORDTREE    *new;
 663     WORDTREE    **this_link;
 664     _Xstrtokparams      strtok_buf;
 665
 666     if (debugging_loadlang)
 667         fprintf (aa_stderr, PROGNAME"1071 "
 668             "load_wordtree: db=%s fname='%s'\n",
 669             NULLORSTR(dblk->name), NULLORSTR(fname));
 670
 671     if ((fileid = fopen (fname, "rt")) == NULL) {
 672         /* Not being able to find the file is not an error.
 673          * We indicate that with the return code.
 674          * But any other error (like permissions) is fatal.
 675          */
 676         if (errno == ENOENT) {
 677             if (debugging_loadlang)
 678                 fputs ("  ...file not found.\n", aa_stderr);
 679             return 1;
 680         }
 681         else {
 682             sprintf (sprintbuf,
 683                 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
 684                 PROGNAME"362", fname, strerror(errno));
 685             DtSearchAddMessage (sprintbuf);
 686             return 2;
 687         }
 688     }
 689
 690     /*--------- Main Read Loop ----------*/
 691     errcount = 0;
 692     while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
 693         linecount++;
 694         /*
 695          * Ignore comment lines beginning with punctuation char.
 696          * Ignore empty lines (strtok returns NULL, no tokens).
 697          * Otherwise first or only word on line is the desired word.
 698          */
 699         if (strchr (COMMENT_CHARS, readbuf[0]))
 700             continue;
 701         if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
 702             continue;
 703         dblk->lstrupr (token, dblk);
 704
 705         if (debugging_loadword)
 706             fprintf (aa_stderr, "  WORD: '%s'  ", token);
 707
 708         /* If requested confirm all chars are teskey-concordable. */
 709         if (do_teskey_test)
 710             if (!is_concordable (token, dblk->charmap)) {
 711                 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 400,
 712                     "%s: %s, line %ld: Invalid chars in word '%s'."),
 713                     PROGNAME"400", fname, linecount, token);
 714                 DtSearchAddMessage (sprintbuf);
 715                 errcount++;
 716                 continue;
 717             }
 718
 719         /* Unless we've already detected some errors,
 720          * allocate a new node and load its data fields.
 721          */
 722         if (errcount)
 723             continue;
 724         i = strlen (token);
 725         new = austext_malloc (sizeof(WORDTREE) + i + 4,
 726             PROGNAME"104", NULL);
 727         new->llink = NULL;
 728         new->rlink = NULL;
 729         new->len = i;
 730         new->word = (void *) (new + 1);
 731         strcpy (new->word, token);
 732
 733         /* Descend binary tree and insert in correct alphabetical place */
 734         is_duplicate = FALSE;
 735         for (this_link = treetop;  *this_link != NULL;  ) {
 736             i = strcmp (new->word, (*this_link)->word);
 737
 738             /* test for duplicate word */
 739             if (i == 0) {
 740                 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 423,
 741                     "%s Word '%s' in '%s' is a duplicate."),
 742                     PROGNAME"423", token, fname);
 743                 DtSearchAddMessage (sprintbuf);
 744                 /* duplicates aren't fatal, just ignore the word */
 745                 is_duplicate = TRUE;
 746                 break;  /* no point in continuing descent */
 747             }
 748
 749             /* Descend tree to find correct insertion point */
 750             if (debugging_loadword)
 751                 fputc(((i < 0)? 'L' : 'R'), aa_stderr);
 752             this_link = (WORDTREE **) ((i < 0) ?
 753                 &(*this_link)->llink : &(*this_link)->rlink);
 754         } /* end forloop to find tree insertion point */
 755
 756         /* Don't link anything if error found while descending tree */
 757         if (is_duplicate) {
 758             if (debugging_loadword)
 759                 fputs (" duplicate!\n", aa_stderr);
 760             free (new);
 761             continue;
 762         }
 763
 764         /* Insert new node at current location in tree */
 765         *this_link = new;
 766         if (debugging_loadword)
 767             fputs(" .\n", aa_stderr);
 768     }   /* end of read loop */
 769
 770     fclose (fileid);
 771
 772     if (errcount) {
 773         if (debugging_loadlang)
 774             fprintf (aa_stderr,
 775                 PROGNAME"1186 load word file '%s' failed.\n", fname);
 776         return 2;
 777     }
 778     else {
 779         if (debugging_loadlang)
 780             fprintf (aa_stderr,
 781                 PROGNAME"1193 load word file '%s' successful.\n", fname);
 782         return 0;
 783     }
 784 }  /* load_wordtree() */
 785
 786
 787 /************************************************/
 788 /*                                              */
 789 /*                free_wordtree                 */
 790 /*                                              */
 791 /************************************************/
 792 /* Formerly free_bintree() in msgutil.c.
 793  * Frees storage for all nodes in a WORDTREE and
 794  * sets its top-of-list pointer to NULL.
 795  * Works only for node structures where all memory
 796  * was allocated in a single call to malloc().
 797  * Uses link inversion traversal (eg, Data Structure Techniques,
 798  * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
 799  * at preorder visit, and node is freed at postorder visit.
 800  */
 801 static void     free_wordtree (WORDTREE ** wordtree_head)
 802 {
 803     WORDTREE        *next;
 804     WORDTREE        *prev = NULL;
 805     WORDTREE        *pres = *wordtree_head;
 806
 807     if (*wordtree_head == NULL)
 808         return;
 809
 810 DESCEND_LEFT:
 811     pres->word = (void *) 0;    /* preorder visit:  TAG = 0 */
 812     next = pres->llink;
 813     if (next != NULL) {
 814         pres->llink = prev;
 815         prev = pres;
 816         pres = next;
 817         goto DESCEND_LEFT;
 818     }
 819 DESCEND_RIGHT:
 820      next = pres->rlink;
 821     if (next != NULL) {
 822         pres->word = (void *) 1;        /* TAG = 1 */
 823         pres->rlink = prev;
 824         prev = pres;
 825         pres = next;
 826         goto DESCEND_LEFT;
 827     }
 828 POSTORDER_VISIT:
 829     free (pres);
 830     if (prev == NULL) { /* end of algorithm? */
 831         *wordtree_head = NULL;
 832         return;
 833     }
 834     if (prev->word == (void *) 0) {     /* go up left leg */
 835         next = prev->llink;
 836         pres = prev;
 837         prev = next;
 838         goto DESCEND_RIGHT;
 839     }
 840     else {      /* go up right leg */
 841         next = prev->rlink;
 842         prev->word = (void *) 0;        /* restore TAG = 0 */
 843         pres = prev;
 844         prev = next;
 845         goto POSTORDER_VISIT;
 846     }
 847 }  /* free_wordtree() */
 848
 849
 850 /************************************************/
 851 /*                                              */
 852 /*              load_include_list               */
 853 /*                                              */
 854 /************************************************/
 855 /* Builds include list by reading include file
 856  * into a binary tree structure.
 857  * Unlike stoplists, include-lists are optional.
 858  * Also unlike stoplists, there are no language default include-lists.
 859  * 'dblist' may be NULL.
 860  * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 861  */
 862 static int      load_include_list (DBLK *dblk, DBLK *dblist)
 863 {
 864     int         i;
 865     int         filename_was_null = (dblk->fname_inc == NULL);
 866     DBLK        *db;
 867     char        sprintbuf [512];
 868
 869     dblk->inclist = NULL;       /* just to be sure */
 870
 871     if (debugging_loadlang)
 872         fprintf (aa_stderr,
 873             PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
 874             NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
 875             language_name(dblk->dbrec.or_language));
 876
 877     /* If file name not provided, generate one based on
 878      * dblk's path, database name, and default extension.
 879      */
 880     if (filename_was_null) {
 881         if (dblk->name[0] == 0) {
 882             dblk->fname_inc = "";
 883             dblk->inclist = NULL;
 884             if (debugging_loadlang)
 885                 fprintf (aa_stderr, PROGNAME"1339 "
 886                     "No inclist because neither fname nor dbname provided.\n");
 887             return TRUE;
 888         }
 889         if (dblk->path == NULL)
 890             dblk->path = strdup("");
 891         dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
 892              PROGNAME"1187", NULL);
 893         strcpy (dblk->fname_inc, dblk->path);
 894         ensure_end_slash (dblk->fname_inc);
 895         strcat (dblk->fname_inc, dblk->name);
 896         strcat (dblk->fname_inc, EXT_INCLIST);
 897     }
 898     if (debugging_loadlang)
 899         fprintf (aa_stderr,
 900             PROGNAME"1350 Include list file name = '%s'.\n",
 901             dblk->fname_inc);
 902
 903     /* Dont reload the same file if it's already
 904      * been loaded into a previous dblk in a list.
 905      * Code works just fine if dblist == NULL.
 906      */
 907     for (db = dblist;  db != NULL;  db = db->link) {
 908         if (db == dblk || db->fname_inc == NULL)
 909             continue;
 910         if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
 911             dblk->inclist = db->inclist;
 912             dblk->lang_flags |= LF_DUP_INC;
 913             if (debugging_loadlang)
 914                 fprintf (aa_stderr, PROGNAME"1363 "
 915                     "Using previously loaded inclist, db='%s'.\n",
 916                     dblk->name);
 917             return TRUE;
 918         }
 919     }
 920
 921     /* Include list is optional so missing file is
 922      * not an error unless caller named a specific file.
 923      */
 924     i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
 925     switch (i) {
 926         case 0:
 927             return TRUE;
 928
 929         case 1:
 930             if (filename_was_null) {
 931                 dblk->fname_inc = "";
 932                 dblk->inclist = NULL;
 933                 return TRUE;
 934             }
 935             else {
 936                 sprintf (sprintbuf,
 937                     catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
 938                     PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
 939                 DtSearchAddMessage (sprintbuf);
 940                 return FALSE;
 941             }
 942
 943         default:
 944             return FALSE;
 945     }
 946 } /* load_include_list() */
 947
 948
 949 /************************************************/
 950 /*                                              */
 951 /*               load_stop_list                 */
 952 /*                                              */
 953 /************************************************/
 954 /* Builds stoplist by reading stoplist file into a
 955  * binary tree structure.  File name can be
 956  *    (1) passed in dblk.fname_stp,
 957  *    (2) generated from dblk path, name, and '.stp',
 958  *    (3) default for dblk path, language, and '.stp'.
 959  * 'dblist' may be NULL.
 960  * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 961  */
 962 static int      load_stop_list (DBLK *dblk, DBLK *dblist)
 963 {
 964     int         i;
 965     DBLK        *db;
 966     char        sprintbuf [_POSIX_PATH_MAX + 512];
 967     struct stat statbuf;
 968
 969     dblk->stoplist = NULL;      /* just to be sure */
 970
 971     if (debugging_loadlang)
 972         fprintf (aa_stderr,
 973             PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
 974             NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
 975             language_name(dblk->dbrec.or_language));
 976
 977     /* If file name not provided, generate one based on
 978      * dblk's path, database name, and default extension.
 979      * And if that doesn't work, generate one based on
 980      * dblk's path, language, and default extension.
 981      */
 982     if (dblk->fname_stp == NULL) {
 983         if (dblk->path == NULL)
 984             dblk->path = strdup("");
 985         dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
 986              PROGNAME"919", NULL);
 987
 988         strcpy (dblk->fname_stp, dblk->path);
 989         ensure_end_slash (dblk->fname_stp);
 990         strcat (dblk->fname_stp, dblk->name);
 991         strcat (dblk->fname_stp, EXT_STOPLIST);
 992         errno = 0;
 993         stat (dblk->fname_stp, &statbuf);
 994         if (errno == ENOENT) {
 995             strcpy (dblk->fname_stp, dblk->path);
 996             ensure_end_slash (dblk->fname_stp);
 997             strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
 998             strcat (dblk->fname_stp, EXT_STOPLIST);
 999         }
1000     }
1001     if (debugging_loadlang)
1002         fprintf (aa_stderr,
1003             PROGNAME"1448 Stoplist file name = '%s'.\n",
1004             dblk->fname_stp);
1005
1006     /* Dont reload the same file if it's already
1007      * been loaded into a previous dblk in a list.
1008      * Code works just fine if dblist == NULL.
1009      */
1010     for (db = dblist;  db != NULL;  db = db->link) {
1011         if (db == dblk || db->fname_stp == NULL)
1012             continue;
1013         if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
1014             dblk->stoplist = db->stoplist;
1015             dblk->lang_flags |= LF_DUP_STP;
1016             if (debugging_loadlang)
1017                 fprintf (aa_stderr, PROGNAME"1460 "
1018                     "Using previously loaded stoplist, db='%s'.\n",
1019                     dblk->name);
1020             return TRUE;
1021         }
1022     }
1023
1024     /* Stop lists are mandatory--a missing stoplist is fatal. */
1025     i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
1026     if (i == 1) {
1027         sprintf (sprintbuf,
1028             catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
1029             PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
1030         DtSearchAddMessage (sprintbuf);
1031     }
1032     return (i == 0);
1033 } /* load_stop_list() */
1034
1035
1036 /************************************************/
1037 /*                                              */
1038 /*              free_paice_rules                */
1039 /*                                              */
1040 /************************************************/
1041 /* Frees all allocated storage for a set of paice rules, typically
1042  * loaded at dblk.stem_extra.  Called by REINIT routines and
1043  * by load_paice_suffixes() when cleaning up after an error.
1044  */
1045 static void     free_paice_rules (PRULE ***rules_table_ptr)
1046 {
1047     int         i;
1048     PRULE       *p, **linkp;
1049     PRULE       **rules_table;
1050
1051     if (*rules_table_ptr == NULL)
1052         return;
1053     rules_table = *rules_table_ptr;
1054     for (i=0; i<256; i++) {
1055         if (rules_table[i] == NULL)
1056             continue;
1057         p = rules_table[i];
1058         while (p) {
1059             linkp = &p->link;
1060             free (p->suffix);
1061             if (p->apndstr)
1062                 free (p->apndstr);
1063             free (p);
1064             p = *linkp;
1065         }
1066     }
1067     free (rules_table);
1068     *rules_table_ptr = NULL;
1069     return;
1070 } /* free_paice_rules() */
1071
1072
1073 /************************************************/
1074 /*                                              */
1075 /*              load_paice_suffixes             */
1076 /*                                              */
1077 /************************************************/
1078 /* Loads European language paice stemmer suffix rules
1079  * into dblk.stem_extra as an array of ptrs to linked lists.
1080  * Like stop lists, sfx files can be
1081  *    (1) passed in dblk.fname_sfx,
1082  *    (2) generated from dblk path, dbname, and '.sfx',
1083  *    (3) generated from dblk path, language, and '.sfx'.
1084  * Internal tables will be reused if file previously loaded.
1085  * Only uses single byte character sets (ascii, iso-latin-1).
1086  * Uses strtok().  dblk->charmap must already be loaded.
1087  * Will continue to parse entire file even if errors are found.
1088  * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
1089  */
1090 static int      load_paice_suffixes (DBLK *dblk, DBLK *dblist)
1091 {
1092     int         i;
1093     FILE        *fp;
1094     DBLK        *db;
1095     PRULE       *prule, **prule_link;
1096     PRULE       **rules_table;
1097     struct stat statbuf;
1098     UCHAR       *cptr, *token;
1099     char        readbuf [_POSIX_PATH_MAX + 1024];
1100     char        msgbuf [_POSIX_PATH_MAX + 1024];
1101     UCHAR       *suffix, *apndstr;
1102     int         must_be_intact, is_last_rule;
1103     UCHAR       remove_count;
1104     int         lineno, errcount;
1105     _Xstrtokparams      strtok_buf;
1106
1107     dblk->stem_extra = NULL;    /* just to be sure */
1108     rules_table = NULL;
1109
1110     if (debugging_loadlang)
1111         fprintf (aa_stderr,
1112             PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
1113             NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
1114             language_name(dblk->dbrec.or_language));
1115
1116     /* If file name not provided, generate one based on
1117      * dblk's path, database name, and default extension.
1118      * And if that doesn't work, generate one based on
1119      * dblk's path, language, and default extension.
1120      */
1121     if (dblk->fname_sfx == NULL) {
1122         if (dblk->path == NULL)
1123             dblk->path = strdup("");
1124         dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
1125              PROGNAME"1113", NULL);
1126
1127         strcpy (dblk->fname_sfx, dblk->path);
1128         ensure_end_slash (dblk->fname_sfx);
1129         strcat (dblk->fname_sfx, dblk->name);
1130         strcat (dblk->fname_sfx, EXT_SUFFIX);
1131         errno = 0;
1132         stat (dblk->fname_sfx, &statbuf);
1133         if (errno == ENOENT) {
1134             strcpy (dblk->fname_sfx, dblk->path);
1135             ensure_end_slash (dblk->fname_sfx);
1136             strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
1137             strcat (dblk->fname_sfx, EXT_SUFFIX);
1138         }
1139     }
1140     if (debugging_loadlang)
1141         fprintf (aa_stderr,
1142             PROGNAME"1740 Paice suffix file name = '%s'.\n",
1143             dblk->fname_sfx);
1144
1145     /* Dont reload the same file if it's already
1146      * been loaded into a previous dblk in a list,
1147      * but flag it so it won't be freed at unload_language/REINIT.
1148      * Code works just fine if dblist == NULL.
1149      */
1150     for (db = dblist;  db != NULL;  db = db->link) {
1151         if (db == dblk || db->fname_sfx == NULL)
1152             continue;
1153         if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
1154             dblk->stem_extra = db->stem_extra;
1155             dblk->lang_flags |= LF_DUP_SFX;
1156             if (debugging_loadlang)
1157                 fprintf (aa_stderr, PROGNAME"1145 "
1158                     "Using previously loaded suffixes, db='%s'.\n",
1159                     dblk->name);
1160             return TRUE;
1161         }
1162     }
1163
1164     fp = fopen (dblk->fname_sfx, "rt");
1165     if (fp == NULL) {
1166         sprintf (msgbuf,
1167             catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
1168             PROGNAME"181", dblk->fname_sfx, strerror(errno));
1169         DtSearchAddMessage (msgbuf);
1170         dblk->fname_sfx = NULL;
1171         return FALSE;
1172     }
1173
1174     /* Rules table will eventually be loaded at dblk.stem_extra.
1175      * It consists of 256 PRULE ptrs,
1176      * one for each possible single byte char.
1177      * Each ptr is the head of a rules list for that char.
1178      */
1179     rules_table = austext_malloc (256 * sizeof(PRULE*),
1180         PROGNAME"199", &ausapi_msglist);
1181     memset (rules_table, 0, 256 * sizeof(PRULE*));
1182     lineno =    0;
1183     errcount =  0;
1184
1185     /*------- Main Read Loop -------*/
1186     while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
1187         lineno++;
1188
1189         /* Ignore comment lines */
1190         if (strchr (COMMENT_CHARS, readbuf[0]))
1191             continue;
1192
1193         /* TOKEN #1: suffix string, backwards, all uppercase.
1194          * If missing, ignore 'empty' line.
1195          * If the first token is all numeric, ignore line
1196          * (for compatibility with older versions of file).
1197          */
1198         if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
1199             continue;
1200
1201         for (cptr = suffix;  cptr;  cptr++)
1202             if ((dblk->charmap[*cptr] & NUMERAL) == 0)
1203                 break;
1204         if (*cptr == '\0')
1205             continue;
1206
1207         /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
1208         if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
1209 BAD_RULE:
1210             sprintf (msgbuf,  catgets(dtsearch_catd, MS_lang, 51,
1211                 "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
1212                 PROGNAME"898", dblk->fname_sfx, lineno, suffix);
1213             DtSearchAddMessage (msgbuf);
1214             errcount++;
1215             continue;
1216         }
1217         must_be_intact = FALSE;
1218         if (token[0] == '*') {
1219             must_be_intact = TRUE;
1220             /* Read next token... */
1221             if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1222                 goto BAD_RULE;
1223         }
1224
1225         /* TOKEN #3: remove-count */
1226         remove_count = (UCHAR) atoi ((char *) token);
1227
1228         /* OPTIONAL TOKEN #4: if next token is NOT a continue
1229          * symbol ('>' or '$'), then it's an append string.
1230          */
1231         apndstr = NULL;
1232         if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1233             goto BAD_RULE;
1234         if (token[0] != '$'  &&  token[0] != '>') {
1235             apndstr = token;
1236             /* Read next token... */
1237             if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1238                 goto BAD_RULE;
1239         }
1240
1241         /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
1242         is_last_rule = (token[0] == '$');
1243
1244         if (debugging_loadword) {
1245             fprintf (aa_stderr,
1246                 "  SFX: intact?=%d stop?=%d remv=%d '%s'",
1247                 (int) must_be_intact,
1248                 (int) is_last_rule,
1249                 (int) remove_count,
1250                 suffix);
1251             if (apndstr)
1252                 fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
1253             else
1254                 fputc ('\n', aa_stderr);
1255         }
1256
1257         /* Good suffix.  If we haven't had any errors yet,
1258          * add it to rules list for the first char of the suffix.
1259          */
1260         if (errcount)
1261             continue;
1262         prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
1263         memset (prule, 0, sizeof(PRULE));
1264         prule->suffix =         (UCHAR *) strdup ((char*)suffix);
1265         prule->suflen =         strlen ((char*)suffix);
1266         prule->must_be_intact = must_be_intact;
1267         prule->remove_count =   remove_count;
1268         prule->is_last_rule =   is_last_rule;
1269         if (apndstr) {
1270             prule->apndstr =    (UCHAR *) strdup ((char*)apndstr);
1271             prule->aplen =      strlen ((char*)apndstr);
1272         }
1273
1274         prule_link = &rules_table[suffix[0]];
1275         while (*prule_link)
1276             prule_link = &(*prule_link)->link;
1277         *prule_link = prule;
1278
1279     } /* end Main Read Loop */
1280
1281     fclose (fp);
1282     if (errcount) {
1283         free_paice_rules (&rules_table);
1284         return FALSE;
1285     }
1286     dblk->stem_extra = rules_table;
1287
1288     /* Update last table entry */
1289     if (debugging_loadlang) {
1290         fprintf (aa_stderr,
1291             PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
1292             dblk->fname_sfx);
1293         fflush (aa_stderr);
1294     }
1295     return TRUE;
1296 }  /* load_paice_suffixes() */
1297
1298
1299 /************************************************/
1300 /*                                              */
1301 /*               is_matching_rule               */
1302 /*                                              */
1303 /************************************************/
1304 /* Subroutine of paice_stemmer().
1305  * Returns TRUE if passed rule can be applied to stem in paicebuf.
1306  * Else returns FALSE.
1307  */
1308 static int      is_matching_rule (PRULE *rule)
1309 {
1310     static UCHAR        *ptr;
1311     static int          i, j;
1312
1313     if (debugging_paice)
1314         fprintf (aa_stderr, "  test rule '%s':\t", rule->suffix);
1315
1316     /* Skip rule if we've made at least one previous change
1317      * but the current rule requires an intact word.
1318      */
1319     if (rule->must_be_intact  &&  !word_is_intact) {
1320         if (debugging_paice)
1321             fputs ("word not intact...\n", aa_stderr);
1322         return FALSE;
1323     }
1324
1325     /* Do a backward strcmp on the suffix.
1326      * Skip rule if it doesn't match current paicebuf's ending chars.
1327      */
1328     j = rule->suflen;
1329     ptr = paicebuf + paicelen - 1;
1330     for (i = 0; i < j; i++) {
1331         if (*((rule->suffix) + i) != *ptr) {
1332             if (debugging_paice)
1333                 fputs ("no match...\n", aa_stderr);
1334             return FALSE;
1335         }
1336         ptr--;
1337     }
1338
1339     if (debugging_paice)
1340         fputs ("match", aa_stderr);
1341
1342     /* Set i = paicebuf length after removing and appending suffixes.
1343      * Used to algorithmically test remaining stem length
1344      * after tentative application of rule.
1345      */
1346     i = paicelen - (rule->remove_count - rule->aplen);
1347
1348     if (i <= 1) {
1349         if (debugging_paice)
1350             fputs (", but stem too short...\n", aa_stderr);
1351         return FALSE;
1352     }
1353
1354     if (i == 2) {
1355         if (IS_VOWEL (paicebuf[0])) {
1356             if (debugging_paice)
1357                 fputs (", and short vowel stem valid.\n", aa_stderr);
1358             return TRUE;
1359         }
1360         else {
1361             if (debugging_paice)
1362                 fputs (", but consonant stem too short...\n", aa_stderr);
1363             return FALSE;
1364         }
1365     }
1366
1367     /* Remaining stem is at least 3 chars.
1368      * If it contains a vowel anywhere, it's valid.
1369      * (A 'Y' after the first char counts as a vowel).
1370      * Otherwise it's not.
1371      */
1372     for (j=0;  j<i;  j++) {
1373         if (IS_VOWEL (paicebuf[j])) {
1374 GOOD_STEM:
1375             if (debugging_paice)
1376                 fputs (", and remaining stem valid.\n", aa_stderr);
1377             return TRUE;
1378         }
1379         if (j > 0  &&  paicebuf[j] == 'Y')
1380             goto GOOD_STEM;
1381     }
1382
1383     if (debugging_paice)
1384         fputs (", but remaining stem all consonants.\n", aa_stderr);
1385     return FALSE;
1386 }  /* is_matching_rule() */
1387
1388
1389 /************************************************/
1390 /*                                              */
1391 /*                 paice_stemmer                */
1392 /*                                              */
1393 /************************************************/
1394 /* Given a word token (ALREADY UPPERCASE) in a single byte
1395  * language such as the output of teskey_parser,
1396  * generates 'stem' by repeated suffix removal.
1397  * Returns stem token in a static buffer valid
1398  * until next call to paice_stemmer or null_stemmer.
1399  * Returned stem might be the original unmodified word.
1400  * Returned stem might also be empty string.
1401  * Returned stem is *never* NULL, even if wordin == NULL.
1402  * Input buffer will not be modified; does not use strtok.
1403  * All variables are static for speeeeeeed.
1404  */
1405 static char     *paice_stemmer (char *wordin, DBLK *dblk)
1406 {
1407     UCHAR       finalc;
1408     PRULE       *rule, **rules_table;
1409
1410     if (wordin == NULL)
1411         return "";
1412     if (*wordin == 0)
1413         return "";
1414
1415     if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
1416         fprintf (aa_stderr, catgets (dtsearch_catd, MS_lang, 31,
1417             "%s Stemmer suffixes file never loaded.\n"),
1418             PROGNAME"310");
1419         DtSearchExit (2);
1420     }
1421
1422     /* The max length of a stem is bufsz - 2:
1423      * one for the terminating \0 and one for the
1424      * prefix ^O that identifies a stem.  (But this
1425      * stemmer doesn't actually insert the ^O now.)
1426      */
1427     strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD);
1428     paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0;
1429     paice_charmap =     dblk->charmap;
1430     word_is_intact =    TRUE;
1431
1432     for (;;) { /*-------- Main Stemming Loop ---------*/
1433
1434         paicelen = strlen ((char*)paicebuf);
1435         finalc = *(paicebuf + paicelen - 1);
1436         if (debugging_paice) {
1437             fprintf (aa_stderr,
1438                 "paice: '%s', rules list '%c' for database '%s'\n",
1439                 paicebuf, finalc, dblk->name);
1440             fflush (aa_stderr);
1441         }
1442
1443         /* Look for a matching rule */
1444         if ((rule = rules_table [finalc]) == NULL) {
1445             if (debugging_paice)
1446                 fputs ("  list is null, stop.\n", aa_stderr);
1447             break;
1448         }
1449         while (rule) {
1450             if (is_matching_rule (rule))
1451                 break;
1452             rule = rule->link;
1453         }
1454         if (rule == NULL) {
1455             if (debugging_paice)
1456                 fprintf (aa_stderr, "  rules list '%c' is exhausted, stop.\n",
1457                     finalc);
1458             break;
1459         }
1460
1461         /* Apply rule that matched */
1462         if (debugging_paice)
1463             fputs ("    apply rule: ", aa_stderr);
1464         if (rule->remove_count == 0) {
1465             if (debugging_paice)
1466                 fputs ("remove_count = 0, stop.\n", aa_stderr);
1467             break;
1468         }
1469
1470         paicebuf [paicelen - rule->remove_count] = 0;
1471         if (rule->aplen)
1472             strcat ((char*)paicebuf, (char*)rule->apndstr);
1473         paicelen = strlen ((char*)paicebuf);
1474         word_is_intact = FALSE;  /* we've removed at least 1 suffix */
1475         if (debugging_paice)
1476             fprintf (aa_stderr, "--> '%s'", paicebuf);
1477
1478         /* Terminate algorithm if rule says so.
1479          * Otherwise continue removing suffixes
1480          * from this partially stemmed word.
1481          */
1482         if (rule->is_last_rule) {
1483             if (debugging_paice)
1484                 fputs (", stop flag is set, stop.\n", aa_stderr);
1485             break;
1486         }
1487         if (debugging_paice)
1488             fputc ('\n', aa_stderr);
1489
1490     } /* end Main Stemming Loop */
1491
1492     if (debugging_paice) {
1493         fprintf (aa_stderr, "  final stem: '%s'\n", paicebuf);
1494         fflush (aa_stderr);
1495     }
1496     return (char *) paicebuf;
1497 } /* paice_stemmer() */
1498
1499
1500 /************************************************/
1501 /*                                              */
1502 /*                 null_stemmer                 */
1503 /*                                              */
1504 /************************************************/
1505 /* Stemmer that just copies and returns passed word.
1506  * In effect, the passed word IS its own stem.
1507  * Output buffer valid until next call to null_stemmer
1508  * or paice_stemmer.
1509  */
1510 char    *null_stemmer (char *word, DBLK *dblk)
1511 {
1512     if (word == NULL)
1513         return "";
1514     if (*word == '\0')
1515         return "";
1516     strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD);
1517     paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
1518     return (char *) paicebuf;
1519 } /* null_stemmer() */
1520
1521
1522 /************************************************/
1523 /*                                              */
1524 /*                 euro_lstrupr                 */
1525 /*                                              */
1526 /************************************************/
1527 /* Converts passed string to uppercase in place.
1528  * Classic strupr() function using teskey charmaps.
1529  */
1530 static char     *euro_lstrupr (char *string, DBLK *dblk)
1531 {
1532     static int          *charmap;
1533     static UCHAR        *s;
1534     charmap = dblk->charmap;
1535     for (s=(UCHAR *)string;  *s;  s++)
1536         *s = charmap[*s] & 0xff;
1537     return string;
1538 }
1539
1540
1541 /************************************************/
1542 /*                                              */
1543 /*                 null_lstrupr                 */
1544 /*                                              */
1545 /************************************************/
1546 /* Just returns passed string.  Used where uppercase
1547  * conversions are not required for a language.
1548  */
1549 char    *null_lstrupr (char *s, DBLK *d)
1550 { return s; }
1551
1552
1553 /************************************************/
1554 /*                                              */
1555 /*                load_language                 */
1556 /*                                              */
1557 /************************************************/
1558 /* Loads a dblk with a specific language's
1559  * structures and function pointers.
1560  * Does not reload structures previously loaded in
1561  * other dblks on dblist if derived from identical files.
1562  * But always loads structures if passed dblist is NULL.
1563  * Presumes dblk already partially initialized with mandatory fields:
1564  *      name, path, language.
1565  * May also be preinitialized with optional fields:
1566  *      minwordsz, maxwordsz.
1567  * Returns TRUE if all successful.
1568  * Otherwise returns FALSE with err msgs on ausapi_msglist.
1569  */
1570 int     load_language (DBLK *dblk, DBLK *dblist)
1571 {
1572     int         i;
1573     int         oops =  FALSE;
1574     char        msgbuf [512];
1575     int         language = dblk->dbrec.or_language;
1576
1577     if (debugging_loadlang)
1578         fprintf (aa_stderr,
1579             "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
1580             (int)dblk->dbrec.or_language,
1581             language_name (dblk->dbrec.or_language),
1582             NULLORSTR(dblk->name));
1583
1584     /*
1585      * Note: Load list functions must be called
1586      * AFTER charmap and lstrupr are loaded.
1587      */
1588     switch (language) {
1589         case DtSrLaENG:
1590         case DtSrLaENG2:
1591         case DtSrLaESP:
1592         case DtSrLaFRA:
1593         case DtSrLaITA:
1594         case DtSrLaDEU:
1595             dblk->charmap =     (language == DtSrLaENG)?
1596                                     ascii_charmap : latin_charmap;
1597             dblk->parser =      teskey_parser;
1598             dblk->stemmer =     paice_stemmer;
1599             dblk->lstrupr =     euro_lstrupr;
1600             if (dblk->dbrec.or_maxwordsz == 0)
1601                 dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
1602                     MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
1603             if (dblk->dbrec.or_minwordsz == 0)
1604                 dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
1605             oops = FALSE;
1606             if (!load_stop_list (dblk, dblist))
1607                 oops = TRUE;
1608             if (!load_include_list (dblk, dblist))
1609                 oops = TRUE;
1610             if (!load_paice_suffixes (dblk, dblist))
1611                 oops = TRUE;
1612             if (oops)
1613                 return FALSE;
1614             break;
1615
1616         case DtSrLaJPN:
1617         case DtSrLaJPN2:
1618             return load_jpn_language (dblk, dblist);
1619
1620         default:
1621             /* Try loading a custom 'user' language.
1622              * If he failed to provide a loader function,
1623              * the dummy custom loader will tell him so.
1624              * If he provided one but it can't load this language,
1625              * it should return it's own error msgs.
1626              */
1627             return load_custom_language (dblk, dblist);
1628
1629     } /* end switch (language) */
1630
1631     return TRUE;
1632 } /* load_language() */
1633
1634
1635 /************************************************/
1636 /*                                              */
1637 /*                unload_language               */
1638 /*                                              */
1639 /************************************************/
1640 /* Frees storage for structures allocated by load_language().
1641  * Called when engine REINITs due to change in site config file
1642  * or databases.
1643  * Duplicate wordtrees are not unloaded because they
1644  * will have already been unloaded in a previous dblk.
1645  */
1646 void    unload_language (DBLK *dblk)
1647 {
1648     switch (dblk->dbrec.or_language) {
1649         case DtSrLaENG:
1650         case DtSrLaENG2:
1651         case DtSrLaESP:
1652         case DtSrLaFRA:
1653         case DtSrLaITA:
1654         case DtSrLaDEU:
1655             dblk->charmap = NULL;
1656             if ((dblk->lang_flags & LF_DUP_STP) == 0)
1657                 free_wordtree (&dblk->stoplist);
1658             else {
1659                 dblk->stoplist = NULL;
1660                 dblk->lang_flags &= ~LF_DUP_STP;
1661             }
1662             if ((dblk->lang_flags & LF_DUP_INC) == 0)
1663                 free_wordtree (&dblk->inclist);
1664             else {
1665                 dblk->inclist = NULL;
1666                 dblk->lang_flags &= ~LF_DUP_INC;
1667             }
1668             if ((dblk->lang_flags & LF_DUP_SFX) == 0)
1669                 free_paice_rules ((PRULE***)&dblk->stem_extra);
1670             else {
1671                 dblk->stem_extra = NULL;
1672                 dblk->lang_flags &= ~LF_DUP_SFX;
1673             }
1674             break;
1675
1676         case DtSrLaJPN:
1677         case DtSrLaJPN2:
1678             unload_jpn_language (dblk);
1679             break;
1680
1681         default:
1682             unload_custom_language (dblk);
1683             break;
1684     }
1685     return;
1686 } /* unload_language() */
1687 /******************** LANG.C ********************/
1688