cde/lib/DtSearch/lang.c

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these libraries and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /*
  24  *   COMPONENT_NAME: austext
  25  *
  26  *   FUNCTIONS: euro_lstrupr
  27  *              free_wordtree
  28  *              is_concordable
  29  *              language_name
  30  *              load_include_list
  31  *              load_language
  32  *              load_paice_suffixes
  33  *              load_stop_list
  34  *              load_wordtree
  35  *              null_lstrupr
  36  *              null_stemmer
  37  *              paice_stemmer
  38  *              search_wordtree
  39  *              teskey_parser
  40  *              unload_language
  41  *
  42  *   ORIGINS: 27
  43  *
  44  *
  45  *   (C) COPYRIGHT International Business Machines Corp. 1995,1996
  46  *   All Rights Reserved
  47  *   Licensed Materials - Property of IBM
  48  *   US Government Users Restricted Rights - Use, duplication or
  49  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  50  */
  51 /******************** LANG.C ********************
  52  * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
  53  * July 1995.
  54  * Includes load_language(), unload_language(), and functions and data for
  55  * parsing and stemming European languages in DtSearch/AusText.
  56  * Incorporates p/o socrates.c, p/o proctext.c, parser.c
  57  * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
  58  * Related to similar semantic modules repackaged into semantic.c.
  59  * Paice suffix removal algorithm from C. Paice, 1990,
  60  * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
  61  *
  62  * $Log$
  63  * Revision 2.13  1996/03/25  18:55:26  miker
  64  * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  65  *
  66  * Revision 2.12  1996/03/25  17:00:19  miker
  67  * Cleanup compiler warning.
  68  *
  69  * Revision 2.11  1996/03/13  22:58:13  miker
  70  * Changed char to UCHAR several places.
  71  *
  72  * Revision 2.10  1996/03/05  16:49:58  miker
  73  * Move COMMENT_CHARS to SearchP.h.
  74  *
  75  * Revision 2.9  1996/03/05  16:31:20  miker
  76  * Added test of PA_MSGS for yacc-based boolean queries.
  77  * Made comment chars in linguistic files independent of locale.
  78  * Changed several char ptrs to unsigned char so parser will
  79  * work when compiled under default signed char compilers.
  80  * Simplified several statements with LHS *var++ for same reason.
  81  *
  82  * Revision 2.8  1996/02/05  16:16:05  miker
  83  * Restore prolog.
  84  *
  85  * Revision 2.7  1996/02/05  16:10:54  miker
  86  * load_paice_suffixes: discard .sfx lines beginning with all numeric
  87  * first token for compatibility with older file formats.
  88  *
  89  * Revision 2.6  1996/02/01  19:11:43  miker
  90  * AusText 2.1.11, DtSearch 0.3:  Major rewrite for new parsers.
  91  * Moved charmaps to new module langmap.c.  Removed hard coded
  92  * paice stemmer values--now dynamic from .sfx file.
  93  *
  94  * Revision 2.5  1995/10/26  14:55:28  miker
  95  * Added prolog.
  96  *
  97  * Revision 2.4  1995/10/19  20:54:36  miker
  98  * Increased msg buf sizes to accommodate larger database file names.
  99  *
 100  * Revision 2.3  1995/10/06  14:39:45  miker
 101  * Bug fix: coredump loading multiple databases
 102  * on Solaris.
 103  *
 104  * Revision 2.2  1995/10/03  21:39:10  miker
 105  * Changed teskey_parser, paice_stemmer, and null_stemmer
 106  * to return number of words parsed/stemmed, not just boolean.
 107  *
 108  * Revision 2.1  1995/09/22  21:00:19  miker
 109  * Freeze DtSearch 0.1, AusText 2.1.8
 110  *
 111  * Revision 1.3  1995/09/19  22:08:28  miker
 112  * Added support for loading and parsing Japanese language DtSrLaJPN.
 113  *
 114  * Revision 1.2  1995/09/05  21:34:52  miker
 115  * Fixed bug: search engine wouldn't parse words of exactly
 116  * 3 or 15 chars.
 117  *
 118  * Revision 1.1  1995/08/31  21:03:44  miker
 119  * Initial revision
 120  */
 121 #include "SearchP.h"
 122
 123 #include <limits.h>
 124 #include <stdlib.h>
 125 #include <string.h>
 126 #include <errno.h>
 127 #include <sys/stat.h>
 128
 129 #define X_INCLUDE_STRING_H
 130 #define XOS_USE_NO_LOCKING
 131 #include <X11/Xos_r.h>
 132
 133 #define PROGNAME        "LANG"
 134 #define EXT_SUFFIX      ".sfx"  /* standard paice suffix file format */
 135 #define OUTBUFSZ        6140
 136 #define SFX_DELIMS      " \t\n"
 137 #define MS_misc         1
 138 #define MS_lang         15
 139 #define IS_VOWEL(c)     ((paice_charmap [(UCHAR)c] & VOWEL) != 0)
 140
 141 /************************************************/
 142 /*                                              */
 143 /*                    PRULE                     */
 144 /*                                              */
 145 /************************************************/
 146 /* List of Paice suffix removal rules from .sfx files */
 147 typedef struct prule_t {
 148     struct prule_t   *link;     /* Ptr to next list node */
 149     UCHAR   *suffix;            /* Applicable suffix string, backwards */
 150     UCHAR   suflen;             /* Length of suffix */
 151     char    must_be_intact;     /* Optional '*'.  Rule only applies
 152                                  * to intact words */
 153     UCHAR   remove_count;       /* Number of suffix chars to remove */
 154     UCHAR   aplen;              /* Length of apndstr */
 155     UCHAR   *apndstr;           /* Optional append string */
 156     char    is_last_rule;       /* '$' terminate or '>' continue algorithm */
 157     }   PRULE;
 158
 159 char *ensure_end_slash (char *pathstr);
 160 void  unload_jpn_language (DBLK *dblk);
 161
 162 /************************************************/
 163 /*                                              */
 164 /*                    GLOBALS                   */
 165 /*                                              */
 166 /************************************************/
 167 int             debugging_loadlang =            FALSE;
 168 int             debugging_loadword =            FALSE;
 169 int             debugging_search_wordtree =     FALSE;
 170 int             debugging_teskey =              FALSE;
 171 int             debugging_paice =               FALSE;
 172 static int      *paice_charmap;
 173 static UCHAR    paicebuf [DtSrMAXWIDTH_HWORD + 2];
 174 static int      paicelen;
 175 static int      word_is_intact;
 176
 177 /* Language strings correspond to DtSrLa.. constants.  */
 178 static char     *lang_fnames[] = {
 179                         "eng",          /*  0 */
 180                         "eng",          /*  1  ('eng2' same files as 'eng') */
 181                         "esp",          /*  2 */
 182                         "fra",          /*  3 */
 183                         "ita",          /*  4 */
 184                         "deu",          /*  5 */
 185                         "jpn",          /*  6 */
 186                         "jpn",          /*  7  ('jpn2' same files as 'jpn' */
 187                         NULL
 188                 };
 189
 190
 191 /************************************************/
 192 /*                                              */
 193 /*                language_name                 */
 194 /*                                              */
 195 /************************************************/
 196 /* Returns language name string given language number */
 197 static char     *language_name (DtSrINT16 langno)
 198 {
 199     static char *language_names[] = {
 200                 "English-ASCII",        /*  0 = DtSrLaENG */
 201                 "English-Latin1",       /*  1 = DtSrLaENG2 */
 202                 "Spanish",              /*  2 = DtSrLaESP */
 203                 "French",               /*  3 = DtSrLaFRA */
 204                 "Italian",              /*  4 = DtSrLaITA */
 205                 "German",               /*  5 = DtSrLaDEU */
 206                 "Japanese-comp"         /*  6 = DtSrLaJPN */
 207                 "Japanese-.knj"         /*  7 = DtSrLaJPN2 */
 208                 };
 209
 210     if (langno < 0)
 211         return "INVALID!";
 212     else if (langno > DtSrLaLAST)
 213         return "(Custom Language)";
 214     else
 215         return language_names [langno];
 216 } /* language_name() */
 217
 218
 219 /************************************************/
 220 /*                                              */
 221 /*               search_wordtree                */
 222 /*                                              */
 223 /************************************************/
 224 /* Sept 1991.
 225  * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
 226  * Searches a word list in a binary WORDTREE.
 227  * Passed wordstring is presumed to be a clean,
 228  * uppercase word token string terminated by \0.
 229  * Variables are static for speeeeed.
 230  * Returns TRUE if successful search, else FALSE.
 231  * See also search_wordtree_jpn() in jpn.c
 232  */
 233 static int      search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
 234 {
 235     static int          direction;
 236     static WORDTREE     *node;
 237
 238     if (debugging_search_wordtree)
 239         fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
 240             wordstring);
 241     /* MAIN SEARCH LOOP: binary tree search */
 242     for (node = wordtree;  node != NULL;  ) {
 243         if ((direction = strcmp ((char *) wordstring, node->word)) == 0) {
 244             if (debugging_search_wordtree)
 245                 fprintf (aa_stderr, "  HIT!\n");
 246             return TRUE;
 247         }
 248         /* Descend left or right depending on word */
 249         if (debugging_search_wordtree)
 250             fprintf (aa_stderr, "  %c '%s'\n",
 251                 (direction < 0) ? 'L' : 'R', (char *) node->word);
 252         if (direction < 0)
 253             node = node->llink;
 254         else
 255             node = node->rlink;
 256     }
 257     if (debugging_search_wordtree)
 258         fprintf (aa_stderr, "  MISS.\n");
 259     return FALSE;
 260 }  /* search_wordtree() */
 261
 262
 263 /************************************************/
 264 /*                                              */
 265 /*                 teskey_parser                */
 266 /*                                              */
 267 /************************************************/
 268 /* 1989.
 269  * Teskey_parser() is derived from the former Socrates() in socrates.c.
 270  * Returns next teskey-parsed word token from a character stream.
 271  * Called from (1) dtsrindex, where readchar_ftext() cofunction
 272  * reads the .fzk file document 'stream', or (2) search engine
 273  * query parsers, where readchar_string() cofunction 'reads'
 274  * from the query string.
 275  * (The word hiliting parser does not directly call teskey_parser; it has
 276  * its own simplified equivalent to the parsing algorithms herein.)
 277  *
 278  * First call passes args in PARG structure.  This resets end of
 279  * text block (ETX) flag, resets 'offset' counter to zero, etc.
 280  * Subsequent calls should pass NULL, and parser returns
 281  * next token in block, until reader cofunction reads ETX,
 282  * ie special ETX char ('\0').  Subsequent calls to parser
 283  * return NULL meaning "no tokens left in current stream".
 284  * Reader cofunctions tolerate repeated calls after
 285  * the first ETX, still returning '\0'.
 286  *
 287  * This parser presumes all incoming text is unformatted.
 288  * Since parser accesses streams a char at a time it does
 289  * not require periodic line feeds or anything else.
 290  *
 291  * Parser also returns offset information: number of bytes
 292  * since beginning of text block.
 293  *
 294  * Variables are static for speeeeeeed.
 295  *
 296  * OUTPUT FORMAT:  NULL or a static C string containing a single
 297  * parsed word token.  Word buffer reused at next call.
 298  * Each word is translated as follows:
 299  *      All alphas TO UPPERCASE.
 300  *      Teskey algorithm used to find word boundaries.
 301  *      Always keeps include-list words.
 302  *      Throws away stoplist words, very short words, and very long words.
 303  *      All intervening nonconcordables discarded.
 304  *
 305  * There is a slight mod to the published Teskey algorithm.
 306  * Words can begin with optionally concordable chars
 307  * but not end with them.  For example if '-' is optionally
 308  * concordable, '-foo-' will be parsed into '-foo'.
 309  */
 310 char    *teskey_parser (PARG *parg)
 311 {
 312     static READCFP      cofunction;
 313     static void         *cofunction_arg;
 314     static DBLK         *dblk =         NULL;
 315     static UCHAR        *outbuf =       NULL;
 316     static size_t       outbufsz =      0;
 317     static UCHAR        *endmaxword;    /* end largest possible output word */
 318     static UCHAR        *outp;          /* next loc in outbuf */
 319     static int          *charmap;
 320     static int          minwordsz, maxwordsz;
 321     static int          wordlen;
 322     static enum {BETW_WORDS, IN_WORD, TOO_LONG}
 323                         tpstate;
 324     static long         *offsetp, readcount, candidate_offset;
 325     static int          is_hiliting;
 326     static int          add_msgs;
 327
 328     /* If first call for current text block... */
 329     if (parg) {
 330         dblk = parg->dblk;
 331         minwordsz = dblk->dbrec.or_minwordsz;
 332         maxwordsz = dblk->dbrec.or_maxwordsz;
 333         charmap = dblk->charmap;
 334         offsetp = parg->offsetp;
 335         is_hiliting = (parg->flags & PA_HILITING);
 336         add_msgs = (parg->flags & PA_MSGS);
 337         if (charmap == NULL) {
 338             fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 4,
 339                 "%s dblk not initialized.\n"),
 340                 PROGNAME"801");
 341             DtSearchExit (55);
 342         }
 343
 344         if (parg->string) {
 345             cofunction_arg = parg->string;
 346             cofunction = (READCFP) readchar_string;
 347         }
 348         else if (parg->ftext) {
 349             cofunction_arg = parg;
 350             cofunction = (READCFP) readchar_ftext;
 351         }
 352         else {
 353             fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 5,
 354                 "%s Program Error: parg contains neither file nor string.\n"),
 355                 PROGNAME"327");
 356             DtSearchExit (27);
 357         }
 358
 359         if (outbufsz <= maxwordsz) {
 360             if (outbuf)
 361                 free (outbuf);
 362             outbufsz = maxwordsz + 8;
 363             outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
 364         }
 365         endmaxword = outbuf + maxwordsz;
 366         if (debugging_teskey)
 367             fprintf (aa_stderr,
 368                 "teskey: start of text block, maxwsz=%d outbufsz=%lu\n",
 369                 maxwordsz, (unsigned long) outbufsz);
 370         readcount = 0L;
 371     }
 372
 373     /* CANDIDATE WORD LOOP:  Read text chars into outbuf.
 374      * Exit loop when outbuf contains one candidate token or at ETX.
 375      */
 376 READ_ANOTHER_WORD:
 377     outp = outbuf;
 378     tpstate = BETW_WORDS;
 379     while ((*outp = cofunction (cofunction_arg))) {
 380         readcount++;
 381         cofunction_arg = NULL;
 382
 383         /*------------- BETW_WORDS State ------------
 384          * Reader is between word tokens.
 385          */
 386         if (tpstate == BETW_WORDS) {
 387             /*
 388              * Discard nonconcordable chars between words.
 389              */
 390             if ((charmap[*outp] & NON_CONCORD) != 0)
 391                 continue;
 392             /*
 393              * Fully concordable char is definite start of new word.
 394              * Convert to uppercase and go get next char.
 395              */
 396             if ((charmap[*outp] & CONCORDABLE) != 0) {
 397                 *outp = charmap[*outp] & 0x00ff;
 398                 outp++;
 399                 candidate_offset = readcount;
 400                 tpstate = IN_WORD;
 401                 continue;
 402             }
 403             /*
 404              * Must be optionally concordable.  It can only
 405              * start a new word if next char is concordable.
 406              * If so, convert a fully concordable char
 407              * to uppercase and go get next char.
 408              * Otherwise discard just like non_concord.
 409              */
 410             outp++;
 411             if ((*outp = cofunction(NULL)))
 412                 readcount++;
 413             if ((charmap[*outp] & CONCORDABLE) != 0) {
 414                 *outp = charmap[*outp] & 0x00ff;
 415                 outp++;
 416                 candidate_offset = readcount - 1;
 417                 tpstate = IN_WORD;
 418                 continue;
 419             }
 420             else {
 421                 outp--;
 422                 continue;
 423             }
 424         } /* endif BETW_WORDS */
 425
 426
 427         /*------------- IN_WORD State ------------
 428          * Reader is in middle of a word.
 429          * Convert all concordables to uppercase and append.
 430          * Terminate word at first non_concord.
 431          * Non_concords treatment depends on next char.
 432          */
 433         else if (tpstate == IN_WORD) {
 434             if ((charmap[*outp] & CONCORDABLE) != 0) {
 435                 if (outp < endmaxword) {
 436                     *outp = charmap[*outp] & 0x00ff;
 437                     outp++;
 438                 }
 439                 else {
 440                     tpstate = TOO_LONG;
 441                     if (debugging_teskey)
 442                         fprintf (aa_stderr,
 443                                 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
 444                                 candidate_offset-1, outbuf);
 445                     if (add_msgs) {
 446                         char    msgbuf [DtSrMAXWIDTH_HWORD + 100];
 447                         sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 8,
 448                             "%s '%.*s...' is larger\n"
 449                             "than the maximum word size of database '%s'.") ,
 450                             PROGNAME"449", maxwordsz,
 451                             parg->string, dblk->label);
 452                         DtSearchAddMessage (msgbuf);
 453                         return NULL;
 454                     }
 455                     outbuf[0] = 0;
 456                     outp = outbuf;
 457                 }
 458                 continue;
 459             }
 460             if ((charmap[*outp] & NON_CONCORD) != 0) {
 461                 *outp = '\0';
 462                 break;
 463             }
 464             /* Must be opt_concord... */
 465             outp++;
 466             if ((*outp = cofunction(NULL)))
 467                 readcount++;
 468             if ((charmap[*outp] & CONCORDABLE) != 0) {
 469                 if (outp < endmaxword) {
 470                     *outp = charmap[*outp] & 0x00ff;    /* uppercase */
 471                     outp++;
 472                 }
 473                 else {
 474                     tpstate = TOO_LONG;
 475                     if (debugging_teskey)
 476                         fprintf (aa_stderr,
 477                                 "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
 478                                 candidate_offset-1, outbuf);
 479                     outbuf[0] = 0;
 480                     outp = outbuf;
 481                 }
 482                 continue;
 483             }
 484             else {      /* next char NOT concordable...*/
 485                 *(--outp) = '\0';
 486                 break;
 487             }
 488         } /* endif IN_WORD */
 489
 490
 491         /*------------- TOO_LONG State ------------
 492          * Reader is in middle of a word that exceeds max word size.
 493          * Discard all concordables and opt_concords until we
 494          * can get between words again with a clear non_concord.
 495          */
 496         else if (tpstate == TOO_LONG) {
 497             if ((charmap[*outp] & NON_CONCORD) != 0) {
 498                 outp = outbuf;
 499                 tpstate = BETW_WORDS;
 500             }
 501             continue;
 502         }
 503
 504         /*------------- UNKNOWN State ------------*/
 505         else {
 506             fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 10,
 507                 "%s Program Error: Unknown parser state.\n"),
 508                 PROGNAME"306");
 509             DtSearchExit (26);
 510         }
 511     } /* end read loop for next CANDIDATE WORD */
 512
 513     /*---------- TEST FOR ETX -------------*/
 514     if (outbuf[0] == 0) {
 515         if (debugging_teskey)
 516             fprintf (aa_stderr, "teskey: etx\n");
 517         if (add_msgs) {
 518             char        msgbuf [200];
 519             sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 12,
 520                 "%s '%.120s' is not a valid word in database '%s'.") ,
 521                 PROGNAME"506", parg->string, dblk->label);
 522             DtSearchAddMessage (msgbuf);
 523         }
 524         return NULL;
 525     }
 526
 527     wordlen = strlen ((char *) outbuf);
 528     candidate_offset--; /* token offset is one less than number of reads */
 529     if (debugging_teskey)
 530         fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
 531             candidate_offset, outbuf);
 532
 533     if (is_hiliting) {
 534         if (debugging_teskey)
 535             fprintf (aa_stderr, ", (hiliting, skip tree searches)");
 536         goto GOOD_WORD;
 537     }
 538
 539     /*--------- INCLUDE LIST ----------
 540      * Search before testing for stoplist or minimum word length.
 541      */
 542     if (dblk->inclist != NULL) {
 543         if (search_wordtree (dblk->inclist, outbuf)) {
 544             if (debugging_teskey)
 545                 fprintf (aa_stderr, ", (INCLUDE LIST)");
 546             goto GOOD_WORD;
 547         }
 548     }
 549
 550     /*--------- TOO SHORT -----------*/
 551     if (wordlen < minwordsz) {
 552         if (debugging_teskey)
 553             fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
 554         if (add_msgs) {
 555             char        msgbuf [200];
 556             sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 17,
 557                 "%s '%s' is less than the\n"
 558                 "minimum word size of database '%s'.") ,
 559                 PROGNAME"543", parg->string, dblk->label);
 560             DtSearchAddMessage (msgbuf);
 561             return NULL;
 562         }
 563         goto READ_ANOTHER_WORD;
 564     }
 565
 566     /*----------- STOP LIST -------------*/
 567     if (dblk->stoplist != NULL) {
 568         if (search_wordtree (dblk->stoplist, outbuf)) {
 569             if (debugging_teskey)
 570                 fprintf (aa_stderr, ", (STOP LIST)\n");
 571             if (add_msgs) {
 572                 char    msgbuf [200];
 573                 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 19,
 574                     "%s The word '%s' is not indexed in database '%s'.") ,
 575                     PROGNAME"558", parg->string, dblk->label);
 576                 DtSearchAddMessage (msgbuf);
 577                 return NULL;
 578             }
 579             goto READ_ANOTHER_WORD;
 580         }
 581     }
 582
 583 GOOD_WORD:
 584     /* Word is correctly parsed and passes all dblk filters. */
 585     if (debugging_teskey)
 586         fprintf (aa_stderr, ", ...good word\n");
 587     if (offsetp)
 588         *offsetp = candidate_offset;
 589     return (char *) outbuf;
 590 } /* teskey_parser() */
 591
 592
 593 /************************************************/
 594 /*                                              */
 595 /*                 is_concordable               */
 596 /*                                              */
 597 /************************************************/
 598 /* Verifies passed word token is teskey-concordable
 599  * in code page of passed charmap.  Used in validating
 600  * word files.  Returns TRUE if all chars concordable
 601  * or optionally concordable, else returns FALSE.
 602  */
 603 int     is_concordable (char *word, int *charmap)
 604 {
 605     UCHAR       *cptr;
 606     for (cptr = (UCHAR *)word;  *cptr != 0;  cptr++)
 607         if ((charmap[*cptr] & NON_CONCORD) != 0)
 608             break;
 609     return (*cptr == 0);
 610 } /* is_concordable() */
 611
 612
 613 /************************************************/
 614 /*                                              */
 615 /*                 load_wordtree                */
 616 /*                                              */
 617 /************************************************/
 618 /* Called by load_stop_list(), load_include_list(), etc,
 619  * to read an appropriate word list file into binary tree structures.
 620  *
 621  * INPUT FILE FORMAT:  One word per line, all chars teskey concordable.
 622  * Preferred order is frequency of occurrence in the corpus
 623  * to make searches efficient.  Otherwise the words should at least
 624  * be in random order or an order that will approximate a binary search.
 625  * If first char is any of COMMENT_CHARS, line is ignored as comments.
 626  * Ascii spaces, tabs, or newline delimits the first word token--
 627  * anything else on the line is ignored as comments.
 628  * Optionally characters in word token will be checked for teskey
 629  * concordability.
 630  *
 631  * RETURNS 0 if file successfully loaded, returns 1 if file missing,
 632  * returns 2 and messages in global msglist if file has fatal errors.
 633  */
 634 int     load_wordtree (
 635                     WORDTREE    **treetop,
 636                     DBLK        *dblk,
 637                     char        *fname,
 638                     int         do_teskey_test)
 639 {
 640     int         i;
 641     int         errcount;
 642     int         is_duplicate;
 643     long        linecount = 0;
 644     char        *token;
 645     char        readbuf [256];
 646     char        sprintbuf [_POSIX_PATH_MAX + 1024];
 647     FILE        *fileid;
 648     WORDTREE    *new;
 649     WORDTREE    **this_link;
 650     _Xstrtokparams      strtok_buf;
 651
 652     if (debugging_loadlang)
 653         fprintf (aa_stderr, PROGNAME"1071 "
 654             "load_wordtree: db=%s fname='%s'\n",
 655             NULLORSTR(dblk->name), NULLORSTR(fname));
 656
 657     if ((fileid = fopen (fname, "rt")) == NULL) {
 658         /* Not being able to find the file is not an error.
 659          * We indicate that with the return code.
 660          * But any other error (like permissions) is fatal.
 661          */
 662         if (errno == ENOENT) {
 663             if (debugging_loadlang)
 664                 fputs ("  ...file not found.\n", aa_stderr);
 665             return 1;
 666         }
 667         else {
 668             sprintf (sprintbuf,
 669                 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
 670                 PROGNAME"362", fname, strerror(errno));
 671             DtSearchAddMessage (sprintbuf);
 672             return 2;
 673         }
 674     }
 675
 676     /*--------- Main Read Loop ----------*/
 677     errcount = 0;
 678     while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
 679         linecount++;
 680         /*
 681          * Ignore comment lines beginning with punctuation char.
 682          * Ignore empty lines (strtok returns NULL, no tokens).
 683          * Otherwise first or only word on line is the desired word.
 684          */
 685         if (strchr (COMMENT_CHARS, readbuf[0]))
 686             continue;
 687         if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
 688             continue;
 689         dblk->lstrupr (token, dblk);
 690
 691         if (debugging_loadword)
 692             fprintf (aa_stderr, "  WORD: '%s'  ", token);
 693
 694         /* If requested confirm all chars are teskey-concordable. */
 695         if (do_teskey_test)
 696             if (!is_concordable (token, dblk->charmap)) {
 697                 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 400,
 698                     "%s: %s, line %ld: Invalid chars in word '%s'."),
 699                     PROGNAME"400", fname, linecount, token);
 700                 DtSearchAddMessage (sprintbuf);
 701                 errcount++;
 702                 continue;
 703             }
 704
 705         /* Unless we've already detected some errors,
 706          * allocate a new node and load its data fields.
 707          */
 708         if (errcount)
 709             continue;
 710         i = strlen (token);
 711         new = austext_malloc (sizeof(WORDTREE) + i + 4,
 712             PROGNAME"104", NULL);
 713         new->llink = NULL;
 714         new->rlink = NULL;
 715         new->len = i;
 716         new->word = (void *) (new + 1);
 717         strcpy (new->word, token);
 718
 719         /* Descend binary tree and insert in correct alphabetical place */
 720         is_duplicate = FALSE;
 721         for (this_link = treetop;  *this_link != NULL;  ) {
 722             i = strcmp (new->word, (*this_link)->word);
 723
 724             /* test for duplicate word */
 725             if (i == 0) {
 726                 sprintf (sprintbuf, catgets (dtsearch_catd, MS_misc, 423,
 727                     "%s Word '%s' in '%s' is a duplicate."),
 728                     PROGNAME"423", token, fname);
 729                 DtSearchAddMessage (sprintbuf);
 730                 /* duplicates aren't fatal, just ignore the word */
 731                 is_duplicate = TRUE;
 732                 break;  /* no point in continuing descent */
 733             }
 734
 735             /* Descend tree to find correct insertion point */
 736             if (debugging_loadword)
 737                 fputc(((i < 0)? 'L' : 'R'), aa_stderr);
 738             this_link = (WORDTREE **) ((i < 0) ?
 739                 &(*this_link)->llink : &(*this_link)->rlink);
 740         } /* end forloop to find tree insertion point */
 741
 742         /* Don't link anything if error found while descending tree */
 743         if (is_duplicate) {
 744             if (debugging_loadword)
 745                 fputs (" duplicate!\n", aa_stderr);
 746             free (new);
 747             continue;
 748         }
 749
 750         /* Insert new node at current location in tree */
 751         *this_link = new;
 752         if (debugging_loadword)
 753             fputs(" .\n", aa_stderr);
 754     }   /* end of read loop */
 755
 756     fclose (fileid);
 757
 758     if (errcount) {
 759         if (debugging_loadlang)
 760             fprintf (aa_stderr,
 761                 PROGNAME"1186 load word file '%s' failed.\n", fname);
 762         return 2;
 763     }
 764     else {
 765         if (debugging_loadlang)
 766             fprintf (aa_stderr,
 767                 PROGNAME"1193 load word file '%s' successful.\n", fname);
 768         return 0;
 769     }
 770 }  /* load_wordtree() */
 771
 772
 773 /************************************************/
 774 /*                                              */
 775 /*                free_wordtree                 */
 776 /*                                              */
 777 /************************************************/
 778 /* Formerly free_bintree() in msgutil.c.
 779  * Frees storage for all nodes in a WORDTREE and
 780  * sets its top-of-list pointer to NULL.
 781  * Works only for node structures where all memory
 782  * was allocated in a single call to malloc().
 783  * Uses link inversion traversal (eg, Data Structure Techniques,
 784  * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
 785  * at preorder visit, and node is freed at postorder visit.
 786  */
static void     free_wordtree (WORDTREE ** wordtree_head)
{
    /* Traversal state: 'pres' is the node being visited, 'prev' is its
     * parent (reached through the temporarily inverted link), and 'next'
     * is scratch for the node we are about to move to.
     */
    WORDTREE        *next;
    WORDTREE        *prev = NULL;
    WORDTREE        *pres = *wordtree_head;

    /* Empty tree: nothing to free, head is already NULL */
    if (*wordtree_head == NULL)
        return;

    /* The 'word' member of each node is overwritten and reused as the
     * traversal TAG (0 = currently below this node's left link,
     * 1 = currently below its right link).  The word text itself is
     * no longer needed because every node is about to be freed.
     */
DESCEND_LEFT:
    pres->word = (void *) 0;    /* preorder visit:  TAG = 0 */
    next = pres->llink;
    if (next != NULL) {
        /* Invert the left link so it points back up at the parent,
         * then step down into the left child.
         */
        pres->llink = prev;
        prev = pres;
        pres = next;
        goto DESCEND_LEFT;
    }
DESCEND_RIGHT:
     next = pres->rlink;
    if (next != NULL) {
        /* Invert the right link, mark that we went right,
         * and restart the left descent from the right child.
         */
        pres->word = (void *) 1;        /* TAG = 1 */
        pres->rlink = prev;
        prev = pres;
        pres = next;
        goto DESCEND_LEFT;
    }
POSTORDER_VISIT:
    /* Both subtrees of 'pres' are finished (or absent): release it.
     * 'pres' is not dereferenced again after this point.
     */
    free (pres);
    if (prev == NULL) { /* end of algorithm? */
        *wordtree_head = NULL;
        return;
    }
    if (prev->word == (void *) 0) {     /* go up left leg */
        /* Parent's inverted left link holds the grandparent.
         * Climb to the parent and continue with its right subtree.
         */
        next = prev->llink;
        pres = prev;
        prev = next;
        goto DESCEND_RIGHT;
    }
    else {      /* go up right leg */
        /* Parent's inverted right link holds the grandparent.
         * Both of the parent's subtrees are done, so climb to it
         * and free it on the next postorder visit.
         */
        next = prev->rlink;
        prev->word = (void *) 0;        /* restore TAG = 0 */
        pres = prev;
        prev = next;
        goto POSTORDER_VISIT;
    }
}  /* free_wordtree() */
 834
 835
 836 /************************************************/
 837 /*                                              */
 838 /*              load_include_list               */
 839 /*                                              */
 840 /************************************************/
 841 /* Builds include list by reading include file
 842  * into a binary tree structure.
 843  * Unlike stoplists, include-lists are optional.
 844  * Also unlike stoplists, there are no language default include-lists.
 845  * 'dblist' may be NULL.
 846  * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 847  */
 848 static int      load_include_list (DBLK *dblk, DBLK *dblist)
 849 {
 850     int         i;
 851     int         filename_was_null = (dblk->fname_inc == NULL);
 852     DBLK        *db;
 853     char        sprintbuf [512];
 854
 855     dblk->inclist = NULL;       /* just to be sure */
 856
 857     if (debugging_loadlang)
 858         fprintf (aa_stderr,
 859             PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
 860             NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
 861             language_name(dblk->dbrec.or_language));
 862
 863     /* If file name not provided, generate one based on
 864      * dblk's path, database name, and default extension.
 865      */
 866     if (filename_was_null) {
 867         if (dblk->name[0] == 0) {
 868             dblk->fname_inc = "";
 869             dblk->inclist = NULL;
 870             if (debugging_loadlang)
 871                 fprintf (aa_stderr, PROGNAME"1339 "
 872                     "No inclist because neither fname nor dbname provided.\n");
 873             return TRUE;
 874         }
 875         if (dblk->path == NULL)
 876             dblk->path = strdup("");
 877         dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
 878              PROGNAME"1187", NULL);
 879         strcpy (dblk->fname_inc, dblk->path);
 880         ensure_end_slash (dblk->fname_inc);
 881         strcat (dblk->fname_inc, dblk->name);
 882         strcat (dblk->fname_inc, EXT_INCLIST);
 883     }
 884     if (debugging_loadlang)
 885         fprintf (aa_stderr,
 886             PROGNAME"1350 Include list file name = '%s'.\n",
 887             dblk->fname_inc);
 888
 889     /* Don't reload the same file if it's already
 890      * been loaded into a previous dblk in a list.
 891      * Code works just fine if dblist == NULL.
 892      */
 893     for (db = dblist;  db != NULL;  db = db->link) {
 894         if (db == dblk || db->fname_inc == NULL)
 895             continue;
 896         if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
 897             dblk->inclist = db->inclist;
 898             dblk->lang_flags |= LF_DUP_INC;
 899             if (debugging_loadlang)
 900                 fprintf (aa_stderr, PROGNAME"1363 "
 901                     "Using previously loaded inclist, db='%s'.\n",
 902                     dblk->name);
 903             return TRUE;
 904         }
 905     }
 906
 907     /* Include list is optional so missing file is
 908      * not an error unless caller named a specific file.
 909      */
 910     i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
 911     switch (i) {
 912         case 0:
 913             return TRUE;
 914
 915         case 1:
 916             if (filename_was_null) {
 917                 dblk->fname_inc = "";
 918                 dblk->inclist = NULL;
 919                 return TRUE;
 920             }
 921             else {
 922                 sprintf (sprintbuf,
 923                     catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
 924                     PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
 925                 DtSearchAddMessage (sprintbuf);
 926                 return FALSE;
 927             }
 928
 929         default:
 930             return FALSE;
 931     }
 932 } /* load_include_list() */
 933
 934
 935 /************************************************/
 936 /*                                              */
 937 /*               load_stop_list                 */
 938 /*                                              */
 939 /************************************************/
 940 /* Builds stoplist by reading stoplist file into a
 941  * binary tree structure.  File name can be
 942  *    (1) passed in dblk.fname_stp,
 943  *    (2) generated from dblk path, name, and '.stp',
 944  *    (3) default for dblk path, language, and '.stp'.
 945  * 'dblist' may be NULL.
 946  * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 947  */
 948 static int      load_stop_list (DBLK *dblk, DBLK *dblist)
 949 {
 950     int         i;
 951     DBLK        *db;
 952     char        sprintbuf [_POSIX_PATH_MAX + 512];
 953     struct stat statbuf;
 954
 955     dblk->stoplist = NULL;      /* just to be sure */
 956
 957     if (debugging_loadlang)
 958         fprintf (aa_stderr,
 959             PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
 960             NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
 961             language_name(dblk->dbrec.or_language));
 962
 963     /* If file name not provided, generate one based on
 964      * dblk's path, database name, and default extension.
 965      * And if that doesn't work, generate one based on
 966      * dblk's path, language, and default extension.
 967      */
 968     if (dblk->fname_stp == NULL) {
 969         if (dblk->path == NULL)
 970             dblk->path = strdup("");
 971         dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
 972              PROGNAME"919", NULL);
 973
 974         strcpy (dblk->fname_stp, dblk->path);
 975         ensure_end_slash (dblk->fname_stp);
 976         strcat (dblk->fname_stp, dblk->name);
 977         strcat (dblk->fname_stp, EXT_STOPLIST);
 978         errno = 0;
 979         stat (dblk->fname_stp, &statbuf);
 980         if (errno == ENOENT) {
 981             strcpy (dblk->fname_stp, dblk->path);
 982             ensure_end_slash (dblk->fname_stp);
 983             strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
 984             strcat (dblk->fname_stp, EXT_STOPLIST);
 985         }
 986     }
 987     if (debugging_loadlang)
 988         fprintf (aa_stderr,
 989             PROGNAME"1448 Stoplist file name = '%s'.\n",
 990             dblk->fname_stp);
 991
 992     /* Don't reload the same file if it's already
 993      * been loaded into a previous dblk in a list.
 994      * Code works just fine if dblist == NULL.
 995      */
 996     for (db = dblist;  db != NULL;  db = db->link) {
 997         if (db == dblk || db->fname_stp == NULL)
 998             continue;
 999         if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
1000             dblk->stoplist = db->stoplist;
1001             dblk->lang_flags |= LF_DUP_STP;
1002             if (debugging_loadlang)
1003                 fprintf (aa_stderr, PROGNAME"1460 "
1004                     "Using previously loaded stoplist, db='%s'.\n",
1005                     dblk->name);
1006             return TRUE;
1007         }
1008     }
1009
1010     /* Stop lists are mandatory--a missing stoplist is fatal. */
1011     i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
1012     if (i == 1) {
1013         sprintf (sprintbuf,
1014             catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
1015             PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
1016         DtSearchAddMessage (sprintbuf);
1017     }
1018     return (i == 0);
1019 } /* load_stop_list() */
1020
1021
1022 /************************************************/
1023 /*                                              */
1024 /*              free_paice_rules                */
1025 /*                                              */
1026 /************************************************/
1027 /* Frees all allocated storage for a set of paice rules, typically
1028  * loaded at dblk.stem_extra.  Called by REINIT routines and
1029  * by load_paice_suffixes() when cleaning up after an error.
1030  */
1031 static void     free_paice_rules (PRULE ***rules_table_ptr)
1032 {
1033     int         i;
1034     PRULE       *p, **linkp;
1035     PRULE       **rules_table;
1036
1037     if (*rules_table_ptr == NULL)
1038         return;
1039     rules_table = *rules_table_ptr;
1040     for (i=0; i<256; i++) {
1041         if (rules_table[i] == NULL)
1042             continue;
1043         p = rules_table[i];
1044         while (p) {
1045             linkp = &p->link;
1046             free (p->suffix);
1047             if (p->apndstr)
1048                 free (p->apndstr);
1049             free (p);
1050             p = *linkp;
1051         }
1052     }
1053     free (rules_table);
1054     *rules_table_ptr = NULL;
1055     return;
1056 } /* free_paice_rules() */
1057
1058
1059 /************************************************/
1060 /*                                              */
1061 /*              load_paice_suffixes             */
1062 /*                                              */
1063 /************************************************/
1064 /* Loads European language paice stemmer suffix rules
1065  * into dblk.stem_extra as an array of ptrs to linked lists.
1066  * Like stop lists, sfx files can be
1067  *    (1) passed in dblk.fname_sfx,
1068  *    (2) generated from dblk path, dbname, and '.sfx',
1069  *    (3) generated from dblk path, language, and '.sfx'.
1070  * Internal tables will be reused if file previously loaded.
1071  * Only uses single byte character sets (ascii, iso-latin-1).
1072  * Uses strtok().  dblk->charmap must already be loaded.
1073  * Will continue to parse entire file even if errors are found.
1074  * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
1075  */
1076 static int      load_paice_suffixes (DBLK *dblk, DBLK *dblist)
1077 {
1078     FILE        *fp;
1079     DBLK        *db;
1080     PRULE       *prule, **prule_link;
1081     PRULE       **rules_table;
1082     struct stat statbuf;
1083     UCHAR       *cptr, *token;
1084     char        readbuf [_POSIX_PATH_MAX + 1024];
1085     char        msgbuf [_POSIX_PATH_MAX + 1024];
1086     UCHAR       *suffix, *apndstr;
1087     int         must_be_intact, is_last_rule;
1088     UCHAR       remove_count;
1089     int         lineno, errcount;
1090     _Xstrtokparams      strtok_buf;
1091
1092     dblk->stem_extra = NULL;    /* just to be sure */
1093     rules_table = NULL;
1094
1095     if (debugging_loadlang)
1096         fprintf (aa_stderr,
1097             PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
1098             NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
1099             language_name(dblk->dbrec.or_language));
1100
1101     /* If file name not provided, generate one based on
1102      * dblk's path, database name, and default extension.
1103      * And if that doesn't work, generate one based on
1104      * dblk's path, language, and default extension.
1105      */
1106     if (dblk->fname_sfx == NULL) {
1107         if (dblk->path == NULL)
1108             dblk->path = strdup("");
1109         dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
1110              PROGNAME"1113", NULL);
1111
1112         strcpy (dblk->fname_sfx, dblk->path);
1113         ensure_end_slash (dblk->fname_sfx);
1114         strcat (dblk->fname_sfx, dblk->name);
1115         strcat (dblk->fname_sfx, EXT_SUFFIX);
1116         errno = 0;
1117         stat (dblk->fname_sfx, &statbuf);
1118         if (errno == ENOENT) {
1119             strcpy (dblk->fname_sfx, dblk->path);
1120             ensure_end_slash (dblk->fname_sfx);
1121             strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
1122             strcat (dblk->fname_sfx, EXT_SUFFIX);
1123         }
1124     }
1125     if (debugging_loadlang)
1126         fprintf (aa_stderr,
1127             PROGNAME"1740 Paice suffix file name = '%s'.\n",
1128             dblk->fname_sfx);
1129
1130     /* Don't reload the same file if it's already
1131      * been loaded into a previous dblk in a list,
1132      * but flag it so it won't be freed at unload_language/REINIT.
1133      * Code works just fine if dblist == NULL.
1134      */
1135     for (db = dblist;  db != NULL;  db = db->link) {
1136         if (db == dblk || db->fname_sfx == NULL)
1137             continue;
1138         if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
1139             dblk->stem_extra = db->stem_extra;
1140             dblk->lang_flags |= LF_DUP_SFX;
1141             if (debugging_loadlang)
1142                 fprintf (aa_stderr, PROGNAME"1145 "
1143                     "Using previously loaded suffixes, db='%s'.\n",
1144                     dblk->name);
1145             return TRUE;
1146         }
1147     }
1148
1149     fp = fopen (dblk->fname_sfx, "rt");
1150     if (fp == NULL) {
1151         sprintf (msgbuf,
1152             catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
1153             PROGNAME"181", dblk->fname_sfx, strerror(errno));
1154         DtSearchAddMessage (msgbuf);
1155         dblk->fname_sfx = NULL;
1156         return FALSE;
1157     }
1158
1159     /* Rules table will eventually be loaded at dblk.stem_extra.
1160      * It consists of 256 PRULE ptrs,
1161      * one for each possible single byte char.
1162      * Each ptr is the head of a rules list for that char.
1163      */
1164     rules_table = austext_malloc (256 * sizeof(PRULE*),
1165         PROGNAME"199", &ausapi_msglist);
1166     memset (rules_table, 0, 256 * sizeof(PRULE*));
1167     lineno =    0;
1168     errcount =  0;
1169
1170     /*------- Main Read Loop -------*/
1171     while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
1172         lineno++;
1173
1174         /* Ignore comment lines */
1175         if (strchr (COMMENT_CHARS, readbuf[0]))
1176             continue;
1177
1178         /* TOKEN #1: suffix string, backwards, all uppercase.
1179          * If missing, ignore 'empty' line.
1180          * If the first token is all numeric, ignore line
1181          * (for compatibility with older versions of file).
1182          */
1183         if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
1184             continue;
1185
1186         for (cptr = suffix;  cptr;  cptr++)
1187             if ((dblk->charmap[*cptr] & NUMERAL) == 0)
1188                 break;
1189         if (*cptr == '\0')
1190             continue;
1191
1192         /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
1193         if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
1194 BAD_RULE:
1195             sprintf (msgbuf,  catgets(dtsearch_catd, MS_lang, 51,
1196                 "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
1197                 PROGNAME"898", dblk->fname_sfx, lineno, suffix);
1198             DtSearchAddMessage (msgbuf);
1199             errcount++;
1200             continue;
1201         }
1202         must_be_intact = FALSE;
1203         if (token[0] == '*') {
1204             must_be_intact = TRUE;
1205             /* Read next token... */
1206             if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1207                 goto BAD_RULE;
1208         }
1209
1210         /* TOKEN #3: remove-count */
1211         remove_count = (UCHAR) atoi ((char *) token);
1212
1213         /* OPTIONAL TOKEN #4: if next token is NOT a continue
1214          * symbol ('>' or '$'), then it's an append string.
1215          */
1216         apndstr = NULL;
1217         if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1218             goto BAD_RULE;
1219         if (token[0] != '$'  &&  token[0] != '>') {
1220             apndstr = token;
1221             /* Read next token... */
1222             if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
1223                 goto BAD_RULE;
1224         }
1225
1226         /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
1227         is_last_rule = (token[0] == '$');
1228
1229         if (debugging_loadword) {
1230             fprintf (aa_stderr,
1231                 "  SFX: intact?=%d stop?=%d remv=%d '%s'",
1232                 (int) must_be_intact,
1233                 (int) is_last_rule,
1234                 (int) remove_count,
1235                 suffix);
1236             if (apndstr)
1237                 fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
1238             else
1239                 fputc ('\n', aa_stderr);
1240         }
1241
1242         /* Good suffix.  If we haven't had any errors yet,
1243          * add it to rules list for the first char of the suffix.
1244          */
1245         if (errcount)
1246             continue;
1247         prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
1248         memset (prule, 0, sizeof(PRULE));
1249         prule->suffix =         (UCHAR *) strdup ((char*)suffix);
1250         prule->suflen =         strlen ((char*)suffix);
1251         prule->must_be_intact = must_be_intact;
1252         prule->remove_count =   remove_count;
1253         prule->is_last_rule =   is_last_rule;
1254         if (apndstr) {
1255             prule->apndstr =    (UCHAR *) strdup ((char*)apndstr);
1256             prule->aplen =      strlen ((char*)apndstr);
1257         }
1258
1259         prule_link = &rules_table[suffix[0]];
1260         while (*prule_link)
1261             prule_link = &(*prule_link)->link;
1262         *prule_link = prule;
1263
1264     } /* end Main Read Loop */
1265
1266     fclose (fp);
1267     if (errcount) {
1268         free_paice_rules (&rules_table);
1269         return FALSE;
1270     }
1271     dblk->stem_extra = rules_table;
1272
1273     /* Update last table entry */
1274     if (debugging_loadlang) {
1275         fprintf (aa_stderr,
1276             PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
1277             dblk->fname_sfx);
1278         fflush (aa_stderr);
1279     }
1280     return TRUE;
1281 }  /* load_paice_suffixes() */
1282
1283
1284 /************************************************/
1285 /*                                              */
1286 /*               is_matching_rule               */
1287 /*                                              */
1288 /************************************************/
1289 /* Subroutine of paice_stemmer().
1290  * Returns TRUE if passed rule can be applied to stem in paicebuf.
1291  * Else returns FALSE.
1292  */
1293 static int      is_matching_rule (PRULE *rule)
1294 {
1295     static UCHAR        *ptr;
1296     static int          i, j;
1297
1298     if (debugging_paice)
1299         fprintf (aa_stderr, "  test rule '%s':\t", rule->suffix);
1300
1301     /* Skip rule if we've made at least one previous change
1302      * but the current rule requires an intact word.
1303      */
1304     if (rule->must_be_intact  &&  !word_is_intact) {
1305         if (debugging_paice)
1306             fputs ("word not intact...\n", aa_stderr);
1307         return FALSE;
1308     }
1309
1310     /* Do a backward strcmp on the suffix.
1311      * Skip rule if it doesn't match current paicebuf's ending chars.
1312      */
1313     j = rule->suflen;
1314     ptr = paicebuf + paicelen - 1;
1315     for (i = 0; i < j; i++) {
1316         if (*((rule->suffix) + i) != *ptr) {
1317             if (debugging_paice)
1318                 fputs ("no match...\n", aa_stderr);
1319             return FALSE;
1320         }
1321         ptr--;
1322     }
1323
1324     if (debugging_paice)
1325         fputs ("match", aa_stderr);
1326
1327     /* Set i = paicebuf length after removing and appending suffixes.
1328      * Used to algorithmically test remaining stem length
1329      * after tentative application of rule.
1330      */
1331     i = paicelen - (rule->remove_count - rule->aplen);
1332
1333     if (i <= 1) {
1334         if (debugging_paice)
1335             fputs (", but stem too short...\n", aa_stderr);
1336         return FALSE;
1337     }
1338
1339     if (i == 2) {
1340         if (IS_VOWEL (paicebuf[0])) {
1341             if (debugging_paice)
1342                 fputs (", and short vowel stem valid.\n", aa_stderr);
1343             return TRUE;
1344         }
1345         else {
1346             if (debugging_paice)
1347                 fputs (", but consonant stem too short...\n", aa_stderr);
1348             return FALSE;
1349         }
1350     }
1351
1352     /* Remaining stem is at least 3 chars.
1353      * If it contains a vowel anywhere, it's valid.
1354      * (A 'Y' after the first char counts as a vowel).
1355      * Otherwise it's not.
1356      */
1357     for (j=0;  j<i;  j++) {
1358         if (IS_VOWEL (paicebuf[j])) {
1359 GOOD_STEM:
1360             if (debugging_paice)
1361                 fputs (", and remaining stem valid.\n", aa_stderr);
1362             return TRUE;
1363         }
1364         if (j > 0  &&  paicebuf[j] == 'Y')
1365             goto GOOD_STEM;
1366     }
1367
1368     if (debugging_paice)
1369         fputs (", but remaining stem all consonants.\n", aa_stderr);
1370     return FALSE;
1371 }  /* is_matching_rule() */
1372
1373
1374 /************************************************/
1375 /*                                              */
1376 /*                 paice_stemmer                */
1377 /*                                              */
1378 /************************************************/
1379 /* Given a word token (ALREADY UPPERCASE) in a single byte
1380  * language such as the output of teskey_parser,
1381  * generates 'stem' by repeated suffix removal.
1382  * Returns stem token in a static buffer valid
1383  * until next call to paice_stemmer or null_stemmer.
1384  * Returned stem might be the original unmodified word.
1385  * Returned stem might also be empty string.
1386  * Returned stem is *never* NULL, even if wordin == NULL.
1387  * Input buffer will not be modified; does not use strtok.
 * Working state lives in file-scope variables (paicebuf, paicelen)
 * that are shared with is_matching_rule(), so it is not reentrant.
1389  */
1390 static char     *paice_stemmer (char *wordin, DBLK *dblk)
1391 {
1392     UCHAR       finalc;
1393     PRULE       *rule, **rules_table;
1394
1395     if (wordin == NULL)
1396         return "";
1397     if (*wordin == 0)
1398         return "";
1399
1400     if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
1401         fprintf (aa_stderr, catgets (dtsearch_catd, MS_lang, 31,
1402             "%s Stemmer suffixes file never loaded.\n"),
1403             PROGNAME"310");
1404         DtSearchExit (2);
1405     }
1406
1407     /* The max length of a stem is bufsz - 2:
1408      * one for the terminating \0 and one for the
1409      * prefix ^O that identifies a stem.  (But this
1410      * stemmer doesn't actually insert the ^O now.)
1411      */
1412     strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD);
1413     paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0;
1414     paice_charmap =     dblk->charmap;
1415     word_is_intact =    TRUE;
1416
1417     for (;;) { /*-------- Main Stemming Loop ---------*/
1418
1419         paicelen = strlen ((char*)paicebuf);
1420         finalc = *(paicebuf + paicelen - 1);
1421         if (debugging_paice) {
1422             fprintf (aa_stderr,
1423                 "paice: '%s', rules list '%c' for database '%s'\n",
1424                 paicebuf, finalc, dblk->name);
1425             fflush (aa_stderr);
1426         }
1427
1428         /* Look for a matching rule */
1429         if ((rule = rules_table [finalc]) == NULL) {
1430             if (debugging_paice)
1431                 fputs ("  list is null, stop.\n", aa_stderr);
1432             break;
1433         }
1434         while (rule) {
1435             if (is_matching_rule (rule))
1436                 break;
1437             rule = rule->link;
1438         }
1439         if (rule == NULL) {
1440             if (debugging_paice)
1441                 fprintf (aa_stderr, "  rules list '%c' is exhausted, stop.\n",
1442                     finalc);
1443             break;
1444         }
1445
1446         /* Apply rule that matched */
1447         if (debugging_paice)
1448             fputs ("    apply rule: ", aa_stderr);
1449         if (rule->remove_count == 0) {
1450             if (debugging_paice)
1451                 fputs ("remove_count = 0, stop.\n", aa_stderr);
1452             break;
1453         }
1454
1455         paicebuf [paicelen - rule->remove_count] = 0;
1456         if (rule->aplen)
1457             strcat ((char*)paicebuf, (char*)rule->apndstr);
1458         paicelen = strlen ((char*)paicebuf);
1459         word_is_intact = FALSE;  /* we've removed at least 1 suffix */
1460         if (debugging_paice)
1461             fprintf (aa_stderr, "--> '%s'", paicebuf);
1462
1463         /* Terminate algorithm if rule says so.
1464          * Otherwise continue removing suffixes
1465          * from this partially stemmed word.
1466          */
1467         if (rule->is_last_rule) {
1468             if (debugging_paice)
1469                 fputs (", stop flag is set, stop.\n", aa_stderr);
1470             break;
1471         }
1472         if (debugging_paice)
1473             fputc ('\n', aa_stderr);
1474
1475     } /* end Main Stemming Loop */
1476
1477     if (debugging_paice) {
1478         fprintf (aa_stderr, "  final stem: '%s'\n", paicebuf);
1479         fflush (aa_stderr);
1480     }
1481     return (char *) paicebuf;
1482 } /* paice_stemmer() */
1483
1484
1485 /************************************************/
1486 /*                                              */
1487 /*                 null_stemmer                 */
1488 /*                                              */
1489 /************************************************/
1490 /* Stemmer that just copies and returns passed word.
1491  * In effect, the passed word IS its own stem.
1492  * Output buffer valid until next call to null_stemmer
1493  * or paice_stemmer.
1494  */
1495 char    *null_stemmer (char *word, DBLK *dblk)
1496 {
1497     if (word == NULL)
1498         return "";
1499     if (*word == '\0')
1500         return "";
1501     strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD);
1502     paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
1503     return (char *) paicebuf;
1504 } /* null_stemmer() */
1505
1506
1507 /************************************************/
1508 /*                                              */
1509 /*                 euro_lstrupr                 */
1510 /*                                              */
1511 /************************************************/
1512 /* Converts passed string to uppercase in place.
1513  * Classic strupr() function using teskey charmaps.
1514  */
1515 static char     *euro_lstrupr (char *string, DBLK *dblk)
1516 {
1517     static int          *charmap;
1518     static UCHAR        *s;
1519     charmap = dblk->charmap;
1520     for (s=(UCHAR *)string;  *s;  s++)
1521         *s = charmap[*s] & 0xff;
1522     return string;
1523 }
1524
1525
1526 /************************************************/
1527 /*                                              */
1528 /*                 null_lstrupr                 */
1529 /*                                              */
1530 /************************************************/
1531 /* Just returns passed string.  Used where uppercase
1532  * conversions are not required for a language.
1533  */
1534 char    *null_lstrupr (char *s, DBLK *d)
1535 { return s; }
1536
1537
1538 /************************************************/
1539 /*                                              */
1540 /*                load_language                 */
1541 /*                                              */
1542 /************************************************/
1543 /* Loads a dblk with a specific language's
1544  * structures and function pointers.
1545  * Does not reload structures previously loaded in
1546  * other dblks on dblist if derived from identical files.
1547  * But always loads structures if passed dblist is NULL.
1548  * Presumes dblk already partially initialized with mandatory fields:
1549  *      name, path, language.
1550  * May also be preinitialized with optional fields:
1551  *      minwordsz, maxwordsz.
1552  * Returns TRUE if all successful.
1553  * Otherwise returns FALSE with err msgs on ausapi_msglist.
1554  */
1555 int     load_language (DBLK *dblk, DBLK *dblist)
1556 {
1557     int         oops =  FALSE;
1558     int         language = dblk->dbrec.or_language;
1559
1560     if (debugging_loadlang)
1561         fprintf (aa_stderr,
1562             "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
1563             (int)dblk->dbrec.or_language,
1564             language_name (dblk->dbrec.or_language),
1565             NULLORSTR(dblk->name));
1566
1567     /*
1568      * Note: Load list functions must be called
1569      * AFTER charmap and lstrupr are loaded.
1570      */
1571     switch (language) {
1572         case DtSrLaENG:
1573         case DtSrLaENG2:
1574         case DtSrLaESP:
1575         case DtSrLaFRA:
1576         case DtSrLaITA:
1577         case DtSrLaDEU:
1578             dblk->charmap =     (language == DtSrLaENG)?
1579                                     ascii_charmap : latin_charmap;
1580             dblk->parser =      teskey_parser;
1581             dblk->stemmer =     paice_stemmer;
1582             dblk->lstrupr =     euro_lstrupr;
1583             if (dblk->dbrec.or_maxwordsz == 0)
1584                 dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
1585                     MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
1586             if (dblk->dbrec.or_minwordsz == 0)
1587                 dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
1588             oops = FALSE;
1589             if (!load_stop_list (dblk, dblist))
1590                 oops = TRUE;
1591             if (!load_include_list (dblk, dblist))
1592                 oops = TRUE;
1593             if (!load_paice_suffixes (dblk, dblist))
1594                 oops = TRUE;
1595             if (oops)
1596                 return FALSE;
1597             break;
1598
1599         case DtSrLaJPN:
1600         case DtSrLaJPN2:
1601             return load_jpn_language (dblk, dblist);
1602
1603         default:
1604             /* Try loading a custom 'user' language.
1605              * If he failed to provide a loader function,
1606              * the dummy custom loader will tell him so.
1607              * If he provided one but it can't load this language,
1608              * it should return it's own error msgs.
1609              */
1610             return load_custom_language (dblk, dblist);
1611
1612     } /* end switch (language) */
1613
1614     return TRUE;
1615 } /* load_language() */
1616
1617
1618 /************************************************/
1619 /*                                              */
1620 /*                unload_language               */
1621 /*                                              */
1622 /************************************************/
1623 /* Frees storage for structures allocated by load_language().
1624  * Called when engine REINITs due to change in site config file
1625  * or databases.
1626  * Duplicate wordtrees are not unloaded because they
1627  * will have already been unloaded in a previous dblk.
1628  */
1629 void    unload_language (DBLK *dblk)
1630 {
1631     switch (dblk->dbrec.or_language) {
1632         case DtSrLaENG:
1633         case DtSrLaENG2:
1634         case DtSrLaESP:
1635         case DtSrLaFRA:
1636         case DtSrLaITA:
1637         case DtSrLaDEU:
1638             dblk->charmap = NULL;
1639             if ((dblk->lang_flags & LF_DUP_STP) == 0)
1640                 free_wordtree (&dblk->stoplist);
1641             else {
1642                 dblk->stoplist = NULL;
1643                 dblk->lang_flags &= ~LF_DUP_STP;
1644             }
1645             if ((dblk->lang_flags & LF_DUP_INC) == 0)
1646                 free_wordtree (&dblk->inclist);
1647             else {
1648                 dblk->inclist = NULL;
1649                 dblk->lang_flags &= ~LF_DUP_INC;
1650             }
1651             if ((dblk->lang_flags & LF_DUP_SFX) == 0)
1652                 free_paice_rules ((PRULE***)&dblk->stem_extra);
1653             else {
1654                 dblk->stem_extra = NULL;
1655                 dblk->lang_flags &= ~LF_DUP_SFX;
1656             }
1657             break;
1658
1659         case DtSrLaJPN:
1660         case DtSrLaJPN2:
1661             unload_jpn_language (dblk);
1662             break;
1663
1664         default:
1665             unload_custom_language (dblk);
1666             break;
1667     }
1668     return;
1669 } /* unload_language() */
1670 /******************** LANG.C ********************/
1671