cde/lib/DtSearch/jpn.c

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these librararies and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /*
  24  *   COMPONENT_NAME: austext
  25  *
  26  *   FUNCTIONS: display_jstate
  27  *              jpn_parser
  28  *              kanji_compounder
  29  *              load_jpn_language
  30  *              load_jpntree
  31  *              parse_substring
  32  *              read_jchar
  33  *              search_kanjitree
  34  *
  35  *   ORIGINS: 27
  36  *
  37  *
  38  *   (C) COPYRIGHT International Business Machines Corp. 1995,1996
  39  *   All Rights Reserved
  40  *   Licensed Materials - Property of IBM
  41  *   US Government Users Restricted Rights - Use, duplication or
  42  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  43  */
  44 /******************** JPN.C ********************
  45  * $TOG: jpn.c /main/7 1999/10/14 14:11:33 mgreess $
  46  * September 1995.
  47  * Includes functions and data for parsing Japanese,
  48  * supported languages DtSrLaJPN and DtSrLaJPN2.
  49  * Currently only supports EUC packed format,
  50  * but should be easily extendable to Shift-JIS.
  51  * JIS can be supported if half-width katakana are excluded
  52  * (no SI or SO chars to conflict with the ^O stemming char,
  53  * and engine must decide never to balk at ESCape sequences).
  54  * Will not support Unicode or other fixed width, n-wide
  55  * encodings that would conflict with ascii in either byte.
  56  * Does not require wide char or multibyte char functions.
  57  * There is no Japanese stemmer(), ie standard null_stemmer() is used.
  58  *
  59  * Code Set 0 can be either 7-bit ASCII or 7-bit JIS-Roman.
  60  * The parser() for ASCII is the full teskey_parser()
  61  * used for European languages with an ascii char set.
  62  * Min/max word size, stoplists, and include lists may be
  63  * used if provided, as in European languages.
  64  *
  65  * Code Set 1 is JIS X 0208-1990.
  66  * Symbols and line drawing elements are not indexed.
  67  * Hiragana strings are discarded as equivalent to stoplist words.
  68  * Contiguous strings of katakana, Roman, Greek, or cyrillic
  69  * are parsed as single words.
  70  *
  71  * Individual kanji chars are parsed as single words.
  72  * In addition, for language DtSrLaJPN, all kanji compounds
  73  * (pairs, triplets, etc) found in any contiguous string of
  74  * kanjis will be parsed up to a maximum word size
  75  * defined in MAX_KANJI_CLEN (see caveat below).
  76  * For language DtSrLaJPN2, only kanji substrings listed
  77  * in a .knj file are parsed as additional compound words.
  78  * Characters from unassigned kuten rows are presumed to be
  79  * user-defined kanji and are parsed as such.
  80  *
  81  * Code Set 2 is 1/2 width katakana.
  82  * Contiguous strings are parsed as single words.
  83  *
  84  * Code Set 3 is JIS X 0212-1990.
  85  * Parsing is similar to Code Set 1: discard symbols, etc,
  86  * contiguous strings of related foreign characters equal words,
  87  * and individual kanji and unassigned chars equal single words,
  88  * with additional kanji compounding depending on language.
  89  * Row 5 has 4 new katakana (not yet officially approved)
  90  * so it is treated here as katakana.
  91  *
  92  * $Log$
  93  * Revision 2.8  1996/04/10  20:24:33  miker
  94  * Fixed bug in kanji tree loader.
  95  *
  96  * Revision 2.7  1996/03/25  18:55:15  miker
  97  * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  98  *
  99  * Revision 2.6  1996/03/13  22:57:40  miker
 100  * Added prolog.  Changed char to UCHAR several places.
 101  *
 102  * Revision 2.5  1996/03/05  16:09:58  miker
 103  * Made jchar array of unsigned chars for compat with Sun compilers.
 104  * Added test of PA_MSGS for yacc-based boolean queries.
 105  *
 106  * Revision 2.4  1996/02/01  19:08:10  miker
 107  * AusText 2.1.11, DtSearch 0.3:  Major rewrite for new parsers.
 108  * Made optional power series kanji compounding (KANJI_COMPOUNDS)
 109  * into a new DtSrLaJPN language.  Old version now DtSrLaJPN2.
 110  *
 111  * Revision 2.3  1995/12/01  16:20:17  miker
 112  * Changed read_jchar arg to unsigned to fix Solaris bug.
 113  *
 114  * Revision 2.2  1995/10/26  15:08:31  miker
 115  * Added prolog.
 116  *
 117  * Revision 2.1  1995/09/22  20:57:13  miker
 118  * Freeze DtSearch 0.1, AusText 2.1.8
 119  *
 120  * Revision 1.1  1995/09/19  21:24:57  miker
 121  * Initial revision
 122  */
 123 #include "SearchP.h"
 124 #include <limits.h>
 125 #include <stdlib.h>
 126 #include <stdio.h>
 127 #include <string.h>
 128 #include <errno.h>
 129 #include <sys/stat.h>
 130
 131 #define PROGNAME        "JPN"
 132 #define SS2_CHAR        0x8E    /* Single Shift char for Code Set 2 */
 133 #define SS3_CHAR        0x8F    /* Single Shift char for Code Set 3 */
 134 #define EXT_KATAKANA    ".ktk"
 135 #define EXT_KANJI       ".knj"
 136 #define SUBSTRBUFSZ     100
 137 #define MS_misc         1
 138 #define MS_lang         15
 139
 140 /* In addition to single kanji chars parsed as individual words,
 141  * Language DtSrLaJPN will also blindly consider all contiguous kanji
 142  * substrings up to MAX_KANJI_CLEN as separate compound words.
 143  * For example if MAX_KANJI_CLEN were 3, the 4 kanjis "ABCD"
 144  * would parse as "A B C D AB BC CD ABC BCD".
 145  * The number of parsed words = the number of
 146  * ordered permutations of n things taken r! times!
 147  * This is can be very wasteful of indexing time and file space.
 148  * The alternative is language DtSrLaJPN2 which only considers
 149  * strings listed in jpn.knj as valid kanji compounds.
 150  * The kanji compounds in jpn.knj are the statistically significant
 151  * kanji substrings found in a large corpus of natural language Japanese.
 152  */
 153 #define MAX_KANJI_CLEN  6
 154
 155 /************************************************/
 156 /*                                              */
 157 /*                    JSTATE                    */
 158 /*                                              */
 159 /************************************************/
 160 /* EUC text substring types.
 161  * Used to switch states in parser's automaton.
 162  * Coded as bit positions for efficient boolean comparisons.
 163  */
 164 #define JS_STX          0x0001  /* Start of text blk, initial state */
 165 #define JS_KANJI        0x0002  /* Set 1, Set 3 */
 166 #define JS_KATAKANA     0x0004  /* Set 1 */
 167 #define JS_ASCII        0x0008  /* Set 0 */
 168 #define JS_ROMAN        0x0010  /* Set 1 */
 169 #define JS_GREEK        0x0020  /* Set 1, Set 3 */
 170 #define JS_CYRILLIC     0x0040  /* Set 1 */
 171 #define JS_ALPHA        0x0080  /* Set 3 */
 172 #define JS_HALFKATA     0x0100  /* Set 2 */
 173 #define JS_DISCARD      0x0200  /* Set 1, Set 3, any char not in EUC */
 174 #define JS_ETX          0x0300  /* End of text block */
 175 #define JS_ALPHA_COMPATIBLE  (JS_ROMAN | JS_GREEK | JS_CYRILLIC)
 176
 177 /************************************************/
 178 /*                                              */
 179 /*                   JPNTREE                    */
 180 /*                                              */
 181 /************************************************/
 182 /* Similar to standard binary WORDTREE.
 183  * Each tree node distinguished by first 4 bytes
 184  * (usually 2 jchars), which is minimum compound word size.
 185  * All compounds beginning with those 4 bytes are chained
 186  * in a linked list off of that node.
 187  */
 188 typedef struct _jpntree_tag {
 189     struct _jpntree_tag  *rlink;  /* ptr to right binary node */
 190     struct _jpntree_tag  *llink;  /* ptr to left binary node */
 191     struct _jpntree_tag  *next;   /* ptr to next compound in linked list */
 192     int                  len;     /* length of word in bytes */
 193     void                 *word;
 194     } JPNTREE;
 195
 196 /************************************************/
 197 /*                                              */
 198 /*                    JPNBLK                    */
 199 /*                                              */
 200 /************************************************/
 201 typedef struct {
 202         JPNTREE *katatree;
 203         JPNTREE *kanjitree;
 204         }       JPNBLK;
 205
 206 /************************************************/
 207 /*                                              */
 208 /*                    GLOBALS                   */
 209 /*                                              */
 210 /************************************************/
 211 int             debugging_jpn =         FALSE;
 212 extern int      debugging_loadlang;
 213 extern int      debugging_loadword;
 214
 215 /* Used in jpn_parser() and parse_substr().  Made global for speed. */
 216 static int      do_compounding =        FALSE;
 217 static int      is_new_substring =      TRUE;
 218 static int      jstate, last_jstate;
 219 static UCHAR    jchar [8];
 220 static int      jcharlen =              0;
 221 static DBLK     *jpn_dblk;
 222 static JPNTREE  *jpn_kanjitree =        NULL;
 223 static JPNTREE  *jpn_katatree =         NULL;
 224 static JPNTREE  *kanjitree =            NULL;
 225 static int      language;
 226 static long     *offsetp;
 227 static long     readcount =             0;
 228 static READCFP  readchar;
 229 static void     *readchar_arg;
 230 static UCHAR    *outbuf =               NULL;
 231 static UCHAR    *save_parg_string =     NULL;
 232 static UCHAR    *substrbuf =            NULL;
 233 static long     substr_offset;
 234
 235
 236 /************************************************/
 237 /*                                              */
 238 /*                display_jstate                */
 239 /*                                              */
 240 /************************************************/
 241 /* for debugging and error msgs */
 242 static char     *display_jstate (int js)
 243 {
 244     switch (js) {
 245         case JS_KANJI:          return "KANJI";
 246         case JS_KATAKANA:       return "KATAKANA";
 247         case JS_DISCARD:        return "DISCARD";
 248         case JS_ROMAN:          return "ROMAN";
 249         case JS_ASCII:          return "ASCII";
 250         case JS_ALPHA:          return "ALPHA";
 251         case JS_ETX:            return "ETX";
 252         case JS_STX:            return "STX";
 253         case JS_GREEK:          return "GREEK";
 254         case JS_CYRILLIC:       return "CYRILLIC";
 255         case JS_HALFKATA:       return "HALFKATA";
 256         default:                return "(UNKNOWN)";
 257     }
 258 } /* display_jstate() */
 259
 260
 261 /************************************************/
 262 /*                                              */
 263 /*                  read_jchar                  */
 264 /*                                              */
 265 /************************************************/
 266 /* Subroutine of jpn_parser().
 267  * Using global character reading 'readchar' cofunction,
 268  * returns (1) next multibyte Japanese character in global jchar,
 269  * (2) length of jchar in global jcharlen, and
 270  * (3) next state of state machine in global jstate.
 271  * Function itself returns jstate.
 272  * Rows in the KUTEN tables which are officially 'unassigned'
 273  * are treated as user-defined kanji, so all jstates
 274  * are presumed JS_KANJI except those specifically marked
 275  * otherwise at the beginning of each array below.
 276  */
 277 static int      read_jchar (void)
 278 {
 279     /* Jstates table for EUC Set 1 (JIS 0208) */
 280     static int  jstates_set1 [] = {
 281         JS_DISCARD,     JS_DISCARD,     JS_DISCARD,     /* 0 - 2 */
 282         JS_ROMAN,       JS_DISCARD,     JS_KATAKANA,    /* 3 - 5 */
 283         JS_GREEK,       JS_CYRILLIC,    JS_DISCARD      /* 6 - 8 */
 284     };
 285
 286     /* Jstates table for EUC Set 3 (JIS 0212).
 287      * Row 5 is presumed to be katakana because
 288      * of four new unapproved katakana characters.
 289      */
 290     static int  jstates_set3 [] = {
 291         JS_DISCARD,     JS_DISCARD,     JS_DISCARD,     /* 0 - 2 */
 292         JS_DISCARD,     JS_DISCARD,     JS_KATAKANA,    /* 3 - 5 */
 293         JS_GREEK,       JS_CYRILLIC,    JS_DISCARD,     /* 6 - 8 */
 294         JS_ALPHA,       JS_ALPHA,       JS_ALPHA        /* 9 - 11 */
 295     };
 296
 297     if (readchar_arg) {
 298         jchar[0] = readchar (readchar_arg);
 299         readchar_arg = NULL;
 300     }
 301     else
 302         jchar[0] = readchar (NULL);
 303     if (jchar[0] == 0)
 304         return (jstate = JS_ETX);
 305     readcount++;
 306
 307     /* Set 1 (JIS 0208) */
 308     if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) {
 309         jcharlen = 2;
 310         if (jchar[0] > 0xA8)
 311             jstate = JS_KANJI;
 312         else
 313             jstate = jstates_set1 [(jchar[0] & 0x7F) - 32];
 314         if (jchar[1] = readchar (NULL))
 315             readcount++;
 316         else
 317             jstate = JS_ETX;
 318         return jstate;
 319     }
 320
 321     /* Set 0 (ASCII) */
 322     if (jchar[0] < 0x80) {
 323         jcharlen = 1;
 324         return (jstate = JS_ASCII);
 325     }
 326
 327     /* Set 3 (JIS 0212) */
 328     if (jchar[0] == SS3_CHAR) {
 329         jcharlen = 3;
 330         /*
 331          * Hop over the single shift char to get the first JIS byte.
 332          * Make sure first JIS byte is in proper
 333          * range to avoid indexing outside of table.
 334          */
 335         if ((jchar[1] = readchar (NULL)) == 0)
 336             return (jstate = JS_ETX);
 337         readcount++;
 338         if (jchar[1] < 0xA1)
 339             return (jstate = JS_DISCARD);
 340         if (jchar[1] > 0xAA)
 341             jstate = JS_KANJI;
 342         else
 343             jstate = jstates_set3 [(*jchar & 0x7F) - 32];
 344
 345         if ((jchar[2] = readchar (NULL)) == 0)
 346             return (jstate = JS_ETX);
 347         readcount++;
 348         /* JS_ALPHA chars ('miscellaneous alphabetic chars' of
 349          * rows 9 - 11) are compatible with several other jstates,
 350          * so adjust as necessary.
 351          */
 352         if (jstate == JS_ALPHA  &&
 353                 ((last_jstate & JS_ALPHA_COMPATIBLE) != 0))
 354             jstate = last_jstate;
 355         else if (last_jstate == JS_ALPHA  &&
 356                 ((jstate & JS_ALPHA_COMPATIBLE) != 0))
 357             last_jstate = jstate;
 358         return jstate;
 359     }
 360
 361     /* Set 2 (half-width katakana) */
 362     if (jchar[0] == SS2_CHAR) {
 363         jcharlen = 2;
 364         jstate = JS_HALFKATA;
 365         if (jchar[1] = readchar (NULL))
 366             readcount++;
 367         else
 368             jstate = JS_ETX;
 369         return jstate;
 370     }
 371
 372     /* If first jchar doesn't match expected EUC coding,
 373      * discard it until we get back into sync.
 374      */
 375     jcharlen = 1;
 376     return (jstate = JS_DISCARD);
 377 } /* read_jchar() */
 378
 379
 380 /************************************************/
 381 /*                                              */
 382 /*               kanji_compounder               */
 383 /*                                              */
 384 /************************************************/
 385 /* Subroutine of parse_substring() of jpn_parser().
 386  * Used only for language DtSrLaJPN (power series compounding).
 387  * Called repeatedly when the substring is a sequence of kanji chars.
 388  * For each call writes to outbuf and returns a single kanji
 389  * compound word, using every possible compound in the substring
 390  * from length 1 to length MAX_KANJI_CLEN.
 391  * Updates offsetp for each word returned.
 392  * Returns NULL when substring exhausted.  First call for
 393  * a new substring indicated by global is_new_substring.
 394  */
 395
 396 static UCHAR    *kanji_compounder (void)
 397 {
 398     static int          all_done =      TRUE;
 399     static int          clen =          MAX_KANJI_CLEN + 1;
 400     static UCHAR        *mysubstrp =    NULL;
 401     static UCHAR        *mysubstrend =  NULL;
 402     static UCHAR        *op, *ss;
 403     static int          i;
 404
 405     if (is_new_substring) {
 406         is_new_substring = FALSE;
 407         all_done = FALSE;
 408         clen = 1;
 409         mysubstrp = substrbuf;
 410         mysubstrend = substrbuf + strlen ((char*)substrbuf);
 411     }
 412
 413     /* Advance compound length by 1.
 414      * If max compound length exceeded, reset it
 415      * to 1 and increment substring pointer by 1 jchar.
 416      */
 417     else {
 418         if (all_done)
 419             return NULL;
 420         if (++clen > MAX_KANJI_CLEN) {
 421             clen = 1;
 422             mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
 423         }
 424     }
 425
 426     /* Assemble one word into outbuf, of length clen,
 427      * beginning at current substring ptr.
 428      * If there aren't enough jchars left in string,
 429      * reset clen to 1 and advance substrp by 1 jchar.
 430      * We're all done when substring exhausted.
 431      */
 432     while (mysubstrp < mysubstrend) {
 433         op = outbuf;
 434         ss = mysubstrp;
 435         for (i = 0;  i < clen;  i++) {
 436             /* Are there enough jchars left in substring? */
 437             if (ss >= mysubstrend) {
 438                 clen = 1;
 439                 mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
 440                 i = 0;          /* indicates assembly failure */
 441                 break;          /* breaks the for loop */
 442             }
 443             /* Assemble one jchar into outbuf */
 444             if (*ss == SS3_CHAR)
 445                 *op++ = *ss++;
 446             *op++ = *ss++;
 447             *op++ = *ss++;
 448         }
 449         /* Did word assembly succeed? */
 450         if (i >= clen) {
 451             *op = 0;
 452             if (offsetp)
 453                 *offsetp = substr_offset + (mysubstrp - substrbuf);
 454             if (debugging_jpn)
 455                 fprintf (aa_stderr,
 456                     "knjcompdr: subofs=%2ld totofs=%3ld \"%s\"\n",
 457                     mysubstrp - substrbuf, *offsetp, outbuf);
 458             return outbuf;
 459         }
 460     }
 461
 462     all_done = TRUE;
 463     return NULL;
 464 } /* kanji_compounder() */
 465
 466
 467 /************************************************/
 468 /*                                              */
 469 /*              search_kanjitree                */
 470 /*                                              */
 471 /************************************************/
 472 /* Subroutine of parse_substring() of jpn_parser().
 473  * Used only for language DtSrLaJPN2; DtSrLaJPN calls
 474  * kanji_compounder() to generate compounds algorithmically.
 475  * First call for a new substring of kanjis is indicated
 476  * when is_new_substring is TRUE.  Each call, then and thereafter,
 477  * returns a token (1) for each individual kanji char in string,
 478  * and (2) for each sequence of kanjis found in the kanji
 479  * compounds JPNTREE which begins with each char in string.
 480  * Also returns offset of returned token in offsetp.
 481  * Returns NULL when string is exhausted.
 482  * Variables are static for speeeeed.
 483  */
 484 static UCHAR    *search_kanjitree (void)
 485 {
 486     static int          all_done =      TRUE;
 487     static JPNTREE      *node, *last_node;
 488     static UCHAR        *substrp, *substrend;
 489     static int          direction;
 490     static int          nodelen;
 491     static int          jcharlen;
 492
 493     if (is_new_substring) {
 494         is_new_substring = FALSE;
 495         all_done = FALSE;
 496         substrend = substrbuf + strlen ((char*)substrbuf);
 497         substrp = substrbuf;
 498
 499         /* Return first substr jchar as next token */
 500         last_node = NULL;       /* NULL = tree not searched yet */
 501         jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
 502         strncpy ((char*)outbuf, (char*)substrp, jcharlen);
 503         outbuf [jcharlen] = 0;
 504         if (offsetp)
 505             *offsetp = substr_offset;
 506         return outbuf;
 507     }
 508     else if (all_done)
 509         return NULL;
 510
 511     /* If not enough chars left in substring to search tree,
 512      * treat it as an exhausted tree search.  In other words,
 513      * reset tree search, increment to next jchar, and return it.
 514      */
 515     if (strlen ((char*)substrp) < 4) {
 516         if (debugging_jpn)
 517             fputs ("knjtree: ...remaining substring too short", aa_stderr);
 518 EXHAUSTED_TREE:
 519         if (debugging_jpn)
 520             fputs (".\n", aa_stderr);
 521         last_node = NULL;
 522         substrp += jcharlen;
 523         if (substrp >= substrend) {
 524             all_done = TRUE;
 525             return NULL;
 526         }
 527         jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
 528         strncpy ((char*)outbuf, (char*)substrp, jcharlen);
 529         outbuf [jcharlen] = 0;
 530         if (offsetp)
 531             *offsetp = substr_offset + (substrp - substrbuf);
 532         return outbuf;
 533     }
 534
 535     /* If last call resulted in a tree hit, the node was saved.
 536      * Continue the linked list search directly from the last hit.
 537      */
 538     if (last_node) {
 539         last_node = last_node->next;
 540         if (debugging_jpn)
 541             fputs ("knjtree: ...continue tree search: ", aa_stderr);
 542 LINKED_LIST_SEARCH:
 543         for (node = last_node;  node;  node = node->next) {
 544             if ((strncmp ((char*)substrp, node->word, node->len)) == 0) {
 545                 /* HIT on linked list search */
 546                 last_node = node;
 547                 strcpy ((char*)outbuf, node->word);
 548                 if (debugging_jpn)
 549                     fprintf (aa_stderr, "* '%s'\n", outbuf);
 550                 if (offsetp)
 551                     *offsetp = substr_offset + (substrp - substrbuf);
 552                 return outbuf;
 553             }
 554             else if (debugging_jpn)
 555                 fputc ('-', aa_stderr);
 556         }
 557         goto EXHAUSTED_TREE;
 558     }
 559
 560     /* Start new binary tree search at curr jchar.
 561      * If hit, commence linked list search.
 562      */
 563     if (debugging_jpn)
 564         fprintf (aa_stderr,
 565             "knjtree: \"%.4s...\" ", substrp);
 566     for (node = kanjitree;  node != NULL;  ) {
 567         if ((direction = strncmp ((char*)substrp, node->word, 4)) == 0) {
 568             /* HIT on binary search */
 569             last_node = node;
 570             goto LINKED_LIST_SEARCH;
 571         }
 572         /* Descend left or right depending on word */
 573         if (debugging_jpn)
 574             fputc ((direction < 0) ? 'L' : 'R', aa_stderr);
 575         if (direction < 0)
 576             node = node->llink;
 577         else
 578             node = node->rlink;
 579     }
 580
 581     /* No match on first 4 bytes of substrp in binary tree.
 582      * Tree exhausted without a hit, so increment to next
 583      * jchar in substring and return it as a word.
 584      */
 585     goto EXHAUSTED_TREE;
 586 }  /* search_kanjitree() */
 587
 588
 589 /************************************************/
 590 /*                                              */
 591 /*                parse_substring               */
 592 /*                                              */
 593 /************************************************/
 594 /* Subroutine of jpn_parser().
 595  * Returns next Japanese multibyte word token from current
 596  * substring of jchars, or NULL when out of tokens.
 597  * Returned token is valid until next call.
 598  * Static args initialized at first call for a new substring.
 599  * Provides optional kanji compounding depending on PA_  flags.
 600  * We usually compound at index time (dtsrindex) or when query
 601  * is Query-By-Example (statistical searches), and usually don't
 602  * compound boolean queries.
 603  */
 604 static UCHAR    *parse_substring (void)
 605 {
 606     static int  is_substr_end =         TRUE;
 607     static int  substrlen =             0;
 608     static PARG myparg;
 609     static UCHAR        *token;
 610     static long myoffset;
 611
 612     if (is_new_substring) {
 613         substrlen = strlen ((char*)substrbuf);
 614
 615         /* A very common ascii substring is the final line-feed
 616          * at the end of a line of text--discard it now.
 617          */
 618         if (last_jstate == JS_ASCII
 619                 &&  substrlen == 1
 620                 &&  substrbuf[0] == '\n') {
 621             is_substr_end = TRUE;
 622             is_new_substring = FALSE;
 623             return NULL;
 624         }
 625
 626         is_substr_end = FALSE;
 627
 628         if (!outbuf)
 629             outbuf = austext_malloc (DtSrMAXWIDTH_HWORD + 8,
 630                 PROGNAME"807", NULL);
 631
 632         if (debugging_jpn) {
 633             int         i;
 634             fprintf (aa_stderr, "jpnsubstr: js=%s len=%ld str='",
 635                 display_jstate(last_jstate), substrlen);
 636             for (i = 0;  i < substrlen;  i++)
 637                 fputc ((substrbuf[i] < 32)? '~' : substrbuf[i],
 638                     aa_stderr);
 639             fprintf (aa_stderr, "'\n");
 640             if (last_jstate == JS_ROMAN) {
 641                 fprintf (aa_stderr, "  (ascii equiv: '");
 642                 for (i = 1;  i < substrlen;  i+=2)
 643                     fputc ((substrbuf[i] & 0x7f) + 32, aa_stderr);
 644                 fprintf (aa_stderr, "')\n");
 645             }
 646             fflush (aa_stderr);
 647         }
 648
 649     } /* endif is_new_substring */
 650
 651     if (is_substr_end)
 652         return NULL;
 653
 654     switch (last_jstate) {
 655
 656         case JS_DISCARD:
 657             /* Ignore discardable substrings */
 658             is_new_substring = FALSE;
 659             is_substr_end = TRUE;
 660             return NULL;
 661
 662         case JS_KATAKANA:
 663         case JS_ROMAN:
 664         case JS_CYRILLIC:
 665         case JS_GREEK:
 666         case JS_ALPHA:
 667         case JS_HALFKATA:
 668             /* Treat entire substring as single parsed word */
 669 ENTIRE_SUBSTR_IS_WORD:
 670             if (debugging_jpn)
 671                 fputs ("  token is entire substring.\n", aa_stderr);
 672             strncpy ((char*)outbuf, (char*)substrbuf, DtSrMAXWIDTH_HWORD);
 673             outbuf [DtSrMAXWIDTH_HWORD - 1] = 0;
 674             is_new_substring = FALSE;
 675             is_substr_end = TRUE;
 676             if (offsetp)
 677                 *offsetp = substr_offset;
 678             return outbuf;
 679
 680         case JS_ASCII:
 681             /* Call the full teskey_parser() for European languages.
 682              * Includes stoplist and include list processing.
 683              */
 684             if (is_new_substring) {
 685                 is_new_substring = FALSE;
 686                 if (debugging_jpn)
 687                     fputs ("  calling teskey parser.\n", aa_stderr);
 688                 myparg.dblk =           jpn_dblk;
 689                 myparg.string =         substrbuf;
 690                 myparg.ftext =          NULL;
 691                 myparg.offsetp =        &myoffset;
 692                 token = (UCHAR *) teskey_parser (&myparg);
 693             }
 694             else
 695                 token = (UCHAR *) teskey_parser (NULL);
 696             if (token) {
 697                 if (offsetp)
 698                     *offsetp = substr_offset + myoffset;
 699             }
 700             else
 701                 is_substr_end = TRUE;
 702             return token;
 703
 704         case JS_KANJI:
 705             /* If not compounding, treat entire substring
 706              * as one query word, ie a single compound kanji word.
 707              * If compounding, each individual kanji in the
 708              * substring is returned as a word by itself.
 709              * Each kanji can be 2 or 3 bytes depending on
 710              * which code set it came from.  In addition,
 711              * sequences of 2 or more kanjis ('compound kanji
 712              * words') are returned as individual words.
 713              * Method of kanji compounding depends on language:
 714              * DtSrLaJPN does "power series" kanji compounding,
 715              * DtSrLaJPN2 looks up kanji compounds in a word tree.
 716              * Both functions test and reset is_new_substring,
 717              * update offsetp as necessary, and return either NULL
 718              * or a pointer to outbuf containing a valid token.
 719              */
 720             if (!do_compounding)
 721                 goto ENTIRE_SUBSTR_IS_WORD;
 722             token = (language == DtSrLaJPN)?
 723                 kanji_compounder() : search_kanjitree();
 724             if (!token)
 725                 is_substr_end = TRUE;
 726             return token;
 727
 728         default:
 729             break;
 730
 731     } /* end state switch */
 732
 733     /* Should never get here... */
 734     fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 20,
 735         "%s Program Error: Unknown jstate %d.\n") ,
 736         PROGNAME"246", last_jstate);
 737     DtSearchExit (46);
 738 } /* parse_substring() */
 739
 740
 741 /************************************************/
 742 /*                                              */
 743 /*                  jpn_parser                  */
 744 /*                                              */
 745 /************************************************/
 746 /* Returns next word token from text stream of packed EUC
 747  * Japanese text, languages DtSrLaJPN and DtSrLaJPN2.
 748  * Called from (1) dtsrindex, where readchar_ftext() cofunction
 749  * reads the .fzk file document 'stream', or (2) search engine
 750  * query parsers, where readchar_string() cofunction 'reads'
 751  * from the query string.
 752  *
 753  * First call passes args in PARG block.  This resets end of
 754  * text block (ETX) flag, resets 'offset' counter to zero, etc.
 755  * Subsequent calls should pass NULL, and parser returns
 756  * next token in block, until reader cofunction reads ETX
 757  * and returns special ETX char ('\0').  Subsequent call to parser
 758  * returns NULL meaning "no tokens left in current stream".
 759  * Reader cofunction tolerates repeated calls after
 760  * the first ETX, still returning '\0'.
 761  *
 762  * This parser presumes all incoming text is packed EUC multibyte
 763  * Japanese chars as described above, but is otherwise unformatted.
 764  * Since parser accesses streams a multibyte char at a time,
 765  * it does not require periodic line feeds, etc.
 766  *
 767  * To control kanji compounding, caller should set a PA_ switch
 768  * in parg.flags as desired before call.  Compounding is done
 769  * when indexing (dtsrindex) or for hiliting (comparing previous
 770  * search results against all possible words in document text).
 771  * But in a Query by Example (stat searches), parser might also
 772  * be asked to  generate compound words.  In boolean queries
 773  * (stems and exact words), parser should not generate compounds
 774  * because if user enters a compound string, he probably only wants
 775  * documents containing that exact token.
 776  *
 777  * Parser also returns offset information: number of bytes
 778  * since beginning of text block.  The returned offsets are
 779  * NOT NECESSARILY IN ASCENDING ORDER due to kanji compounding.
 780  *
 781  * Variables are static or global for speeeeeeed.
 782  *
 783  * OUTPUT FORMAT:  NULL or a static C string containing a
 784  * single parsed word token.
 785  * The text in the buffer is valid until the next call.
 786  * Each word is translated as described above.
 787  */
 788 char    *jpn_parser (PARG *parg)
 789 {
 790     static int          filling_substring =     TRUE;
 791     static int          was_discarding =        FALSE;
 792     static int          add_msgs =              FALSE;
 793     static UCHAR        *endsubstrbuf =         NULL;
 794     static size_t       substrbufsz =           0;
 795     static UCHAR        *token;
 796     static UCHAR        *substrp;
 797
 798     /* If first call for new text block... */
 799     if (parg) {
 800         jpn_dblk = parg->dblk;
 801         language = jpn_dblk->dbrec.or_language;
 802         kanjitree = ((JPNBLK *)(jpn_dblk->parse_extra))->kanjitree;
 803         offsetp = parg->offsetp;
 804         do_compounding = (parg->flags & (PA_HILITING | PA_INDEXING));
 805         add_msgs = (parg->flags & PA_MSGS);
 806         if (parg->string) {     /* text is query str from search engine */
 807             save_parg_string = parg->string;
 808             readchar_arg = parg->string;
 809             readchar = (READCFP) readchar_string;
 810         }
 811         else {                  /* text is from .fzk file in dtsrindex */
 812             save_parg_string = NULL;
 813             readchar_arg = parg;
 814             readchar = (READCFP) readchar_ftext;
 815         }
 816
 817         if (substrbufsz == 0) {
 818             substrbufsz = SUBSTRBUFSZ;
 819             substrbuf = austext_malloc (SUBSTRBUFSZ + 8, PROGNAME"680", NULL);
 820         }
 821         endsubstrbuf = substrbuf + substrbufsz;
 822
 823         if (debugging_jpn) {
 824             fprintf (aa_stderr,
 825                 "jpnparser: start text block, substrbufsz=%ld.\n",
 826                 substrbufsz);
 827             fflush (aa_stderr);
 828         }
 829
 830         /* Seed the first substring */
 831         filling_substring = TRUE;
 832         readcount = 0L;
 833         last_jstate = JS_STX;
 834         read_jchar();
 835
 836     } /* endif (parg != NULL) */
 837
 838 FILL_ANOTHER_SUBSTRING:
 839     /* Input text is presumed to contain substrings
 840      * of chars related by their EUC encoding.
 841      * Fill the substring buffer by reading in nonDISCARDable
 842      * multibyte jchars until jstate changes signaling
 843      * end of a substring.
 844      * Note last jchar read, the one that changes the jstate,
 845      * hangs around till we come back to this loop.
 846      */
 847     if (filling_substring) {
 848         if (debugging_jpn) {
 849             if (jstate == JS_DISCARD) {
 850                 fputs ("jpnparser: js=DISCARD:", aa_stderr);
 851                 was_discarding = TRUE;
 852             }
 853             else
 854                 was_discarding = FALSE;
 855         }
 856         while (jstate == JS_DISCARD) {
 857             if (debugging_jpn)
 858                 fprintf (aa_stderr, " %s", jchar);
 859             read_jchar();
 860         }
 861         if (debugging_jpn && was_discarding)
 862             fputc ('\n', aa_stderr);
 863         if (jstate == JS_ETX) {
 864             if (debugging_jpn)
 865                 fputs ("jpnparser: js=ETX\n", aa_stderr);
 866             if (add_msgs) {
 867                 char    msgbuf [DtSrMAXWIDTH_HWORD + 100];
 868                 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 21,
 869                     "%s '%.*s' is not a valid Japanese word.") ,
 870                     PROGNAME"812", DtSrMAXWIDTH_HWORD, save_parg_string);
 871                 DtSearchAddMessage (msgbuf);
 872             }
 873             return NULL;
 874         }
 875
 876         last_jstate = jstate;
 877         substrp = substrbuf;
 878         substr_offset = readcount - jcharlen;
 879
 880         /* Fill the substring buffer.
 881          * Ensure substring buffer is big enough.
 882          */
 883         while (last_jstate == jstate) {
 884             if (endsubstrbuf - substrp < 8) {
 885                 size_t  curlen = substrp - substrbuf;
 886                 if (debugging_jpn) {
 887                     fprintf (aa_stderr,
 888                         "jpnparser: curr substr len %ld, "
 889                         "new substrbufsz %ld.\n",
 890                         curlen, substrbufsz<<1);
 891                     fflush (aa_stderr);
 892                 }
 893                 substrbufsz <<= 1;      /* double its size */
 894                 substrbuf = realloc (substrbuf, substrbufsz);
 895                 endsubstrbuf = substrbuf + substrbufsz;
 896                 substrp = substrbuf + curlen;
 897             }
 898             strncpy ((char*)substrp, (char*)jchar, jcharlen);
 899             substrp += jcharlen;
 900             read_jchar();
 901         }
 902         *substrp = 0;
 903         filling_substring = FALSE;
 904         is_new_substring = TRUE;
 905     }
 906
 907     /* Empty the substring buffer returning each token
 908      * one by one, ie parse and return word tokens from string,
 909      * including possible kanji compounds if switched on.
 910      */
 911     if (token = parse_substring())
 912         return (char *) token;
 913
 914     /* When current substring is empty, go back and fill another one.
 915      * If we're parsing a string (eg hiliting text of a doc),
 916      * parse_substring() will have used readchar_string().
 917      * Since we now want to resume using it to parse the original
 918      * string, we have to reset it's string ptr.
 919      */
 920     filling_substring = TRUE;
 921     if (save_parg_string)
 922         readchar_arg = save_parg_string + readcount;
 923     goto FILL_ANOTHER_SUBSTRING;
 924
 925 } /* jpn_parser() */
 926
 927
 928 /************************************************/
 929 /*                                              */
 930 /*                load_jpntree                  */
 931 /*                                              */
 932 /************************************************/
 933 /* Subroutine of load_jpn_language.  Builds a JPNTREE
 934  * from a file of packed EUC compound words.
 935  * Basically a copy of load_wordtree() in lang.c.
 936  *
 937  * INPUT FILE FORMAT:  One word per line, min 4 bytes (2 jchars),
 938  * all words packed EUC.  Preferred order is frequency of
 939  * occurrence in the corpus to make searches efficient.
 940  * Otherwise the words should at least be in random order or
 941  * an order that will approximate a binary search.
 942  * If first char is ASCII (ie not packed EUC), line is
 943  * ignored as comments.  Any ascii chars after packed EUC,
 944  * such as whitespace and/or subsequent ascii comments,
 945  * delimits word token (ie anything else on the line is ignored).
 946  * "Line" ends in ascii linefeed (\n).
 947  *
 948  * RETURNS 0 if file successfully loaded, returns 1 if file missing,
 949  * returns 2 and messages in global msglist if file has fatal errors.
 950  */
 951 static int      load_jpntree (
 952                     JPNTREE     **treetop,
 953                     char        *fname)
 954 {
 955     int         i;
 956     int         comment_count = 0;
 957     int         node_count = 0;
 958     int         is_duplicate;
 959     long        linecount = 0;
 960     UCHAR       *cptr;
 961     UCHAR       readbuf [256];
 962     char        sprintbuf [_POSIX_PATH_MAX + 1024];
 963     FILE        *fileid;
 964     JPNTREE     *new;
 965     JPNTREE     **this_link;
 966
 967     if (debugging_loadlang | debugging_loadword)
 968         fprintf (aa_stderr, PROGNAME"1071 "
 969             "load_jpntree: fname='%s'\n", NULLORSTR(fname));
 970
 971     if ((fileid = fopen (fname, "rt")) == NULL) {
 972         /* Not being able to find the file is not an error.
 973          * We indicate that with the return code.
 974          * But any other error (like permissions) is fatal.
 975          */
 976         if (errno == ENOENT) {
 977             if (debugging_loadlang | debugging_loadword)
 978                 fputs ("  ...file not found.\n", aa_stderr);
 979             return 1;
 980         }
 981         else {
 982             sprintf (sprintbuf,
 983                 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
 984                 PROGNAME"362", fname, strerror(errno));
 985             DtSearchAddMessage (sprintbuf);
 986             return 2;
 987         }
 988     }
 989
 990     /*--------- Main Read Loop ----------*/
 991     while (fgets ((char*)readbuf, sizeof(readbuf), fileid) != NULL) {
 992         linecount++;
 993         /*
 994          * Ignore lines beginning with any ascii char (comments).
 995          * Otherwise first or only packed EUC token on line
 996          * is the desired word.
 997          */
 998         if (readbuf[0] < 0x80) {
 999             comment_count++;
1000             continue;
1001         }
1002         for (cptr = readbuf;  *cptr >= 0x80;  cptr++)
1003             ;
1004         *cptr = 0;
1005         if (debugging_loadword) {
1006             fprintf (aa_stderr, "  JPNWORD: '%s' %n", readbuf, &i);
1007             while (i++ < 28)
1008                 fputc (' ', aa_stderr);
1009         }
1010
1011         /* Test for word too short */
1012         if (strlen((char*)readbuf) < 4) {
1013             sprintf (sprintbuf, catgets(dtsearch_catd, MS_lang, 23,
1014                 "%s Word '%s' on line %ld is too short.") ,
1015                 PROGNAME"1074", readbuf, linecount);
1016             DtSearchAddMessage (sprintbuf);
1017             continue;
1018         }
1019
1020         /* Allocate and populate a new node */
1021         i = strlen ((char*) readbuf);
1022         new = austext_malloc (sizeof(JPNTREE) + i + 4,
1023             PROGNAME"104", NULL);
1024         new->llink = NULL;
1025         new->rlink = NULL;
1026         new->next = NULL;
1027         new->len = i;
1028         new->word = (void *) (new + 1);
1029         strcpy (new->word, (char *) readbuf);
1030
1031         /* Search binary tree, comparing only first 4 bytes */
1032         is_duplicate = FALSE;
1033         for (this_link = treetop;  *this_link != NULL;  ) {
1034             i = strncmp (new->word, (*this_link)->word, 4);
1035
1036             if (i == 0) {
1037                 /* If first 4 bytes are similar, search
1038                  * linked list, comparing entire string.
1039                  */
1040                 while (*this_link != NULL) {
1041                     i = strcmp (new->word, (*this_link)->word);
1042
1043                     /* Test for duplicate word */
1044                     if (i == 0) {
1045                         sprintf (sprintbuf,
1046                             catgets (dtsearch_catd, MS_misc, 423,
1047                             "%s Word '%s' in '%s' is a duplicate."),
1048                             PROGNAME"423", readbuf, fname);
1049                         DtSearchAddMessage (sprintbuf);
1050                         /* duplicates aren't fatal, just ignore the word */
1051                         is_duplicate = TRUE;
1052                         break;  /* discontinue list search */
1053                     }
1054                     if (debugging_loadword)
1055                         fputc('-', aa_stderr);
1056                     this_link =  &(*this_link)->next;
1057                 } /* end linked list search */
1058
1059                 break;          /* discontinue tree search */
1060             } /* endif where first 4 bytes matched at a tree node */
1061
1062             /* First 4 bytes dissimilar.  Descend tree
1063              * to find next possible insertion point.
1064              */
1065             if (debugging_loadword)
1066                 fputc(((i < 0)? 'L' : 'R'), aa_stderr);
1067             this_link = (JPNTREE **) ((i < 0) ?
1068                 &(*this_link)->llink : &(*this_link)->rlink);
1069         } /* end binary tree search */
1070
1071         /* Don't link anything if error found while descending tree */
1072         if (is_duplicate) {
1073             if (debugging_loadword)
1074                 fputs (" duplicate!\n", aa_stderr);
1075             free (new);
1076             continue;
1077         }
1078
1079         /* Insert new node at current location in tree */
1080         *this_link = new;
1081         if (debugging_loadword)
1082             fputs(".\n", aa_stderr);
1083         node_count++;
1084     }   /* end of read loop */
1085
1086     fclose (fileid);
1087
1088     if (node_count <= 0) {
1089         if (debugging_loadlang | debugging_loadword)
1090             fprintf (aa_stderr,
1091                 PROGNAME"1185 load '%s' unsuccessful, %d comments discarded.\n",
1092                 fname, comment_count);
1093         sprintf (sprintbuf, catgets(dtsearch_catd, MS_lang, 24,
1094             "%s No Japanese words in word file '%s'.") ,
1095             PROGNAME"1186", fname);
1096         DtSearchAddMessage (sprintbuf);
1097         return 2;
1098     }
1099     else {
1100         if (debugging_loadlang | debugging_loadword)
1101             fprintf (aa_stderr,
1102                 PROGNAME"1193 load word file '%s' successful, %d words.\n",
1103                 fname, node_count);
1104         return 0;
1105     }
1106 }  /* load_jpntree() */
1107
1108
1109 /************************************************/
1110 /*                                              */
1111 /*              load_jpn_language               */
1112 /*                                              */
1113 /************************************************/
1114 /* Loads a dblk with japanese (DtSrLaJPN, DtSrLaJPN2)
1115  * structures and function pointers.
1116  * Called from load_language(), with identical input and output.
1117  * Does not reload structures previously loaded in
1118  * other jpn dblks on dblist if derived from identical files.
1119  * But always loads structures if passed dblist is NULL.
1120  * Presumes dblk already partially initialized:
1121  *      name, path, language, flags.
1122  * Returns TRUE if all successful.  Otherwise
1123  * returns FALSE with err msgs on ausapi_msglist.
1124  */
1125 int      load_jpn_language (DBLK *dblk, DBLK *dblist)
1126 {
1127     extern int  ascii_charmap[];        /* in lang.c */
1128     int         i;
1129     int         errcount = 0;
1130     JPNBLK      *jpnblk;
1131     char        fname [_POSIX_PATH_MAX + 4];
1132     char        path [_POSIX_PATH_MAX + 4];
1133     char        msgbuf [_POSIX_PATH_MAX + 128];
1134
1135     dblk->charmap =     ascii_charmap;  /* for teskey */
1136     dblk->parser =      jpn_parser;
1137     dblk->lstrupr =     null_lstrupr;
1138     dblk->stemmer =     null_stemmer;
1139     if (dblk->dbrec.or_maxwordsz == 0)  /* for teskey */
1140         dblk->dbrec.or_maxwordsz = MAXWIDTH_SWORD - 1;
1141     if (dblk->dbrec.or_minwordsz == 0)  /* for teskey */
1142         dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
1143     jpnblk = austext_malloc (sizeof(JPNBLK) + 4, PROGNAME"2107", NULL);
1144     memset (jpnblk, 0, sizeof(JPNBLK));
1145     dblk->parse_extra = (void *) jpnblk;
1146
1147     /* Load optional katakana and kanji word lists.
1148      * If specific dblk version not found,
1149      * try the default language version.
1150      * If either has load errors, return a failure.
1151      * If both are missing, just forget it.
1152      */
1153     if (dblk->path == NULL)
1154         path[0] = 0;
1155     else {
1156         if (strlen (dblk->path) > _POSIX_PATH_MAX - 14) {
1157             sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 25,
1158                 "%s Database '%s' path too long: '%s'.") ,
1159                 PROGNAME"759", dblk->name, dblk->path);
1160             DtSearchAddMessage (msgbuf);
1161             return FALSE;
1162         }
1163         strcpy (path, dblk->path);
1164         ensure_end_slash (path);
1165     }
1166
1167 #ifdef NO_KATAKANA_TREES_YET
1168     /* Load katakana wordtree */
1169     strcpy (fname, path);
1170     strcat (fname, dblk->name);
1171     strcat (fname, EXT_KATAKANA);
1172     i = load_jpntree (&jpnblk->katatree, fname);
1173     if (i == 1) {       /* ...db specific file not found */
1174         if (jpn_katatree == NULL) {     /* load default... */
1175             strcpy (fname, path);
1176             strcat (fname, "jpn");
1177             strcat (fname, EXT_KATAKANA);
1178             i = load_jpntree (&jpn_katatree, fname);
1179         }
1180         else            /* default already loaded */
1181             i == 0;
1182         jpnblk->katatree = jpn_katatree;
1183     }
1184     if (i > 1)
1185         errcount++;
1186 #endif /* NO_KATAKANA_TREES_YET */
1187
1188     /* Load kanji wordtree only if kanji compounds are derived
1189      * from list in file, ie for language DtSrLaJPN2 only.
1190      * If database specific list not found,
1191      * use language generic list.  If language generic
1192      * list also not found, ignore compounding.
1193      * Only one language generic list will
1194      * be loaded, at jpn_kanjitree.
1195      */
1196     if (dblk->dbrec.or_language == DtSrLaJPN2) {
1197         strcpy (fname, path);
1198         strcat (fname, dblk->name);
1199         strcat (fname, EXT_KANJI);
1200         i = load_jpntree (&jpnblk->kanjitree, fname);
1201         if (i == 1) {   /* ...db specific file not found */
1202             /* If the generic knj file (jpn.knj) was
1203              * never loaded, try loading it now.
1204              */
1205             if (jpn_kanjitree == NULL) {
1206                 strcpy (fname, path);
1207                 strcat (fname, "jpn");
1208                 strcat (fname, EXT_KANJI);
1209                 load_jpntree (&jpn_kanjitree, fname);
1210                 /* (it either worked or it didn't) */
1211             }
1212             /* Whether generic load successful or not,
1213              * try to use it (eg it might still be NULL).
1214              */
1215             jpnblk->kanjitree = jpn_kanjitree;
1216         }
1217         if (i > 1)  /* error trying to open db specific file */
1218             errcount++;
1219     }
1220
1221     return (errcount > 0)? FALSE : TRUE;
1222
1223 } /* load_jpn_language() */
1224
1225
1226 /************************************************/
1227 /*                                              */
1228 /*                 free_jpntree                 */
1229 /*                                              */
1230 /************************************************/
1231 /* Identical to free_wordtree() in lang.c
1232  * (link inversion traversal, from Data Structure Techniques,
1233  * Thomas A. Standish, Algorithm 3.6),
1234  * except post order visit includes freeing
1235  * linked list at each tree node.
1236  */
/* Frees every node reachable from *jpntree_head: each tree node
 * together with the list chained off its 'next' link.  Sets
 * *jpntree_head to NULL when done.  Does nothing if the tree
 * is already empty.
 *
 * The traversal uses no stack and no recursion.  While descending,
 * the link just followed (llink or rlink) is overwritten with a
 * pointer to the parent, and the node's 'word' field is overwritten
 * with a tag recording which link that was (0 = left, 1 = right).
 * Clobbering 'word' is harmless here because the node is about to
 * be freed; load_jpntree() stores the word text inside the node's
 * own allocation, so there is nothing separate to release.
 */
static void     free_jpntree (JPNTREE ** jpntree_head)
{
    JPNTREE     *next, *prev, *pres;    /* child, parent, current node */
    JPNTREE     *listp, *next_listp;    /* cursors for 'next' list */

    if (*jpntree_head == NULL)
        return;
    pres = *jpntree_head;
    prev = NULL;                /* root has no parent */

DESCEND_LEFT:
    pres->word = (void *) 0;    /* preorder visit:  TAG = 0 */
    next = pres->llink;
    if (next != NULL) {
        pres->llink = prev;     /* llink now points back up to parent */
        prev = pres;
        pres = next;
        goto DESCEND_LEFT;
    }
DESCEND_RIGHT:
     next = pres->rlink;
    if (next != NULL) {
        pres->word = (void *) 1;        /* TAG = 1 */
        pres->rlink = prev;     /* rlink now points back up to parent */
        prev = pres;
        pres = next;
        goto DESCEND_LEFT;
    }
POSTORDER_VISIT:
    /* Both subtrees of pres are done (or absent).  Free pres and
     * every node on its 'next' list.  pres itself is the first
     * node freed by this loop, so it must not be dereferenced below.
     */
    listp = pres;
    while (listp->next) {
        next_listp = listp->next;       /* save before freeing */
        free (listp);
        listp = next_listp;
    }
    free (listp);

    if (prev == NULL) {                 /* end of algorithm? */
        *jpntree_head = NULL;
        return;
    }
    /* Climb to the parent.  Its tag says which of its links holds
     * the pointer to the grandparent.
     */
    if (prev->word == (void *) 0) {     /* go up left leg */
        next = prev->llink;             /* grandparent */
        pres = prev;
        prev = next;
        goto DESCEND_RIGHT;             /* left done, now try right */
    }
    else {                              /* go up right leg */
        next = prev->rlink;             /* grandparent */
        prev->word = (void *) 0;        /* restore TAG = 0 */
        pres = prev;
        prev = next;
        goto POSTORDER_VISIT;           /* both sides done, free it */
    }
}  /* free_jpntree() */
1292
1293
1294 /************************************************/
1295 /*                                              */
1296 /*             unload_jpn_language              */
1297 /*                                              */
1298 /************************************************/
1299 /* Frees storage for structures allocated by load_jpn_language().
1300  * Called when engine REINITs due to change in site config file
1301  * or databases.
1302  * The global jpntrees are not currently unloaded because they
1303  * are presumed valid for the duration of the engine session.
1304  * Currently there are no teskey trees (inclist, stoplist) to free.
1305  */
1306 void    unload_jpn_language (DBLK *dblk)
1307 {
1308     /* free jpnblk and any database-associated jpntrees */
1309     if (dblk->parse_extra) {
1310         JPNBLK  *jpnblk = (JPNBLK *) dblk->parse_extra;
1311         if (jpnblk->katatree  &&  jpnblk->katatree != jpn_katatree)
1312             free_jpntree (&jpnblk->katatree);
1313         if (jpnblk->kanjitree &&  jpnblk->kanjitree != jpn_kanjitree)
1314             free_jpntree (&jpnblk->kanjitree);
1315         free (jpnblk);
1316         dblk->parse_extra = NULL;
1317     }
1318     return;
1319 } /* unload_jpn_language() */
1320
1321 /******************** JPN.C ********************/
1322