2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these libraries and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
24 * COMPONENT_NAME: austext
26 * FUNCTIONS: display_jstate
38 * (C) COPYRIGHT International Business Machines Corp. 1995,1996
40 * Licensed Materials - Property of IBM
41 * US Government Users Restricted Rights - Use, duplication or
42 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
44 /******************** JPN.C ********************
45 * $TOG: jpn.c /main/7 1999/10/14 14:11:33 mgreess $
47 * Includes functions and data for parsing Japanese,
48 * supported languages DtSrLaJPN and DtSrLaJPN2.
49 * Currently only supports EUC packed format,
50 * but should be easily extendable to Shift-JIS.
51 * JIS can be supported if half-width katakana are excluded
52 * (no SI or SO chars to conflict with the ^O stemming char,
53 * and engine must decide never to balk at ESCape sequences).
54 * Will not support Unicode or other fixed width, n-wide
55 * encodings that would conflict with ascii in either byte.
56 * Does not require wide char or multibyte char functions.
57 * There is no Japanese stemmer(), ie standard null_stemmer() is used.
59 * Code Set 0 can be either 7-bit ASCII or 7-bit JIS-Roman.
60 * The parser() for ASCII is the full teskey_parser()
61 * used for European languages with an ascii char set.
62 * Min/max word size, stoplists, and include lists may be
63 * used if provided, as in European languages.
65 * Code Set 1 is JIS X 0208-1990.
66 * Symbols and line drawing elements are not indexed.
67 * Hiragana strings are discarded as equivalent to stoplist words.
68 * Contiguous strings of katakana, Roman, Greek, or cyrillic
69 * are parsed as single words.
71 * Individual kanji chars are parsed as single words.
72 * In addition, for language DtSrLaJPN, all kanji compounds
73 * (pairs, triplets, etc) found in any contiguous string of
74 * kanjis will be parsed up to a maximum word size
75 * defined in MAX_KANJI_CLEN (see caveat below).
76 * For language DtSrLaJPN2, only kanji substrings listed
77 * in a .knj file are parsed as additional compound words.
78 * Characters from unassigned kuten rows are presumed to be
79 * user-defined kanji and are parsed as such.
81 * Code Set 2 is 1/2 width katakana.
82 * Contiguous strings are parsed as single words.
84 * Code Set 3 is JIS X 0212-1990.
85 * Parsing is similar to Code Set 1: discard symbols, etc,
86 * contiguous strings of related foreign characters equal words,
87 * and individual kanji and unassigned chars equal single words,
88 * with additional kanji compounding depending on language.
89 * Row 5 has 4 new katakana (not yet officially approved)
90 * so it is treated here as katakana.
93 * Revision 2.8 1996/04/10 20:24:33 miker
94 * Fixed bug in kanji tree loader.
96 * Revision 2.7 1996/03/25 18:55:15 miker
97 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
99 * Revision 2.6 1996/03/13 22:57:40 miker
100 * Added prolog. Changed char to UCHAR several places.
102 * Revision 2.5 1996/03/05 16:09:58 miker
103 * Made jchar array of unsigned chars for compat with Sun compilers.
104 * Added test of PA_MSGS for yacc-based boolean queries.
106 * Revision 2.4 1996/02/01 19:08:10 miker
107 * AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
108 * Made optional power series kanji compounding (KANJI_COMPOUNDS)
109 * into a new DtSrLaJPN language. Old version now DtSrLaJPN2.
111 * Revision 2.3 1995/12/01 16:20:17 miker
112 * Changed read_jchar arg to unsigned to fix Solaris bug.
114 * Revision 2.2 1995/10/26 15:08:31 miker
117 * Revision 2.1 1995/09/22 20:57:13 miker
118 * Freeze DtSearch 0.1, AusText 2.1.8
120 * Revision 1.1 1995/09/19 21:24:57 miker
129 #include <sys/stat.h>
131 #define PROGNAME "JPN"
132 #define SS2_CHAR 0x8E /* Single Shift char for Code Set 2 */
133 #define SS3_CHAR 0x8F /* Single Shift char for Code Set 3 */
134 #define EXT_KATAKANA ".ktk"
135 #define EXT_KANJI ".knj"
136 #define SUBSTRBUFSZ 100
140 /* In addition to single kanji chars parsed as individual words,
141 * Language DtSrLaJPN will also blindly consider all contiguous kanji
142 * substrings up to MAX_KANJI_CLEN as separate compound words.
143 * For example if MAX_KANJI_CLEN were 3, the 4 kanjis "ABCD"
144 * would parse as "A B C D AB BC CD ABC BCD".
145 * For a string of n contiguous kanjis the number of parsed words is
146 * the sum of (n - k + 1) for k = 1 .. min(n, MAX_KANJI_CLEN).
147 * This can be very wasteful of indexing time and file space.
148 * The alternative is language DtSrLaJPN2 which only considers
149 * strings listed in jpn.knj as valid kanji compounds.
150 * The kanji compounds in jpn.knj are the statistically significant
151 * kanji substrings found in a large corpus of natural language Japanese.
153 #define MAX_KANJI_CLEN 6
155 /************************************************/
159 /************************************************/
160 /* EUC text substring types.
161 * Used to switch states in parser's automaton.
162 * Coded as bit positions for efficient boolean comparisons.
164 #define JS_STX 0x0001 /* Start of text blk, initial state */
165 #define JS_KANJI 0x0002 /* Set 1, Set 3 */
166 #define JS_KATAKANA 0x0004 /* Set 1, Set 3 (row 5) */
167 #define JS_ASCII 0x0008 /* Set 0 */
168 #define JS_ROMAN 0x0010 /* Set 1 */
169 #define JS_GREEK 0x0020 /* Set 1, Set 3 */
170 #define JS_CYRILLIC 0x0040 /* Set 1, Set 3 */
171 #define JS_ALPHA 0x0080 /* Set 3 */
172 #define JS_HALFKATA 0x0100 /* Set 2 */
173 #define JS_DISCARD 0x0200 /* Set 1, Set 3, any char not in EUC */
174 #define JS_ETX 0x0300 /* End of text block. NOTE(review): not a single bit (equals JS_HALFKATA | JS_DISCARD), so compare with ==, never test with & */
175 #define JS_ALPHA_COMPATIBLE (JS_ROMAN | JS_GREEK | JS_CYRILLIC) /* jstates that a JS_ALPHA char may merge with, see read_jchar() */
177 /************************************************/
181 /************************************************/
182 /* Similar to standard binary WORDTREE.
183 * Each tree node distinguished by first 4 bytes
184 * (usually 2 jchars), which is minimum compound word size.
185 * All compounds beginning with those 4 bytes are chained
186 * in a linked list off of that node.
188 typedef struct _jpntree_tag {
189 struct _jpntree_tag *rlink; /* ptr to right binary node */
190 struct _jpntree_tag *llink; /* ptr to left binary node */
191 struct _jpntree_tag *next; /* ptr to next compound in linked list */
192 int len; /* length of word in bytes */
196 /************************************************/
200 /************************************************/
206 /************************************************/
210 /************************************************/
211 int debugging_jpn = FALSE;
212 extern int debugging_loadlang;
213 extern int debugging_loadword;
215 /* Used in jpn_parser() and parse_substring(). Made global for speed. */
216 static int do_compounding = FALSE;
217 static int is_new_substring = TRUE;
218 static int jstate, last_jstate;
219 static UCHAR jchar [8];
220 static int jcharlen = 0;
221 static DBLK *jpn_dblk;
222 static JPNTREE *jpn_kanjitree = NULL;
223 static JPNTREE *jpn_katatree = NULL;
224 static JPNTREE *kanjitree = NULL;
226 static long *offsetp;
227 static long readcount = 0;
228 static READCFP readchar;
229 static void *readchar_arg;
230 static UCHAR *outbuf = NULL;
231 static UCHAR *save_parg_string = NULL;
232 static UCHAR *substrbuf = NULL;
233 static long substr_offset;
236 /************************************************/
240 /************************************************/
241 /* Maps a JS_* jstate code to a static printable name, for debugging and error msgs; any other value yields "(UNKNOWN)". */
242 static char *display_jstate (int js)
245 case JS_KANJI: return "KANJI";
246 case JS_KATAKANA: return "KATAKANA";
247 case JS_DISCARD: return "DISCARD";
248 case JS_ROMAN: return "ROMAN";
249 case JS_ASCII: return "ASCII";
250 case JS_ALPHA: return "ALPHA";
251 case JS_ETX: return "ETX";
252 case JS_STX: return "STX";
253 case JS_GREEK: return "GREEK";
254 case JS_CYRILLIC: return "CYRILLIC";
255 case JS_HALFKATA: return "HALFKATA";
256 default: return "(UNKNOWN)";
258 } /* display_jstate() */
261 /************************************************/
265 /************************************************/
266 /* Subroutine of jpn_parser().
267 * Using global character reading 'readchar' cofunction,
268 * returns (1) next multibyte Japanese character in global jchar,
269 * (2) length of jchar in global jcharlen, and
270 * (3) next state of state machine in global jstate.
271 * Function itself returns jstate.
272 * Rows in the KUTEN tables which are officially 'unassigned'
273 * are treated as user-defined kanji, so all jstates
274 * are presumed JS_KANJI except those specifically marked
275 * otherwise at the beginning of each array below.
277 static int read_jchar (void)
279 /* Jstates table for EUC Set 1 (JIS 0208) */
280 static int jstates_set1 [] = {
281 JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
282 JS_ROMAN, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
283 JS_GREEK, JS_CYRILLIC, JS_DISCARD /* 6 - 8 */
286 /* Jstates table for EUC Set 3 (JIS 0212).
287 * Row 5 is presumed to be katakana because
288 * of four new unapproved katakana characters.
290 static int jstates_set3 [] = {
291 JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
292 JS_DISCARD, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
293 JS_GREEK, JS_CYRILLIC, JS_DISCARD, /* 6 - 8 */
294 JS_ALPHA, JS_ALPHA, JS_ALPHA /* 9 - 11 */
298 jchar[0] = readchar (readchar_arg);
302 jchar[0] = readchar (NULL);
304 return (jstate = JS_ETX);
307 /* Set 1 (JIS 0208) */
308 if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) {
313 jstate = jstates_set1 [(jchar[0] & 0x7F) - 32];
314 if (jchar[1] = readchar (NULL))
322 if (jchar[0] < 0x80) {
324 return (jstate = JS_ASCII);
327 /* Set 3 (JIS 0212) */
328 if (jchar[0] == SS3_CHAR) {
331 * Hop over the single shift char to get the first JIS byte.
332 * Make sure first JIS byte is in proper
333 * range to avoid indexing outside of table.
335 if ((jchar[1] = readchar (NULL)) == 0)
336 return (jstate = JS_ETX);
339 return (jstate = JS_DISCARD);
343 jstate = jstates_set3 [(jchar[1] & 0x7F) - 32]; /* row index comes from the first JIS byte; jchar[0] is SS3_CHAR here and would index below the table */
345 if ((jchar[2] = readchar (NULL)) == 0)
346 return (jstate = JS_ETX);
348 /* JS_ALPHA chars ('miscellaneous alphabetic chars' of
349 * rows 9 - 11) are compatible with several other jstates,
350 * so adjust as necessary.
352 if (jstate == JS_ALPHA &&
353 ((last_jstate & JS_ALPHA_COMPATIBLE) != 0))
354 jstate = last_jstate;
355 else if (last_jstate == JS_ALPHA &&
356 ((jstate & JS_ALPHA_COMPATIBLE) != 0))
357 last_jstate = jstate;
361 /* Set 2 (half-width katakana) */
362 if (jchar[0] == SS2_CHAR) {
364 jstate = JS_HALFKATA;
365 if (jchar[1] = readchar (NULL))
372 /* If first jchar doesn't match expected EUC coding,
373 * discard it until we get back into sync.
376 return (jstate = JS_DISCARD);
380 /************************************************/
382 /* kanji_compounder */
384 /************************************************/
385 /* Subroutine of parse_substring() of jpn_parser().
386 * Used only for language DtSrLaJPN (power series compounding).
387 * Called repeatedly when the substring is a sequence of kanji chars.
388 * For each call writes to outbuf and returns a single kanji
389 * compound word, using every possible compound in the substring
390 * from length 1 to length MAX_KANJI_CLEN.
391 * Updates offsetp for each word returned.
392 * Returns NULL when substring exhausted. First call for
393 * a new substring indicated by global is_new_substring.
396 static UCHAR *kanji_compounder (void)
398 static int all_done = TRUE;
399 static int clen = MAX_KANJI_CLEN + 1;
400 static UCHAR *mysubstrp = NULL;
401 static UCHAR *mysubstrend = NULL;
402 static UCHAR *op, *ss;
405 if (is_new_substring) {
406 is_new_substring = FALSE;
409 mysubstrp = substrbuf;
410 mysubstrend = substrbuf + strlen ((char*)substrbuf);
413 /* Advance compound length by 1.
414 * If max compound length exceeded, reset it
415 * to 1 and increment substring pointer by 1 jchar.
420 if (++clen > MAX_KANJI_CLEN) {
422 mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
426 /* Assemble one word into outbuf, of length clen,
427 * beginning at current substring ptr.
428 * If there aren't enough jchars left in string,
429 * reset clen to 1 and advance substrp by 1 jchar.
430 * We're all done when substring exhausted.
432 while (mysubstrp < mysubstrend) {
435 for (i = 0; i < clen; i++) {
436 /* Are there enough jchars left in substring? */
437 if (ss >= mysubstrend) {
439 mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
440 i = 0; /* indicates assembly failure */
441 break; /* breaks the for loop */
443 /* Assemble one jchar into outbuf */
449 /* Did word assembly succeed? */
453 *offsetp = substr_offset + (mysubstrp - substrbuf);
456 "knjcompdr: subofs=%2ld totofs=%3ld \"%s\"\n",
457 mysubstrp - substrbuf, *offsetp, outbuf);
464 } /* kanji_compounder() */
467 /************************************************/
469 /* search_kanjitree */
471 /************************************************/
472 /* Subroutine of parse_substring() of jpn_parser().
473 * Used only for language DtSrLaJPN2; DtSrLaJPN calls
474 * kanji_compounder() to generate compounds algorithmically.
475 * First call for a new substring of kanjis is indicated
476 * when is_new_substring is TRUE. Each call, then and thereafter,
477 * returns a token (1) for each individual kanji char in string,
478 * and (2) for each sequence of kanjis found in the kanji
479 * compounds JPNTREE which begins with each char in string.
480 * Also returns offset of returned token in offsetp.
481 * Returns NULL when string is exhausted.
482 * Variables are static for speeeeed.
484 static UCHAR *search_kanjitree (void)
486 static int all_done = TRUE;
487 static JPNTREE *node, *last_node;
488 static UCHAR *substrp, *substrend;
489 static int direction;
493 if (is_new_substring) {
494 is_new_substring = FALSE;
496 substrend = substrbuf + strlen ((char*)substrbuf);
499 /* Return first substr jchar as next token */
500 last_node = NULL; /* NULL = tree not searched yet */
501 jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
502 strncpy ((char*)outbuf, (char*)substrp, jcharlen);
503 outbuf [jcharlen] = 0;
505 *offsetp = substr_offset;
511 /* If not enough chars left in substring to search tree,
512 * treat it as an exhausted tree search. In other words,
513 * reset tree search, increment to next jchar, and return it.
515 if (strlen ((char*)substrp) < 4) {
517 fputs ("knjtree: ...remaining substring too short", aa_stderr);
520 fputs (".\n", aa_stderr);
523 if (substrp >= substrend) {
527 jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
528 strncpy ((char*)outbuf, (char*)substrp, jcharlen);
529 outbuf [jcharlen] = 0;
531 *offsetp = substr_offset + (substrp - substrbuf);
535 /* If last call resulted in a tree hit, the node was saved.
536 * Continue the linked list search directly from the last hit.
539 last_node = last_node->next;
541 fputs ("knjtree: ...continue tree search: ", aa_stderr);
543 for (node = last_node; node; node = node->next) {
544 if ((strncmp ((char*)substrp, node->word, node->len)) == 0) {
545 /* HIT on linked list search */
547 strcpy ((char*)outbuf, node->word);
549 fprintf (aa_stderr, "* '%s'\n", outbuf);
551 *offsetp = substr_offset + (substrp - substrbuf);
554 else if (debugging_jpn)
555 fputc ('-', aa_stderr);
560 /* Start new binary tree search at curr jchar.
561 * If hit, commence linked list search.
565 "knjtree: \"%.4s...\" ", substrp);
566 for (node = kanjitree; node != NULL; ) {
567 if ((direction = strncmp ((char*)substrp, node->word, 4)) == 0) {
568 /* HIT on binary search */
570 goto LINKED_LIST_SEARCH;
572 /* Descend left or right depending on word */
574 fputc ((direction < 0) ? 'L' : 'R', aa_stderr);
581 /* No match on first 4 bytes of substrp in binary tree.
582 * Tree exhausted without a hit, so increment to next
583 * jchar in substring and return it as a word.
586 } /* search_kanjitree() */
589 /************************************************/
591 /* parse_substring */
593 /************************************************/
594 /* Subroutine of jpn_parser().
595 * Returns next Japanese multibyte word token from current
596 * substring of jchars, or NULL when out of tokens.
597 * Returned token is valid until next call.
598 * Static args initialized at first call for a new substring.
599 * Provides optional kanji compounding depending on PA_ flags.
600 * We usually compound at index time (dtsrindex) or when query
601 * is Query-By-Example (statistical searches), and usually don't
602 * compound boolean queries.
604 static UCHAR *parse_substring (void)
606 static int is_substr_end = TRUE;
607 static int substrlen = 0;
610 static long myoffset;
612 if (is_new_substring) {
613 substrlen = strlen ((char*)substrbuf);
615 /* A very common ascii substring is the final line-feed
616 * at the end of a line of text--discard it now.
618 if (last_jstate == JS_ASCII
620 && substrbuf[0] == '\n') {
621 is_substr_end = TRUE;
622 is_new_substring = FALSE;
626 is_substr_end = FALSE;
629 outbuf = austext_malloc (DtSrMAXWIDTH_HWORD + 8,
630 PROGNAME"807", NULL);
634 fprintf (aa_stderr, "jpnsubstr: js=%s len=%ld str='",
635 display_jstate(last_jstate), (long) substrlen); /* substrlen is int; cast so the argument matches %ld */
636 for (i = 0; i < substrlen; i++)
637 fputc ((substrbuf[i] < 32)? '~' : substrbuf[i],
639 fprintf (aa_stderr, "'\n");
640 if (last_jstate == JS_ROMAN) {
641 fprintf (aa_stderr, " (ascii equiv: '");
642 for (i = 1; i < substrlen; i+=2)
643 fputc ((substrbuf[i] & 0x7f) + 32, aa_stderr);
644 fprintf (aa_stderr, "')\n");
649 } /* endif is_new_substring */
654 switch (last_jstate) {
657 /* Ignore discardable substrings */
658 is_new_substring = FALSE;
659 is_substr_end = TRUE;
668 /* Treat entire substring as single parsed word */
669 ENTIRE_SUBSTR_IS_WORD:
671 fputs (" token is entire substring.\n", aa_stderr);
672 strncpy ((char*)outbuf, (char*)substrbuf, DtSrMAXWIDTH_HWORD);
673 outbuf [DtSrMAXWIDTH_HWORD - 1] = 0;
674 is_new_substring = FALSE;
675 is_substr_end = TRUE;
677 *offsetp = substr_offset;
681 /* Call the full teskey_parser() for European languages.
682 * Includes stoplist and include list processing.
684 if (is_new_substring) {
685 is_new_substring = FALSE;
687 fputs (" calling teskey parser.\n", aa_stderr);
688 myparg.dblk = jpn_dblk;
689 myparg.string = substrbuf;
691 myparg.offsetp = &myoffset;
692 token = (UCHAR *) teskey_parser (&myparg);
695 token = (UCHAR *) teskey_parser (NULL);
698 *offsetp = substr_offset + myoffset;
701 is_substr_end = TRUE;
705 /* If not compounding, treat entire substring
706 * as one query word, ie a single compound kanji word.
707 * If compounding, each individual kanji in the
708 * substring is returned as a word by itself.
709 * Each kanji can be 2 or 3 bytes depending on
710 * which code set it came from. In addition,
711 * sequences of 2 or more kanjis ('compound kanji
712 * words') are returned as individual words.
713 * Method of kanji compounding depends on language:
714 * DtSrLaJPN does "power series" kanji compounding,
715 * DtSrLaJPN2 looks up kanji compounds in a word tree.
716 * Both functions test and reset is_new_substring,
717 * update offsetp as necessary, and return either NULL
718 * or a pointer to outbuf containing a valid token.
721 goto ENTIRE_SUBSTR_IS_WORD;
722 token = (language == DtSrLaJPN)?
723 kanji_compounder() : search_kanjitree();
725 is_substr_end = TRUE;
731 } /* end state switch */
733 /* Should never get here... */
734 fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 20,
735 "%s Program Error: Unknown jstate %d.\n") ,
736 PROGNAME"246", last_jstate);
738 } /* parse_substring() */
741 /************************************************/
745 /************************************************/
746 /* Returns next word token from text stream of packed EUC
747 * Japanese text, languages DtSrLaJPN and DtSrLaJPN2.
748 * Called from (1) dtsrindex, where readchar_ftext() cofunction
749 * reads the .fzk file document 'stream', or (2) search engine
750 * query parsers, where readchar_string() cofunction 'reads'
751 * from the query string.
753 * First call passes args in PARG block. This resets end of
754 * text block (ETX) flag, resets 'offset' counter to zero, etc.
755 * Subsequent calls should pass NULL, and parser returns
756 * next token in block, until reader cofunction reads ETX
757 and returns special ETX char ('\0'). Subsequent call to parser
758 * returns NULL meaning "no tokens left in current stream".
759 * Reader cofunction tolerates repeated calls after
760 * the first ETX, still returning '\0'.
762 * This parser presumes all incoming text is packed EUC multibyte
763 * Japanese chars as described above, but is otherwise unformatted.
764 * Since parser accesses streams a multibyte char at a time,
765 * it does not require periodic line feeds, etc.
767 * To control kanji compounding, caller should set a PA_ switch
768 * in parg.flags as desired before call. Compounding is done
769 * when indexing (dtsrindex) or for hiliting (comparing previous
770 * search results against all possible words in document text).
771 * But in a Query by Example (stat searches), parser might also
772 * be asked to generate compound words. In boolean queries
773 * (stems and exact words), parser should not generate compounds
774 * because if user enters a compound string, he probably only wants
775 * documents containing that exact token.
777 * Parser also returns offset information: number of bytes
778 * since beginning of text block. The returned offsets are
779 * NOT NECESSARILY IN ASCENDING ORDER due to kanji compounding.
781 * Variables are static or global for speeeeeeed.
783 * OUTPUT FORMAT: NULL or a static C string containing a
784 * single parsed word token.
785 * The text in the buffer is valid until the next call.
786 * Each word is translated as described above.
788 char *jpn_parser (PARG *parg)
790 static int filling_substring = TRUE;
791 static int was_discarding = FALSE;
792 static int add_msgs = FALSE;
793 static UCHAR *endsubstrbuf = NULL;
794 static size_t substrbufsz = 0;
796 static UCHAR *substrp;
798 /* If first call for new text block... */
800 jpn_dblk = parg->dblk;
801 language = jpn_dblk->dbrec.or_language;
802 kanjitree = ((JPNBLK *)(jpn_dblk->parse_extra))->kanjitree;
803 offsetp = parg->offsetp;
804 do_compounding = (parg->flags & (PA_HILITING | PA_INDEXING));
805 add_msgs = (parg->flags & PA_MSGS);
806 if (parg->string) { /* text is query str from search engine */
807 save_parg_string = parg->string;
808 readchar_arg = parg->string;
809 readchar = (READCFP) readchar_string;
811 else { /* text is from .fzk file in dtsrindex */
812 save_parg_string = NULL;
814 readchar = (READCFP) readchar_ftext;
817 if (substrbufsz == 0) {
818 substrbufsz = SUBSTRBUFSZ;
819 substrbuf = austext_malloc (SUBSTRBUFSZ + 8, PROGNAME"680", NULL);
821 endsubstrbuf = substrbuf + substrbufsz;
825 "jpnparser: start text block, substrbufsz=%ld.\n",
830 /* Seed the first substring */
831 filling_substring = TRUE;
833 last_jstate = JS_STX;
836 } /* endif (parg != NULL) */
838 FILL_ANOTHER_SUBSTRING:
839 /* Input text is presumed to contain substrings
840 * of chars related by their EUC encoding.
841 * Fill the substring buffer by reading in nonDISCARDable
842 * multibyte jchars until jstate changes signaling
843 * end of a substring.
844 * Note last jchar read, the one that changes the jstate,
845 * hangs around till we come back to this loop.
847 if (filling_substring) {
849 if (jstate == JS_DISCARD) {
850 fputs ("jpnparser: js=DISCARD:", aa_stderr);
851 was_discarding = TRUE;
854 was_discarding = FALSE;
856 while (jstate == JS_DISCARD) {
858 fprintf (aa_stderr, " %s", jchar);
861 if (debugging_jpn && was_discarding)
862 fputc ('\n', aa_stderr);
863 if (jstate == JS_ETX) {
865 fputs ("jpnparser: js=ETX\n", aa_stderr);
867 char msgbuf [DtSrMAXWIDTH_HWORD + 100];
868 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 21,
869 "%s '%.*s' is not a valid Japanese word.") ,
870 PROGNAME"812", DtSrMAXWIDTH_HWORD, save_parg_string);
871 DtSearchAddMessage (msgbuf);
876 last_jstate = jstate;
878 substr_offset = readcount - jcharlen;
880 /* Fill the substring buffer.
881 * Ensure substring buffer is big enough.
883 while (last_jstate == jstate) {
884 if (endsubstrbuf - substrp < 8) {
885 size_t curlen = substrp - substrbuf;
888 "jpnparser: curr substr len %ld, "
889 "new substrbufsz %ld.\n",
890 (long) curlen, (long) (substrbufsz<<1)); /* both are size_t; cast so the arguments match %ld */
893 substrbufsz <<= 1; /* double its size */
894 substrbuf = realloc (substrbuf, substrbufsz); /* NOTE(review): result is unchecked and overwrites substrbuf; on failure the old buffer leaks and the copy below writes through NULL + curlen */
895 endsubstrbuf = substrbuf + substrbufsz;
896 substrp = substrbuf + curlen;
898 strncpy ((char*)substrp, (char*)jchar, jcharlen);
903 filling_substring = FALSE;
904 is_new_substring = TRUE;
907 /* Empty the substring buffer returning each token
908 * one by one, ie parse and return word tokens from string,
909 * including possible kanji compounds if switched on.
911 if (token = parse_substring())
912 return (char *) token;
914 /* When current substring is empty, go back and fill another one.
915 * If we're parsing a string (eg hiliting text of a doc),
916 * parse_substring() will have used readchar_string().
917 * Since we now want to resume using it to parse the original
918 * string, we have to reset its string ptr.
920 filling_substring = TRUE;
921 if (save_parg_string)
922 readchar_arg = save_parg_string + readcount;
923 goto FILL_ANOTHER_SUBSTRING;
928 /************************************************/
932 /************************************************/
933 /* Subroutine of load_jpn_language. Builds a JPNTREE
934 * from a file of packed EUC compound words.
935 * Basically a copy of load_wordtree() in lang.c.
937 * INPUT FILE FORMAT: One word per line, min 4 bytes (2 jchars),
938 * all words packed EUC. Preferred order is frequency of
939 * occurrence in the corpus to make searches efficient.
940 * Otherwise the words should at least be in random order or
941 * an order that will approximate a binary search.
942 * If first char is ASCII (ie not packed EUC), line is
943 * ignored as comments. Any ascii chars after packed EUC,
944 * such as whitespace and/or subsequent ascii comments,
945 * delimits word token (ie anything else on the line is ignored).
946 * "Line" ends in ascii linefeed (\n).
948 * RETURNS 0 if file successfully loaded, returns 1 if file missing,
949 * returns 2 and messages in global msglist if file has fatal errors.
951 static int load_jpntree (
956 int comment_count = 0;
962 char sprintbuf [_POSIX_PATH_MAX + 1024];
967 if (debugging_loadlang | debugging_loadword)
968 fprintf (aa_stderr, PROGNAME"1071 "
969 "load_jpntree: fname='%s'\n", NULLORSTR(fname));
971 if ((fileid = fopen (fname, "rt")) == NULL) {
972 /* Not being able to find the file is not an error.
973 * We indicate that with the return code.
974 * But any other error (like permissions) is fatal.
976 if (errno == ENOENT) {
977 if (debugging_loadlang | debugging_loadword)
978 fputs (" ...file not found.\n", aa_stderr);
983 catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
984 PROGNAME"362", fname, strerror(errno));
985 DtSearchAddMessage (sprintbuf);
990 /*--------- Main Read Loop ----------*/
991 while (fgets ((char*)readbuf, sizeof(readbuf), fileid) != NULL) {
994 * Ignore lines beginning with any ascii char (comments).
995 * Otherwise first or only packed EUC token on line
996 * is the desired word.
998 if (readbuf[0] < 0x80) {
1002 for (cptr = readbuf; *cptr >= 0x80; cptr++)
1005 if (debugging_loadword) {
1006 fprintf (aa_stderr, " JPNWORD: '%s' %n", readbuf, &i);
1008 fputc (' ', aa_stderr);
1011 /* Test for word too short */
1012 if (strlen((char*)readbuf) < 4) {
1013 sprintf (sprintbuf, catgets(dtsearch_catd, MS_lang, 23,
1014 "%s Word '%s' on line %ld is too short.") ,
1015 PROGNAME"1074", readbuf, linecount);
1016 DtSearchAddMessage (sprintbuf);
1020 /* Allocate and populate a new node */
1021 i = strlen ((char*) readbuf);
1022 new = austext_malloc (sizeof(JPNTREE) + i + 4,
1023 PROGNAME"104", NULL);
1028 new->word = (void *) (new + 1);
1029 strcpy (new->word, (char *) readbuf);
1031 /* Search binary tree, comparing only first 4 bytes */
1032 is_duplicate = FALSE;
1033 for (this_link = treetop; *this_link != NULL; ) {
1034 i = strncmp (new->word, (*this_link)->word, 4);
1037 /* If first 4 bytes are similar, search
1038 * linked list, comparing entire string.
1040 while (*this_link != NULL) {
1041 i = strcmp (new->word, (*this_link)->word);
1043 /* Test for duplicate word */
1046 catgets (dtsearch_catd, MS_misc, 423,
1047 "%s Word '%s' in '%s' is a duplicate."),
1048 PROGNAME"423", readbuf, fname);
1049 DtSearchAddMessage (sprintbuf);
1050 /* duplicates aren't fatal, just ignore the word */
1051 is_duplicate = TRUE;
1052 break; /* discontinue list search */
1054 if (debugging_loadword)
1055 fputc('-', aa_stderr);
1056 this_link = &(*this_link)->next;
1057 } /* end linked list search */
1059 break; /* discontinue tree search */
1060 } /* endif where first 4 bytes matched at a tree node */
1062 /* First 4 bytes dissimilar. Descend tree
1063 * to find next possible insertion point.
1065 if (debugging_loadword)
1066 fputc(((i < 0)? 'L' : 'R'), aa_stderr);
1067 this_link = (JPNTREE **) ((i < 0) ?
1068 &(*this_link)->llink : &(*this_link)->rlink);
1069 } /* end binary tree search */
1071 /* Don't link anything if error found while descending tree */
1073 if (debugging_loadword)
1074 fputs (" duplicate!\n", aa_stderr);
1079 /* Insert new node at current location in tree */
1081 if (debugging_loadword)
1082 fputs(".\n", aa_stderr);
1084 } /* end of read loop */
1088 if (node_count <= 0) {
1089 if (debugging_loadlang | debugging_loadword)
1091 PROGNAME"1185 load '%s' unsuccessful, %d comments discarded.\n",
1092 fname, comment_count);
1093 sprintf (sprintbuf, catgets(dtsearch_catd, MS_lang, 24,
1094 "%s No Japanese words in word file '%s'.") ,
1095 PROGNAME"1186", fname);
1096 DtSearchAddMessage (sprintbuf);
1100 if (debugging_loadlang | debugging_loadword)
1102 PROGNAME"1193 load word file '%s' successful, %d words.\n",
1106 } /* load_jpntree() */
1109 /************************************************/
1111 /* load_jpn_language */
1113 /************************************************/
1114 /* Loads a dblk with japanese (DtSrLaJPN, DtSrLaJPN2)
1115 * structures and function pointers.
1116 * Called from load_language(), with identical input and output.
1117 * Does not reload structures previously loaded in
1118 * other jpn dblks on dblist if derived from identical files.
1119 * But always loads structures if passed dblist is NULL.
1120 * Presumes dblk already partially initialized:
1121 * name, path, language, flags.
1122 * Returns TRUE if all successful. Otherwise
1123 * returns FALSE with err msgs on ausapi_msglist.
/*
 * load_jpn_language: install the Japanese language hooks on a dblk
 * and load its optional word trees.
 *
 * dblk    Database block to set up.  Its charmap, parser, lstrupr and
 *         stemmer members are overwritten, word size limits in dbrec
 *         that are still zero are given defaults, and a newly
 *         allocated JPNBLK is attached at parse_extra.
 * dblist  Not referenced by any statement visible in this function.
 *         NOTE(review): confirm whether reuse of trees already loaded
 *         for other dblks on dblist is implemented somewhere else.
 *
 * Kanji tree, language DtSrLaJPN2 only: the file named
 * path + dblk->name + EXT_KANJI is tried first.  When load_jpntree()
 * returns 1 for it, path + "jpn" + EXT_KANJI is loaded into
 * jpn_kanjitree if that pointer is still NULL, and jpnblk->kanjitree
 * is then pointed at jpn_kanjitree whether or not that load worked.
 * The katakana section is compiled only under NO_KATAKANA_TREES_YET.
 *
 * Returns FALSE when errcount is greater than zero, otherwise TRUE.
 *
 * NOTE(review): in the "path too long" branch dblk->path is known only
 * to be longer than _POSIX_PATH_MAX - 14 characters, yet it is
 * formatted with sprintf() into msgbuf, which holds
 * _POSIX_PATH_MAX + 128 bytes.  No upper bound on the path length is
 * visible here -- confirm one exists or switch to a bounded format.
 */
1125 int load_jpn_language (DBLK *dblk, DBLK *dblist)
1127 extern int ascii_charmap[]; /* in lang.c */
1131 char fname [_POSIX_PATH_MAX + 4];
1132 char path [_POSIX_PATH_MAX + 4];
1133 char msgbuf [_POSIX_PATH_MAX + 128];
/* Install the Japanese parser; case folding and stemming are
 * delegated to the null_ functions. */
1135 dblk->charmap = ascii_charmap; /* for teskey */
1136 dblk->parser = jpn_parser;
1137 dblk->lstrupr = null_lstrupr;
1138 dblk->stemmer = null_stemmer;
/* Supply default word size limits only where the dbrec value
 * is still zero. */
1139 if (dblk->dbrec.or_maxwordsz == 0) /* for teskey */
1140 dblk->dbrec.or_maxwordsz = MAXWIDTH_SWORD - 1;
1141 if (dblk->dbrec.or_minwordsz == 0) /* for teskey */
1142 dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
/* The allocation asks for 4 bytes more than sizeof(JPNBLK);
 * only the first sizeof(JPNBLK) bytes are zeroed.  The block is
 * attached to the dblk before any file is opened. */
1143 jpnblk = austext_malloc (sizeof(JPNBLK) + 4, PROGNAME"2107", NULL);
1144 memset (jpnblk, 0, sizeof(JPNBLK));
1145 dblk->parse_extra = (void *) jpnblk;
1147 /* Load optional katakana and kanji word lists.
1148 * If specific dblk version not found,
1149 * try the default language version.
1150 * If either has load errors, return a failure.
1151 * If both are missing, just forget it.
1153 if (dblk->path == NULL)
1156 if (strlen (dblk->path) > _POSIX_PATH_MAX - 14) {
1157 sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 25,
1158 "%s Database '%s' path too long: '%s'.") ,
1159 PROGNAME"759", dblk->name, dblk->path);
1160 DtSearchAddMessage (msgbuf);
1163 strcpy (path, dblk->path);
1164 ensure_end_slash (path);
1167 #ifdef NO_KATAKANA_TREES_YET
1168 /* Load katakana wordtree */
/* Database specific file first: path + dblk->name + EXT_KATAKANA. */
1169 strcpy (fname, path);
1170 strcat (fname, dblk->name);
1171 strcat (fname, EXT_KATAKANA);
1172 i = load_jpntree (&jpnblk->katatree, fname);
/* A return of 1 from load_jpntree() is handled as "file not found". */
1173 if (i == 1) { /* ...db specific file not found */
1174 if (jpn_katatree == NULL) { /* load default... */
/* Language generic file: path + "jpn" + EXT_KATAKANA, read into
 * the global jpn_katatree only while that pointer is NULL. */
1175 strcpy (fname, path);
1176 strcat (fname, "jpn");
1177 strcat (fname, EXT_KATAKANA);
1178 i = load_jpntree (&jpn_katatree, fname);
1180 else /* default already loaded */
1182 jpnblk->katatree = jpn_katatree;
1186 #endif /* NO_KATAKANA_TREES_YET */
1188 /* Load kanji wordtree only if kanji compounds are derived
1189 * from list in file, ie for language DtSrLaJPN2 only.
1190 * If database specific list not found,
1191 * use language generic list. If language generic
1192 * list also not found, ignore compounding.
1193 * Only one language generic list will
1194 * be loaded, at jpn_kanjitree.
1196 if (dblk->dbrec.or_language == DtSrLaJPN2) {
1197 strcpy (fname, path);
1198 strcat (fname, dblk->name);
1199 strcat (fname, EXT_KANJI);
1200 i = load_jpntree (&jpnblk->kanjitree, fname);
1201 if (i == 1) { /* ...db specific file not found */
1202 /* If the generic knj file (jpn.knj) was
1203 * never loaded, try loading it now.
1205 if (jpn_kanjitree == NULL) {
1206 strcpy (fname, path);
1207 strcat (fname, "jpn");
1208 strcat (fname, EXT_KANJI);
1209 load_jpntree (&jpn_kanjitree, fname);
1210 /* (it either worked or it didn't) */
1212 /* Whether generic load successful or not,
1213 * try to use it (eg it might still be NULL).
1215 jpnblk->kanjitree = jpn_kanjitree;
1217 if (i > 1) /* error trying to open db specific file */
/* Any counted error makes the whole load report failure. */
1221 return (errcount > 0)? FALSE : TRUE;
1223 } /* load_jpn_language() */
1226 /************************************************/
1230 /************************************************/
1231 /* Identical to free_wordtree() in lang.c
1232 * (link inversion traversal, from Data Structure Techniques,
1233 * Thomas A. Standish, Algorithm 3.6),
1234 * except post order visit includes freeing
1235 * linked list at each tree node.
/*
 * free_jpntree: walk the tree rooted at *jpntree_head and release it.
 *
 * jpntree_head  Address of the root pointer.  Nothing is done when
 *               *jpntree_head is NULL.  *jpntree_head is set to NULL
 *               at the point where the walk finds prev == NULL.
 *
 * While walking, the word member of a node is overwritten with
 * (void *) 0 or (void *) 1 and later read back as a tag: a tag of 0
 * sends the climb up the left leg, anything else up the right leg.
 * NOTE(review): this destroys whatever word pointed at -- presumably
 * that storage is released elsewhere or not owned by the node; confirm.
 */
1237 static void free_jpntree (JPNTREE ** jpntree_head)
1239 JPNTREE *next, *prev, *pres;
1240 JPNTREE *listp, *next_listp;
1242 if (*jpntree_head == NULL)
1244 pres = *jpntree_head;
1248 pres->word = (void *) 0; /* preorder visit: TAG = 0 */
1259 pres->word = (void *) 1; /* TAG = 1 */
/* Walk the linked list hanging off the node, saving each successor
 * before moving on.
 * NOTE(review): listp is dereferenced by the loop test, so it must be
 * non-NULL here -- confirm where it is initialised. */
1267 while (listp->next) {
1268 next_listp = listp->next;
/* prev == NULL means the climb has passed the root: clear the
 * caller's pointer so the tree cannot be reached again. */
1274 if (prev == NULL) { /* end of algorithm? */
1275 *jpntree_head = NULL;
/* The tag stored in prev->word records which leg was descended. */
1278 if (prev->word == (void *) 0) { /* go up left leg */
1284 else { /* go up right leg */
1286 prev->word = (void *) 0; /* restore TAG = 0 */
1289 goto POSTORDER_VISIT;
1291 } /* free_jpntree() */
1294 /************************************************/
1296 /* unload_jpn_language */
1298 /************************************************/
1299 /* Frees storage for structures allocated by load_jpn_language().
1300 * Called when engine REINITs due to change in site config file
1302 * The global jpntrees are not currently unloaded because they
1303 * are presumed valid for the duration of the engine session.
1304 * Currently there are no teskey trees (inclist, stoplist) to free.
/*
 * unload_jpn_language: release the Japanese parse data attached
 * to a dblk.
 *
 * dblk  Database block whose parse_extra member is treated as a
 *       JPNBLK.  Nothing is touched when parse_extra is NULL.
 *
 * A tree is passed to free_jpntree() only when it is non-NULL and is
 * not the same pointer as the corresponding global (jpn_katatree,
 * jpn_kanjitree).  parse_extra is then reset to NULL.
 */
1306 void unload_jpn_language (DBLK *dblk)
1308 /* free jpnblk and any database-associated jpntrees */
1309 if (dblk->parse_extra) {
1310 JPNBLK *jpnblk = (JPNBLK *) dblk->parse_extra;
/* Trees that alias the globals are shared, so they are left alone. */
1311 if (jpnblk->katatree && jpnblk->katatree != jpn_katatree)
1312 free_jpntree (&jpnblk->katatree);
1313 if (jpnblk->kanjitree && jpnblk->kanjitree != jpn_kanjitree)
1314 free_jpntree (&jpnblk->kanjitree);
/* NOTE(review): no statement releasing jpnblk itself is visible
 * before the pointer to it is dropped -- confirm it is freed,
 * otherwise the JPNBLK leaks. */
1316 dblk->parse_extra = NULL;
1319 } /* unload_jpn_language() */
1321 /******************** JPN.C ********************/