2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these librararies and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
24 * COMPONENT_NAME: austext
26 * FUNCTIONS: descend_tree
33 * put_addrs_2_dtbs_addr_file
37 * write_2_dtbs_addr_file
38 * write_new_word_2_dtbs
44 * (C) COPYRIGHT International Business Machines Corp. 1992,1996
46 * Licensed Materials - Property of IBM
47 * US Government Users Restricted Rights - Use, duplication or
48 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
50 /************************ DTSRINDEX.C *******************************
51 * $XConsortium: dtsrindex.c /main/10 1996/09/23 21:02:54 cde-ibm $
52 * CDE version of borodin.c
53 * Formerly dtsrindex.c was cborodin.c.
56 * Text file in FZK format.
57 * Each record contains 4 formatted 'lines' (text strings ending in \n):
58 * 1. fzkey (not used in this program).
59 * 2. abstract (not used in this program).
60 * 3. unique database key for the record. Used to find the database
61 * address of the record which is the reference for the inverted index.
62 * 4. The record's date (not used in this program).
64 * The rest of the record is unformatted text (not necessarily organized
65 * into 'lines'). It is read a character at a time and parsed into
66 * individual words by the parser function for the database's language.
67 * Each record ends with a delimiter string specified by command line arg.
70 * Revision 2.8 1996/04/10 19:50:38 miker
71 * Deleted dangerous and unnecessary -a option.
73 * Revision 2.7 1996/03/25 18:54:15 miker
74 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
76 * Revision 2.6 1996/02/01 18:25:44 miker
77 * AusText 2.1.11, DtSearch 0.3. Pass 1 changed to accommodate
78 * new single-character reading parser/stemmers.
80 * Revision 2.5 1995/12/29 17:16:04 miker
81 * Bug fix: Opened wrong msg catalog.
83 * Revision 2.4 1995/12/27 21:18:40 miker
84 * Msg bug: 'percent done' was negative number.
86 * Revision 2.3 1995/12/01 16:15:44 miker
87 * Deleted unnecessary log2 var, conflict with Solaris function.
88 * Added -r command line arg.
90 * Revision 2.2 1995/10/26 15:26:53 miker
93 * Revision 2.1 1995/09/22 19:29:53 miker
94 * Freeze DtSearch 0.1, AusText 2.1.8
96 * Revision 1.3 1995/09/05 21:08:54 miker
97 * Fixed bug: appeared as if 1 and 2 char 'words' were being indexed.
98 * Added DEBUG_P switch.
100 * Revision 1.2 1995/09/01 22:17:02 miker
101 * Fixed solaris segfault: too many args to printf in print_usage().
103 * Revision 1.1 1995/08/31 20:51:08 miker
104 * Initial revision of dtsrindex.c, copied from cborodin.c.
107 * Revision 1.18 1995/05/30 18:58:54 miker
108 * Correct bug introduced by previous fix (2.1.5c).
110 * Revision 1.17 1995/05/18 22:54:08 miker
111 * 2.1.5b cborodin bug. Segfault due to overflowing bitvector
112 * after many deletions and no mrclean.
123 #include <sys/stat.h>
127 extern void find_keyword (char *cur_word, int vista_num);
128 extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num);
129 extern void write_wordstr (struct or_hwordrec * glob_word, int vista_num);
130 extern void fill_data1 (char *ch);
132 #define PROGNAME "DTSRINDEX"
134 #define BATCH_SIZE 10000L
135 #define WORDS_PER_DOT 500
136 #define RECS_PER_DOT 20
137 #define INBUFSZ 1024 /* default input text header line size */
139 #define MS_cborodin 14
141 /******************* BIT VECTORS *****************/
142 DB_ADDR *word_addrs_ii; /* fread buf for d99 (= tot # dbas) */
143 DtSrINT32 *dbas_word_count;
144 char *dbas_bits_batch;
145 DB_ADDR *record_addr_word;
146 DtSrINT32 num_addrs_for_word;
147 DtSrINT32 or_reccount;
148 DtSrINT32 bit_vector_size;
150 /*-------------------------- GLOBALS ----------------------------*/
151 /* batch_size also used by fileman.c for allocating unused holes
152 * in order not to go past the end of the 'record_addr_word' array.
154 extern DtSrINT32 batch_size;
157 static int cache_size = CACHE_SIZE;
158 static int check_existing_addrs = TRUE;
159 long count_word_ii = 0L;
160 long dbkey_seqno = 0L;
/* Bit flags OR-ed into 'debugging' by the unadvertised -D command
 * line option. P1 = Pass 1, P2 = Pass 2. */
163 static int debugging = 0;
164 #define DEBUG_I 0x01 /* P1 tree insertions */
165 #define DEBUG_P 0x10 /* P1 parser/stemmer */
166 #define DEBUG_T 0x02 /* P2 tree dump (words) */
167 #define DEBUG_N 0x04 /* P2 NEW words, vista */
168 #define DEBUG_O 0x08 /* P2 OLD words, vista */
169 #define DEBUG_t 0x20 /* P2 tree dump (dbas) */
170 #define DEBUG_n 0x40 /* P2 NEW d99 for new words */
171 #define DEBUG_o 0x80 /* P2 OLD d99 updates for old words */
175 char dicpath [_POSIX_PATH_MAX];
176 static int dotcount = 0;
177 char dtbs_addr_file [_POSIX_PATH_MAX];
179 long dtbs_size_records = 0L;
180 static long duplicate_recids = 0L;
181 struct stat fstat_input;
183 static char fname_input [_POSIX_PATH_MAX];
186 static FILE *instream;
188 int inbuf_overflowed = FALSE;
189 size_t inbufsz = INBUFSZ;
193 static char msg_374[] = "\n%s Out of Memory!\n"
194 " Split the incoming file into several "
195 "smaller files and try again.\n";
196 static char msg_776[] = "\n%s Write Failure d99 file: %s\n";
197 char new_dtbs_file = FALSE;
198 long num_of_diff_words = 0L;
199 int normal_retncode = 0;
201 int parsep_char = END_RETAIN_PAGE;
203 unsigned long record_count = 0UL;
205 static int recs_per_dot = RECS_PER_DOT;
208 extern int shutdown_now;
211 char *sprintbuffer = NULL;
213 extern int debugging_teskey;
214 time_t timestart = 0;
215 time_t totalstart = 0;
216 static int words_per_dot = WORDS_PER_DOT;
218 /************************************************/
222 /************************************************/
223 typedef struct dba_str {
226 struct dba_str *next_dba;
229 /************************************************/
233 /************************************************/
234 typedef struct _treen_ {
235 char *word; /* ptr to word in stop list */
236 struct _treen_ *llink; /* left link in binary tree */
237 struct _treen_ *rlink; /* ptr to right link in binary tree */
241 static TREENODE *root_node = NULL;
242 static TREENODE *top_of_stack;
243 static TREENODE *stack;
244 static TREENODE *pres;
245 static TREENODE *prev;
246 static TREENODE *next;
247 static TREENODE *avail_node;
251 /************************************************/
255 /************************************************/
256 /* Returns static string same as passed string except nonprintable
257 * and nonascii chars replaced by '^' for display.
259 static char *displayable (char *passed_string)
261 static char *buf = NULL;
262 static size_t buflen = 0;
263 size_t passed_len = strlen (passed_string);
265 if (buflen < passed_len) {
269 buf = austext_malloc (buflen + 4, PROGNAME"158", NULL);
272 for (src = passed_string; *src != 0; src++) {
273 if (*src >= 32 && *src < 127)
280 } /* displayable() */
283 /************************************************/
285 /* print_exit_code */
287 /************************************************/
288 /* Called from inside DtSearchExit() at (*austext_exit_last)() */
289 static void print_exit_code (int exit_code)
295 /* Put total seconds into totalstart */
297 totalstart = time (NULL) - totalstart;
298 printf (catgets (dtsearch_catd, MS_cborodin, 206,
299 "%s: Exit Code = %d, Total elapsed time %ldm %lds.\n"),
300 aa_argv0, exit_code, totalstart / 60L, totalstart % 60L);
302 } /* print_exit_code() */
305 /****************************************/
307 /* write_to_file() */
309 /****************************************/
310 /* This is the 'visit node' point for the tree traversal
311 * functions of Pass 2 (traverse_tree() and descend_tree()).
313 * Each tree node = word or stem + linked list of dbas.
314 * When called, each dba list member just contains the number
315 * of times the token appears in that document. This function
316 * chains through the list, builds a statistical 'weight'
317 * for each doc/word pair, and stores it as a reformatted 'dba'
318 * in array 'record_addr_word[]', in 'host' byte swap order.
319 * The count of the current number of addrs
320 * in the array is stored in 'num_addrs_for_word'.
321 * Fill_data1() is then called to update or write a new
322 * vista record and d99 data for the token.
324 * The weight stored for each doc-word instance is 1 byte.
325 * It's the ratio of log of number of times given word occurs in doc,
326 * divided by log of total count of all words in doc,
327 * scaled to range 0 to 255.
328 * Fundamentally it's a word count of that word in the doc,
329 * but adjusted as follows:
330 * 1) Large occurrences in small documents weigh more than
331 * the same number of occurrences in large documents.
332 * 2) Taking the log skews the ratio to be more linear,
333 * ie take advantage of higher ranges of the 'weight'.
334 * For example a word that occurs in 10% of the document,
335 * will have a weight of .5 (50%).
336 * 3) The scaling changes the ratio, a float between 0. and .9999,
337 * to an integer between 0 and 255.
339 void write_to_file (TREENODE * output_node)
344 /* 'record_addr_word[]' was permanently allocated
345 * with a size = max batch size so it can hold
346 * all the addrs for a single word node in the tree.
347 * In effect it will replace the dba linked list.
348 * Note: word_addrs_ii (io buffer for d99 file) != record_addr_word[].
351 if (debugging & (DEBUG_T | DEBUG_t)) { /* Print out tree node */
352 printf (" node '%s' %c%c%c",
353 displayable(output_node->word),
354 (output_node->llink)? 'L' : '.',
355 (output_node->rlink)? 'R' : '.',
356 (debugging & DEBUG_t)? '\n' : ' ');
359 num_addrs_for_word = 0; /* DtSrINT32 */
360 print_dba = output_node->dba_list;
361 while (print_dba != NULL) {
363 mydba = print_dba->dba;
364 if (debugging & DEBUG_t)
365 printf (" dba #%ld: node adr=%ld cnt=%ld",
366 (long)num_addrs_for_word, (long)mydba, (long)print_dba->w_c);
368 record_addr_word [num_addrs_for_word] =
369 mydba << 8; /* rec# in hi 3 bytes */
370 record_addr_word [num_addrs_for_word] +=
371 (log ((double) (print_dba->w_c) + 0.5) /
372 log ((double) (dbas_word_count[mydba] + 1))) * 256;
374 if (debugging & DEBUG_t)
375 printf (" -> x%lx (%ld:%ld)\n",
376 (long)record_addr_word [num_addrs_for_word],
377 (long)record_addr_word [num_addrs_for_word] >> 8,
378 (long)record_addr_word [num_addrs_for_word] & 0xffL);
380 print_dba = print_dba->next_dba;
381 num_addrs_for_word++;
382 if (num_addrs_for_word >= batch_size) {
383 printf (catgets (dtsearch_catd, MS_cborodin, 280,
384 "\n%s num_addrs_for_word (%ld) >= batchsz (%ld).\n"),
385 PROGNAME"280", (long)num_addrs_for_word, (long)batch_size);
389 if ((debugging & DEBUG_T) && !(debugging & DEBUG_t))
390 printf (" dbacnt=%ld\n", (long)num_addrs_for_word);
392 fill_data1 (output_node->word);
395 } /* write_to_file() */
398 /****************************************/
402 /****************************************/
403 /* Coroutine of traverse_tree(), Pass 2 Robson tree traversal.
404 * The write_to_file() function is the 'preorder visit' point.
406 void descend_tree (void)
411 if ((pres->llink == NULL) && (pres->rlink == NULL)) {
412 write_to_file (pres);
416 if (pres->llink != NULL) {
423 write_to_file (pres);
431 } /* descend_tree() */
434 /********************************/
438 /********************************/
439 /* This is the actual Pass 2 function, a tree traversal
440 * of Pass 1's word-dba binary tree.
441 * The algorithm is based on the J. M. ROBSON link inversion traversal
442 * algorithm for binary trees. Ref. Thomas A. STANDISH pp. 77-78.
443 * The write_to_file() function is the 'preorder visit' point.
445 void traverse_tree (void)
450 /* Check for the empty tree */
451 if (root_node == NULL) {
452 printf (catgets (dtsearch_catd, MS_cborodin, 288,
453 "%s Abort. There are no words in the input file %s.\n"),
454 PROGNAME"288", fname_input);
457 /* Initialize the variables */
467 if (pres == root_node) {
470 if (prev->rlink == NULL) {
471 write_to_file (prev);
479 if (prev->llink == NULL) {
487 if (prev == top_of_stack) {
489 top_of_stack = stack->rlink;
490 stack = stack->llink;
494 prev->llink = prev->rlink;
501 write_to_file (prev);
502 avail_node->llink = stack;
503 avail_node->rlink = top_of_stack;
514 } /* traverse_tree() */
518 /********************************************************/
520 /* print_usage_msg */
522 /********************************************************/
523 static void print_usage_msg (void)
525 printf (catgets (dtsearch_catd, MS_cborodin, 17,
527 "USAGE: %s -d<dbname> [options] <infile>\n"
528 " Listed default file name extensions can be overridden.\n"
529 " -d<dbname> 1 - 8 character database name, include optional path prefix.\n"
530 " -t<etxstr> End of text document delimiter string. Default '\\f\\n'.\n"
531 " -r<N> Change Pass 1 records-per-dot from %d to <N>.\n"
532 " -b<N> Change max batch size from %ld to <N>.\n"
533 " -c<N> Change database paging cache from %ld 1K pages to <N> 1K pages.\n"
534 " <N> >= 16 by powers of 2. Initially try only small changes.\n"
535 " -i<N> Change (i)nput buffer size from default %d to <N>.\n"
536 " -h<N> Change duplicate record id hash table size from %ld to <N>.\n"
537 " -h0 means there are no duplicates, do not check for them.\n"
538 " <infile> Input [path]file name. Default extension %s.\n"),
541 (long) BATCH_SIZE, (long) CACHE_SIZE,
542 (int) INBUFSZ, default_hashsize, EXT_FZKEY);
544 } /* print_usage_msg() */
547 /********************************************************/
549 /* segregate_dicname */
551 /********************************************************/
552 /* Separates dictionary name from pathname and loads
553 * them into the globals 'dicname' and 'dicpath'.
554 * Returns TRUE if dicname is valid, else returns FALSE.
556 static int segregate_dicname (char *string)
558 char mybuf[_POSIX_PATH_MAX];
562 strncpy (mybuf, string, sizeof (mybuf));
563 mybuf[sizeof (mybuf) - 1] = 0;
566 * Set 'ptr' to just the 8 char dictionary name by moving
567 * it backwards until first non-alphanumeric character
568 * (such as a ":" in the dos drive id or a slash between directories),
569 * or to the beginning of string.
571 for (ptr = mybuf + strlen (mybuf) - 1; ptr >= mybuf; ptr--)
572 if (!isalnum (*ptr)) {
579 /* test for valid dictionary name */
584 strcpy (dicname, ptr);
586 strncpy (dicpath, mybuf, sizeof (dicpath));
587 dicpath[sizeof (dicpath) - 1] = 0;
589 } /* segregate_dicname() */
592 /********************************************************/
594 /* USER_ARGS_PROCESSOR */
596 /********************************************************/
597 /* handles command line arguments for 'main' */
598 void user_args_processor (int argc, char **argv)
608 /* Initialize some variables prior to parsing command line */
612 /* Each pass grabs new parm of "-xxx" format */
613 while (--argc > 0 && (*++argv)[0] == '-') {
617 case 't': /* ETX delimiter string */
618 /* Replace any "\n" string with real linefeed */
619 targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
622 if (src[0] == '\\' && src[1] == 'n') {
633 if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
634 printf (catgets (dtsearch_catd, MS_cborodin, 577,
635 "%s Invalid arg '%s'. Using default -r%d.\n"),
636 PROGNAME"577", argptr, RECS_PER_DOT);
637 recs_per_dot = RECS_PER_DOT;
642 duprec_hashsize = atol (argptr + 2);
643 if (duprec_hashsize == 0UL)
644 printf (catgets (dtsearch_catd, MS_cborodin, 539,
645 "%s Duplicate record id checking disabled.\n"),
650 batch_size = atol (argptr + 2);
651 if (batch_size <= 0L) {
652 printf (catgets (dtsearch_catd, MS_cborodin, 595,
653 "%s Invalid batch size argument '%s'.\n"),
654 PROGNAME"595", argptr);
660 cache_size = atoi (argptr + 2);
661 if (cache_size < 16) {
662 /* minimum size is 16 */
665 /* on error reset size to default */
667 cache_size = CACHE_SIZE;
669 printf (catgets (dtsearch_catd, MS_cborodin, 600,
670 "%sCache size readjusted to %d.\n"),
671 PROGNAME "600 ", cache_size);
674 /* If necessary, round up to nearest power of 2 */
675 for (i = 4; i < 12; i++)
676 if (1 << i >= cache_size)
679 if (i != cache_size) {
685 case 'D': /* unadvertised debugging feature */
686 for (i = 2; argptr[i] != 0; i++) {
688 case 'I': debugging |= DEBUG_I; break;
689 case 'P': debugging |= DEBUG_P;
690 /******* debugging_teskey = TRUE; ******/
692 case 'N': debugging |= DEBUG_N; break;
693 case 'n': debugging |= DEBUG_n; break;
694 case 'O': debugging |= DEBUG_O; break;
695 case 'o': debugging |= DEBUG_o; break;
696 case 'T': debugging |= DEBUG_T; break;
697 case 't': debugging |= DEBUG_t; break;
698 default: goto BADPARM;
704 /* May include both dicname and dicpath */
705 if (!segregate_dicname (argptr + 2)) {
706 printf (catgets (dtsearch_catd, MS_cborodin, 550,
707 "%s '%s' is invalid path/database name.\n"),
708 PROGNAME"550", argptr);
713 case 'i': /* (I)nput buffer size */
714 if ((inbufsz = atol (argptr + 2)) <= 0) {
715 printf (catgets (dtsearch_catd, MS_cborodin, 558,
716 "%s Invalid input buffer size '%s'.\n"),
717 PROGNAME"558", argptr);
723 printf (catgets (dtsearch_catd, MS_cborodin, 567,
724 "%s Unknown command line argument '%s'.\n"),
725 PROGNAME"567", argptr);
728 DtSearchExit (2); /* abort */
731 } /* endwhile for cmd line '-'processing */
733 /* Validate input file name */
735 printf (catgets (dtsearch_catd, MS_cborodin, 580,
736 "%s Missing required input file name.\n"),
740 /* Don't incr argv yet--save input file name */
742 append_ext (fname_input, _POSIX_PATH_MAX, argv[0], EXT_FZKEY);
744 /* Check for missing database name */
745 if (dicname[0] == 0) {
746 printf (catgets (dtsearch_catd, MS_cborodin, 589,
747 "%s No database name specified (-d argument).\a\n"),
751 strcpy (dblk.name, dicname);
754 } /* user_args_processor() */
757 /****************************************/
759 /* put_addrs_2_dtbs_addr_file */
761 /****************************************/
762 /* Subroutine of write_2_dtbs_addr_file() from Pass 2.
763 * That function has used a bit vector to determine
764 * the total change in old d99 addrs for preexisting words,
765 * and prepared for writing an array of old dbas that
766 * are not in the current words tree node (globally named
767 * word_addrs_ii [num_addrs]).
768 * The addrs that ARE in the Pass 1 node fzk file were previously
769 * prepared in a similar array of dbas, globally named
770 * record_addr_word [num_addrs_for_word] but passed here as
771 * 'addrs_array' and 'nitems'.
772 * Both arrays will be byte swapped from 'host' to
773 * 'network' order in this function.
774 * This function does the actual fwrite of both arrays to the d99.
775 * If the number of new addrs can fit in the available free slots,
776 * it rewrites to original offset, otherwise appends to end of d99.
778 static void put_addrs_2_dtbs_addr_file (
779 DB_ADDR *addrs_array,
782 FREE_SPACE_STR *free_slot;
783 FREE_SPACE_STR del_rec;
785 DtSrINT32 num_writes;
788 if (nitems >= batch_size) {
789 printf ( catgets(dtsearch_catd, MS_cborodin, 6,
790 "put_addrs_2_dtbs_addr_file() nitems=%d, batchsz=%ld\n") ,
791 (int)nitems, (long)batch_size);
795 num_addrs = got_word.or_hwaddrs;
796 got_word.or_hwaddrs += nitems; /** somehow, this can exceed total
797 **** num addrs in database by 1 (!?) ******/
798 /* (...only if prev 'overlay/compression' didn't delete all) */
801 /* Put both arrays in 'network' byte order */
802 for (int32 = 0; int32 < nitems; int32++)
803 HTONL (addrs_array[int32]);
804 for (int32 = 0; int32 < num_addrs; int32++)
805 HTONL (word_addrs_ii[int32]);
809 * If the number of new addresses is greater than the number of free holes,
810 * find a new free slot that is big enough to hold the data.
812 if (nitems > got_word.or_hwfree) {
813 /* Discard old slot, find new one. */
814 del_rec.hole_size = num_addrs + got_word.or_hwfree;
815 del_rec.offset = got_word.or_hwoffset;
816 free_slot = find_free_space (got_word.or_hwaddrs, &fl_hdr);
817 add_free_space (&del_rec, &fl_hdr);
818 if (free_slot == NULL) {
819 fseek (dtbs_addr_fp, 0L, SEEK_END);
820 got_word.or_hwoffset = ftell (dtbs_addr_fp);
821 got_word.or_hwfree = 0;
824 fseek (dtbs_addr_fp, free_slot->offset, SEEK_SET);
825 got_word.or_hwoffset = free_slot->offset;
826 got_word.or_hwfree = free_slot->hole_size -
829 /*----- Write new database addresses to a file -----*/
830 num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
831 (size_t)nitems, dtbs_addr_fp);
832 if (num_writes != nitems) {
836 /* Copy the old addresses immediately after the new ones */
837 num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR), (size_t)num_addrs,
839 if (num_writes != num_addrs) {
840 printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
841 PROGNAME"776", strerror(errno));
845 /* Write foxes to the free holes, if any, no byte swap */
846 for (int32 = 0; int32 < got_word.or_hwfree; int32++)
847 addrs_array [int32] = 0xFFFFFFFF;
848 num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
849 (size_t)got_word.or_hwfree, dtbs_addr_fp);
850 if (num_writes != got_word.or_hwfree) {
851 printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
852 PROGNAME"786", strerror(errno));
855 } /* end if (nitems > got_word.or_hwfree), had to get bigger slot */
857 /* Else can reuse existing slot.
858 * Write the new addresses into free holes.
859 * The remaining free holes should already have foxes. (?)
862 fseek (dtbs_addr_fp, got_word.or_hwoffset, SEEK_SET);
863 num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
864 (size_t)nitems, dtbs_addr_fp);
865 if (num_writes != nitems) {
866 printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
867 PROGNAME"798", strerror(errno));
870 /* Copy the old addresses immediately after the new ones */
871 num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR),
872 (size_t)num_addrs, dtbs_addr_fp);
873 if (num_writes != num_addrs) {
874 printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
875 PROGNAME"889", strerror(errno));
878 got_word.or_hwfree -= nitems;
880 } /* put_addrs_2_dtbs_addr_file() */
883 /****************************************/
885 /* write_2_dtbs_addr_file */
887 /****************************************/
888 /* Subroutine of fill_data1() from Pass 2.
889 * Updates OLD (preexisting) word's d99 file.
891 * The vista word rec has already been read into global 'got_word'.
892 * record_addr_word [num_addrs_for_word] is the array of dba's
893 * for docs from this batch that contain the current word (built by
894 * fill_data1 from the dba_list for the word's Pass 1 binary tree node,
895 * and still in 'host' byte swap order).
896 * This function freads all the old addresses for that word from
897 * the d99 file. It then deletes(!) d99 addrs that
898 * are in the word's Pass 1 tree node. It then calls
899 * put_addrs_2_dtbs_addr_file() to fwrite out the
900 * dba's in the tree, which are either brand new,
901 * or are 'updating' the deleted addrs.
902 * Then it writes the modified old addrs.
903 * Then rewrites vista word rec with new data.
905 * The bit vector dbas_bits_batch contains a 1 bit
906 * for every dba for every doc in the fzk file.
907 * got_word structure:
908 * .or_hwordkey - the word. (always in a 'huge' word buffer).
909 * .or_hwoffset - offset in a d99 inverted index file for
910 * a given word. the first address starts
912 * .or_hwaddrs - total number of addresses for a given word.
913 * .or_hwfree - number of free slots in a database
914 * addresses file for a given word.
916 void write_2_dtbs_addr_file (void)
918 DtSrINT32 num_addrs_ii;
920 DtSrINT32 i_start, k, cur_ind;
921 DtSrINT32 num_delete_addrs = 0;
922 char addrs_removed = FALSE;
923 register DtSrINT32 i;
924 register DtSrINT32 cur_byte;
925 register char bit_addrs;
926 register DB_ADDR temp1;
928 if (debugging & DEBUG_O)
929 printf (" old vis '%s' ofs=%ld adr=%ld fre=%ld\n",
930 displayable(got_word.or_hwordkey),
931 (long) got_word.or_hwoffset,
932 (long) got_word.or_hwaddrs,
933 (long) got_word.or_hwfree);
935 num_addrs_ii = got_word.or_hwaddrs;
936 if (num_addrs_ii > or_reccount) {
937 printf (catgets (dtsearch_catd, MS_cborodin, 713,
938 "\n%s Word '%s' occurs in %ld records,\n"
939 " but there are only %ld records in database!\n"
940 " (This may be a good candidate for the stoplist).\n"),
942 (long) got_word.or_hwordkey,
948 if (fseek (dtbs_addr_fp, (long) got_word.or_hwoffset, SEEK_SET) != 0)
950 printf (catgets (dtsearch_catd, MS_cborodin, 875,
951 "\n%s Could not fseek d99 file to offset %ld.\n"),
952 PROGNAME"875", got_word.or_hwoffset);
955 num_reads = fread (word_addrs_ii, sizeof(DB_ADDR),
956 (size_t)num_addrs_ii, dtbs_addr_fp);
957 if (num_reads != num_addrs_ii) {
958 printf (catgets (dtsearch_catd, MS_cborodin, 848,
959 "\n%s Could not fread %ld bytes (%ld dba's) of d99 file\n"
960 " at offset %ld. Number of dba's read (return code) = %ld.\n"),
961 PROGNAME"848", sizeof(DB_ADDR) * num_addrs_ii, (long)num_addrs_ii,
962 (long)got_word.or_hwoffset, (long)num_reads);
966 for (i = 0; i < num_addrs_ii; i++)
967 NTOHL (word_addrs_ii[i]);
968 /* Now both addr arrays are in 'host' byte swap order */
971 /* If there are only new docs,
972 * this switch will prevent the checking for updates.
974 if (check_existing_addrs) {
977 /* Loop on every preexisting dba for word as read from d99 */
978 for (i = 0; i < num_addrs_ii; i++) {
979 if (debugging & DEBUG_o)
980 printf (" old d99 %ld: x%lx(%ld:%ld)",
982 (long) word_addrs_ii[i],
983 (long) word_addrs_ii[i] >> 8,
984 (long) word_addrs_ii[i] & 0xffL);
986 /* Get 'record number' by shifting hi 3 bytes 1 byte (8 bits)
987 * to right over stat wt byte. D99 rec#'s start at 1,
988 * so subtract 1 to start at 0 for bit vector.
990 temp1 = (*(word_addrs_ii + i) >> 8) - 1; /* = rec#, base 0 */
991 cur_byte = temp1 >> 3; /* get matching byte# in bit vector */
992 if (cur_byte >= bit_vector_size) {
993 printf ( catgets(dtsearch_catd, MS_cborodin, 9,
994 "\n%s Corrupted d99 file for word '%s',\n"
995 " database address %ld @ file position %ld => bitvector[%ld],"
996 " but max bitvector allocation = %ld.\n") ,
997 PROGNAME"727", displayable(got_word.or_hwordkey),
998 (long)temp1, (long)i,
999 (long)cur_byte, (long)bit_vector_size);
1003 bit_addrs |= 1 << (temp1 % 8); /* bit mask */
1005 * If this dba, which is on the current word's old d99
1006 * addrs list, is also a doc in the fzk file (dbas_bits_batch),
1007 * delete it from the d99 list by writing subsequent dba's
1008 * over it. Boy this recursive nested loop has gotta be slow.
1009 * Faster algorithm? Add 'good' addrs to the end of
1010 * record_addr_word[]. No nested overlay loop, only one write!
1012 if (bit_addrs & (*(dbas_bits_batch + cur_byte))) {
1013 addrs_removed = TRUE;
1021 /* compress: move good addrs over
1022 * space of deleted ones */
1023 for (k = i_start; k < i; k++) {
1024 word_addrs_ii[cur_ind] = word_addrs_ii[k];
1030 } /* end if where dba is on both fzk list and curr d99 */
1031 } /* end loop on every d99 addr for this word */
1033 if (addrs_removed) { /* final overlay compression */
1035 /* compress: move good addrs over
1036 * space of deleted ones */
1037 for (k = i_start; k < i; k++) {
1038 word_addrs_ii[cur_ind] = word_addrs_ii[k];
1043 } /* end if (check_existing_addrs) */
1045 got_word.or_hwaddrs -= num_delete_addrs;
1046 got_word.or_hwfree += num_delete_addrs;
1048 /* The old dba array word_addrs_ii[] is now 'compressed',
1049 * it contains only addrs not in fzk file.
1050 * And the vista rec 'got_word' now matches it.
1051 * And record_addr_word[] still contains
1052 * the new/updated addrs from the fzk file.
1053 * Now Efim calls a func to write them both back out to d99 file.
1055 put_addrs_2_dtbs_addr_file (record_addr_word, num_addrs_for_word);
1056 write_wordstr (&got_word, 0); /* update vista WORD rec */
1059 } /* write_2_dtbs_addr_file() */
1062 /********************************/
1064 /* write_new_word_2_dtbs */
1066 /********************************/
1067 /* Subroutine of fill_data1() in Pass 2 for a NEW word.
1068 * Writes d99 data, and updates (empty) got_word vista record.
1069 * record_addr_word [num_addrs_for_word] is the array of addrs
1070 * for docs from this batch that contain the current word (built by
1071 * fill_data1 from the dba_list for the word's Pass 1 binary tree node).
1072 * It will be byte swapped from 'host' to 'network' order in this function.
1074 void write_new_word_2_dtbs (void)
1076 FREE_SPACE_STR *free_slot;
1077 DtSrINT32 num_writes;
1081 if (debugging & (DEBUG_n | DEBUG_N))
1082 printf (" new word '%s', adrs=%ld,",
1083 got_word.or_hwordkey, (long)num_addrs_for_word);
1085 free_slot = find_free_space (num_addrs_for_word, &fl_hdr);
1086 if (free_slot == NULL) {
1087 /* append addrs to end of d99 file */
1088 ret_fseek = fseek (dtbs_addr_fp, 0L, SEEK_END);
1089 got_word.or_hwoffset = ftell (dtbs_addr_fp);
1090 got_word.or_hwfree = 0;
1091 if (debugging & (DEBUG_n | DEBUG_N))
1092 printf ("APPEND ofs=%ld, fre=0\n", got_word.or_hwoffset);
1095 ret_fseek = fseek (dtbs_addr_fp,
1096 (long)free_slot->offset, SEEK_SET);
1097 got_word.or_hwoffset = free_slot->offset;
1098 got_word.or_hwfree = free_slot->hole_size -
1100 if (debugging & (DEBUG_n | DEBUG_N))
1101 printf (" REUSE slot ofs=%ld, fre=%ld\n",
1102 got_word.or_hwoffset, got_word.or_hwfree);
1105 /***** Write new database addresses to d99 file *********/
1106 if (debugging & DEBUG_n) {
1107 for (int32 = 0; int32 < num_addrs_for_word; int32++) {
1108 printf (" dba #%ld: x%lx(%ld:%ld)\n",
1110 (long)record_addr_word[int32],
1111 (long)record_addr_word[int32] >> 8,
1112 (long)record_addr_word[int32] & 0xffL);
1116 /* Put addr array in 'network' byte order */
1117 for (int32 = 0; int32 < num_addrs_for_word; int32++)
1118 HTONL (record_addr_word[int32]);
1120 num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
1121 (size_t)num_addrs_for_word, dtbs_addr_fp);
1122 if (num_writes != num_addrs_for_word)
1125 got_word.or_hwaddrs = num_addrs_for_word;
1127 if (got_word.or_hwfree != 0) {
1128 /* Fill unused free holes with foxes for debugging.
1129 * Note that byte swap is unnecessary for foxes.
1130 * Note that record_addr_word is now available for this action.
1132 for (int32 = 0; int32 < got_word.or_hwfree; int32++)
1133 *(record_addr_word + int32) = 0xFFFFFFFF;
1134 num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
1135 (size_t)got_word.or_hwfree, dtbs_addr_fp);
1136 if (num_writes != got_word.or_hwfree) {
1137 printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
1138 PROGNAME"960", strerror(errno));
1143 /* Save changed word_info structure back to the vista database! */
1144 write_wordstr (&got_word, 0);
1146 } /* write_new_word_2_dtbs() */
1149 /************************/
1153 /************************/
1154 /* Called from write_to_file() in Pass 2.
1155 * Write_to_file() is 'visit node' function of tree traversal.
1156 * It has converted dbalist in each word node in tree to
1157 * array of dbas (record_addr_word [num_addrs_for_word])
1158 * with correct statistical weighting, still in 'host' byte swap order.
1159 * This function seeks word key in database. If word is new,
1160 * it calls functions to write new vista rec and d99 data.
1161 * If word is old it calls functions to read word rec and update d99.
/*
 * fill_data1: update the database and d99 data for one word of the tree.
 *
 * node_word is copied into miker, which is the key passed to
 * find_keyword(), and into got_word.or_hwordkey, truncated so that it
 * fits in DtSrMAXWIDTH_HWORD bytes including the terminating zero.
 * If the key search reports S_NOTFOUND, the counters in got_word are
 * zeroed, fillnew_wordrec() writes an empty word record and
 * write_new_word_2_dtbs() writes the d99 entries.
 * For a word that already exists, read_wordstr() loads the record into
 * got_word and, when that succeeds, write_2_dtbs_addr_file() updates
 * the d99 entries.
 * Along the way it prints progress messages with an estimated time to
 * completion.
 *
 * NOTE(review): the strcpy into miker is not length checked here and
 * the size of miker is not visible in this listing -- confirm that
 * node_word can never be longer than that buffer.
 * NOTE(review): some lines of this function (braces, the shutdown test,
 * the else keyword) are not visible in this listing -- confirm control
 * flow against the complete source.
 */
1163 void fill_data1 (char *node_word)
1166 strcpy (miker, node_word);
1170 printf (catgets (dtsearch_catd, MS_cborodin, 164,
1171 "\n%s Abort due to signal %d. Database %s\n"
1172 " probably corrupted. Restore backup database.\n"),
1173 PROGNAME"164", shutdown_now, dicname);
1177 /* print occasional progress dots and msgs */
1178 if (!(count_word_ii % words_per_dot)) {
1181 if (!(dotcount % 10))
1183 if (dotcount >= 50) {
/* Estimate = (total words / words done so far - 1)
 * multiplied by the seconds elapsed since timestart.
 */
1185 seconds_left = (unsigned long)
1186 (((float) num_of_diff_words /
1187 (float) count_word_ii - 1.) *
1188 (float) (time (NULL) - timestart));
1189 printf (catgets (dtsearch_catd, MS_cborodin, 849,
1190 "\n%s: Word #%ld, %.0f%% done. Est %lum %02lus "
1191 "to completion.\n"),
1192 aa_argv0, count_word_ii,
1193 (float) count_word_ii / (float) num_of_diff_words * 100.0,
1194 /***(count_word_ii * 100L) / num_of_diff_words,***/
1195 seconds_left / 60L, seconds_left % 60L);
1199 } /* endif for progress dots and msgs */
/* Bounded copy of the key field; the last byte is forced to zero
 * because strncpy does not terminate a source that fills the field.
 */
1201 strncpy (got_word.or_hwordkey, node_word, DtSrMAXWIDTH_HWORD);
1202 got_word.or_hwordkey[DtSrMAXWIDTH_HWORD - 1] = 0;
1203 find_keyword (miker, 0); /* vista KEYFIND for word rec */
1204 if (db_status == S_NOTFOUND) { /* this is a NEW word */
1205 got_word.or_hwoffset = 0;
1206 got_word.or_hwfree = 0;
1207 got_word.or_hwaddrs = 0;
1208 fillnew_wordrec (&got_word, 0); /* write (empty) vista word rec */
1209 if (db_status != S_OKAY)
1210 vista_abort (PROGNAME"981");
1211 write_new_word_2_dtbs(); /* write NEW word's d99 entries
1212 * and update vista word rec */
1216 /* update previously existing word */
1217 read_wordstr (&got_word, 0); /* read OLD word rec into got_word */
1218 if (db_status == S_OKAY)
1219 write_2_dtbs_addr_file(); /* update OLD word's d99 entries
1220 * and update vista word rec */
1222 } /* fill_data1() */
1225 /************************************************/
1227 /* load_into_bintree */
1229 /************************************************/
1231 * Loads parsed word token or stem token into
1232 * inverted index binary tree along with passed dba.
1233 * Token is allowed to be empty, ie first byte is \0.
1234 * Derived from Efim's original 'teskey_parse()'
1235 * and bin_tree() functions.
1236 * Variables static for speeeeeeed.
/*
 * load_into_bintree: add one token to the in-memory inverted index
 * tree rooted at root_node, tagged with the record number dba.
 *
 * The body uses three inputs: parser_token (the token text),
 * token_is_stem (nonzero for a stem) and dba. The parameter
 * declarations themselves are not visible in this listing.
 *
 * Visible behavior:
 *  - An empty token only produces a debug trace; presumably the
 *    function then returns (that statement is not visible -- confirm).
 *  - tokbuf is allocated once, on first use, with or_maxwordsz + 4
 *    bytes, and reused on later calls.
 *  - A stem is stored with STEM_CH as its first byte and also
 *    increments dbas_word_count[dba]. In both cases the buffer is
 *    terminated at index or_maxwordsz.
 *  - The tree is searched with strcmp(). For an existing node whose
 *    first dba list entry is this dba, the w_c count of that entry is
 *    incremented; otherwise a new DBALIST entry is pushed at the head
 *    of the list.
 *  - When no node matches, a new TREENODE is allocated with the word
 *    text stored in the same allocation directly after the node, given
 *    a new dba list entry, linked into the tree, and
 *    num_of_diff_words is incremented.
 *
 * NOTE(review): dba is used as an array index without a range check
 * here; the caller in main() checks it against or_maxdba first.
 * NOTE(review): the comparison result tests, the else keywords and the
 * assignments of the dba and w_c fields of new list entries are not
 * visible in this listing -- confirm against the complete source.
 */
1238 static void load_into_bintree (
1243 static DtSrINT16 or_maxwordsz;
1246 static TREENODE **this_link;
1247 static TREENODE *newnode;
1248 static DBALIST *newdba;
1249 static char *tokbuf = NULL;
1251 if (*parser_token == 0) {
1252 if (debugging & DEBUG_I)
1253 printf (" bintr=<empty> dba=%ld\n", (long)dba);
1257 /* Copy token to a buffer.
1258 * Stems have a special prefix character
1259 * to distinguish them from words.
1260 * Also increment total dba word count.
1262 if (tokbuf == NULL) {
1263 or_maxwordsz = dblk.dbrec.or_maxwordsz;
1264 tokbuf = austext_malloc ((size_t) or_maxwordsz + 4,
1265 PROGNAME"1152", NULL);
1267 if (token_is_stem) {
1268 tokbuf[0] = STEM_CH;
1269 strncpy (tokbuf + 1, parser_token, (size_t)or_maxwordsz);
1270 dbas_word_count[dba]++;
1273 strncpy (tokbuf, parser_token, (size_t)or_maxwordsz);
1274 tokbuf [or_maxwordsz] = 0;
1275 if (debugging & DEBUG_I)
1276 printf (" bintr='%s' dba=%ld ", displayable(tokbuf), (long)dba);
1278 /* TREE TRAVERSAL. Search binary tree to find either
1279 * insertion point or identical preexisting token.
1281 for (this_link = &root_node; *this_link != NULL; ) {
1282 i = strcmp (tokbuf, (*this_link)->word);
1284 /* If identical word/stem token already exists... */
1286 /* If token appears more than once in current
1287 * document (dba already exists at top of dba list),
1288 * just increment the word count in the list.
1290 if ((*this_link)->dba_list->dba == dba)
1291 (*this_link)->dba_list->w_c++;
1293 /* If this is first appearance of token for this doc
1294 * (dba is not at start of token's dba list),
1295 * insert dba at start of token's dba list.
1298 if ((newdba = malloc (sizeof(DBALIST))) == NULL) {
1299 printf (catgets (dtsearch_catd, MS_cborodin, 374,
1300 msg_374), PROGNAME"1150");
1305 newdba->next_dba = (*this_link)->dba_list;
1306 (*this_link)->dba_list = newdba;
1308 if (debugging & DEBUG_I)
1309 printf (" Old %ld=%ld\n",
1310 (long)((*this_link)->dba_list->dba),
1311 (long)((*this_link)->dba_list->w_c));
1312 return; /* done with token */
1314 } /* endif where token was found in binary tree */
1316 /* Increment link ptr by descending to correct subtree */
1318 this_link = &(*this_link)->llink;
1319 if (debugging & DEBUG_I)
1323 this_link = &(*this_link)->rlink;
1324 if (debugging & DEBUG_I)
1327 } /* end tree traversal */
1329 /* Tree traversal never found a preexisting token node.
1330 * Create a new node and insert it at the point
1331 * indicated by link ptr.
1333 newnode = austext_malloc (sizeof(TREENODE) + strlen(tokbuf) + 4,
1334 PROGNAME"1234", NULL);
1335 newnode->llink = NULL;
1336 newnode->rlink = NULL;
1337 newnode->word = (char *) (newnode + 1); /* use mem at end of node */
1338 strcpy (newnode->word, tokbuf);
1340 newdba = austext_malloc (sizeof(DBALIST), PROGNAME"1235", NULL);
1341 newnode->dba_list = newdba;
1344 newdba->next_dba = NULL;
/* The search loop ended on a NULL link; store the new node there. */
1346 *this_link = newnode;
1347 num_of_diff_words++;
1349 if (debugging & DEBUG_I)
1350 printf (" New %ld=%ld\n",
1351 (long)((*this_link)->dba_list->dba),
1352 (long)((*this_link)->dba_list->w_c));
1354 } /* load_into_bintree() */
1357 /**********************************************/
1361 /**********************************************/
/*
 * main: program entry point of dtsrindex.
 *
 * Visible flow:
 *  1. Set the locale, open the message catalog, print a startup line
 *     and set default globals (batch size, hash size, parser args).
 *  2. Process command line arguments with user_args_processor().
 *  3. Allocate the input and work buffers, build the d99 file name from
 *     dicpath, dicname and EXT_DTBS, open the database and load the
 *     language files.
 *  4. Allocate and zero the batch bit vector and the per-record word
 *     count array, and allocate the d99 read buffer.
 *  5. Open the d99 file for update, or create it and write a fresh
 *     header when it cannot be opened.
 *  6. Pass 1: read each record of the .fzk input file, look up its key
 *     in the database, convert the database address to a record
 *     number, mark that record in the bit vector, and feed every
 *     parsed word and its stem to load_into_bintree().
 *  7. Pass 2: call traverse_tree(), write the d99 header, close the d99
 *     file, print elapsed times and call DtSearchExit (normal_retncode).
 *
 * NOTE(review): no return type is declared on this definition.
 * NOTE(review): many lines of this function (braces, labels, exit calls
 * and some statements) are not visible in this listing, so error paths
 * are not described here -- confirm against the complete source.
 */
1362 main (int argc, char **argv)
1365 long word_offset; /* <-- PARG.offsetp */
1366 long bytes_in; /* ftell() */
1367 DtSrINT32 dba_offset;
1371 char db_key [DtSrMAX_DB_KEYSIZE + 2];
1373 register DtSrINT32 cur_byte;
1375 DB_ADDR dba, temp_dba;
1378 char *parsebufp, *stembufp;
1380 /******************* INITIALIZE ******************/
1381 setlocale (LC_ALL, "");
1382 dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
1384 aa_argv0 = strdup (argv[0]);
1386 tmptr = localtime (&elapsed);
1387 strftime (buf, sizeof(buf),
1388 catgets (dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
1390 printf (catgets (dtsearch_catd, MS_cborodin, 1, "%s. Run %s.\n"),
1392 austext_exit_last = print_exit_code;
1393 batch_size = BATCH_SIZE;
1394 init_user_interrupt ();
1395 default_hashsize = duprec_hashsize;
1397 memset (&dblk, 0, sizeof(DBLK));
1399 memset (&parg, 0, sizeof(PARG));
1401 parg.etxdelim = ETXDELIM; /* default, can be changed */
1402 parg.offsetp = &word_offset;
1403 parg.flags |= PA_INDEXING; /* do compounding, if parser can */
1405 /* Read user specified command line arguments */
1406 user_args_processor (argc, argv);
1408 /* Finish init now that we know final values */
1409 inbuf = austext_malloc (inbufsz + 16, PROGNAME"1349", NULL);
1410 temp = austext_malloc (inbufsz + 16, PROGNAME"1285", NULL);
1411 sprintbuffer = austext_malloc (inbufsz + _POSIX_PATH_MAX + 16,
1412 PROGNAME"1286", NULL);
1413 record_addr_word = austext_malloc ((sizeof(DB_ADDR) * batch_size) + 16,
1414 PROGNAME "1133", NULL);
1416 /* Save dicname and path in dblk. Save full name of d99 file. */
1417 strcpy (dblk.name, dicname);
1418 dblk.path = dicpath;
1419 strcpy (dtbs_addr_file, dicpath);
1420 strcat (dtbs_addr_file, dicname);
1421 strcat (dtbs_addr_file, EXT_DTBS);
1423 /* Open the database */
1424 if (!austext_dopen (dicname, dicpath, NULL, cache_size, &dbrec)) {
1425 fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
1428 memcpy (&dblk.dbrec, &dbrec, sizeof(DBREC));
1430 /* Load database's parser, stemmer, and linguistic files into dblk. */
1431 if (!load_language (&dblk, NULL)) {
1432 puts (DtSearchGetMessages());
1433 printf (catgets (dtsearch_catd, MS_cborodin, 1097,
1434 "%s Aborting due to errors in loading language files.\n"),
/* Get the database address of an OR_OBJREC record (presumably the
 * first one -- confirm RECFRST semantics); its low 24 bits are used
 * below to compute dba_offset.
 */
1439 RECFRST (PROGNAME "1067", OR_OBJREC, 0);
1440 CRGET (PROGNAME "1069", &dba, 0); /* byte swap already done in vista */
1442 or_reccount = dbrec.or_reccount; /* DtSrINT32 */
1443 or_recslots = dbrec.or_recslots; /* promoted to DtSrINT32 */
1444 or_maxdba = dbrec.or_maxdba; /* DtSrINT32 lim of dbas_word_count */
1445 bit_vector_size = ((or_maxdba / or_recslots + 1) >> 3) + 1; /* DtSrINT32 */
1446 dba_offset = or_recslots - (dba & 0x00FFFFFF); /* DtSrINT32 */
1449 printf (PROGNAME"1286 "
1450 "realnumrec=%ld recslots=%ld bitvecsz=%ld"
1451 " dbaoffset=%d maxdba=%ld\n",
1452 (long)or_reccount, (long)or_recslots, (long)bit_vector_size,
1453 (int)dba_offset, (long)or_maxdba);
1455 /* Allocate memory space for the arrays.
1456 * dbas_bits_batch = 'bit vector', one bit for every possible rec#.
1457 * the 1 bits = only the dba's that are in this fzk batch.
1458 * word_addrs_ii = fread buffer for d99 file.
1459 * dbas_word_count = summing bkts for word count statistics.
1461 dbas_bits_batch = (char *) austext_malloc ((size_t)bit_vector_size + 48,
1462 PROGNAME "1150", NULL);
1463 word_addrs_ii = (DB_ADDR *) austext_malloc (
1464 sizeof (DB_ADDR) * (or_reccount + 1) + 48,
1465 PROGNAME "1152", NULL);
1466 mallocsz = sizeof(DtSrINT32) * (or_maxdba + 1) + 48;
1467 dbas_word_count = (DtSrINT32 *) austext_malloc (mallocsz,
1468 PROGNAME "1154", NULL);
1469 memset (dbas_bits_batch, 0, (size_t)bit_vector_size + 48);
1470 memset (dbas_word_count, 0, mallocsz);
1474 /* Open the d99 file that contains database addresses.
1475 * If the file doesn't exist, it means the database
1476 * for keyword search is empty - open a new file.
1478 if ((dtbs_addr_fp = fopen (dtbs_addr_file, "r+b")) == NULL) {
1479 dtbs_addr_fp = fopen (dtbs_addr_file, "w+b");
1480 check_existing_addrs = FALSE;
1481 new_dtbs_file = TRUE;
1482 if (dtbs_addr_fp == NULL) {
1483 /* msg 1068 used multiple places */
1484 printf (catgets (dtsearch_catd, MS_cborodin, 1068,
1485 "%s Can't open new inverted index file '%s': %s\n"),
1486 PROGNAME"1068", dtbs_addr_file, strerror(errno));
1489 /* write New Header Information to a file */
1490 init_header (dtbs_addr_fp, &fl_hdr);
1493 /* read Header Information from d99 file */
1494 if (!fread_d99_header (&fl_hdr, dtbs_addr_fp)) {
1495 /* msg 1068 used multiple places */
/* NOTE(review): message 1068 is requested here with a different
 * default text than in the open failure branch above. If the message
 * catalog supplies a single text for 1068, one of the two failures is
 * reported with the wrong wording -- confirm.
 */
1496 printf (catgets (dtsearch_catd, MS_cborodin, 1068,
1497 "%s Can't read header data for '%s': %s\n"),
1498 PROGNAME"1422", dtbs_addr_file, strerror(errno));
1503 /* open input .fzk file */
1504 src = getcwd (sprintbuffer, _POSIX_PATH_MAX);
1505 if (!src && debugging)
1506 printf (PROGNAME"1336 Can't getcwd: %s.\n", strerror(errno));
1508 src = getenv ("PWD");
1509 printf (catgets (dtsearch_catd, MS_misc, 24,
1510 "%s: current working directory = '%s', .fzk file = '%s'\n"),
1512 (src) ? src : catgets (dtsearch_catd, MS_misc, 6, "<unknown>"),
1514 if ((instream = fopen (fname_input, "rt")) == NULL) {
1516 printf (catgets (dtsearch_catd, MS_cborodin, 1083,
1517 "%s Can't read input file '%s': %s\n"),
1518 PROGNAME"1083", fname_input, strerror(errno));
/* The st_size value obtained here is used by the Pass 1 progress
 * estimate further down.
 */
1521 if (fstat (fileno (instream), &fstat_input) == -1)
1522 goto BAD_INPUT_FILE;
1523 parg.ftext = instream; /* for readchar_ftext(), discard_to_ETX() */
1525 time (&totalstart); /* for total elapsed time */
1526 timestart = totalstart; /* for Pass 1 elapsed time */
1528 /*------------ PASS 1: ------------
1529 * Main Read Loop. For each text record in input file,
1530 * parse and stem words, store them into binary tree
1531 * inverted index in memory.
1532 * The first few lines are database administrative values.
1533 * They are presumed ascii and read with fgets() as
1534 * 'lines' terminated with \n. The text of the document
1535 * itself is presumed to be in the appropriate database
1536 * 'language', so it is *not* presumed to be lines
1537 * terminated with \n. The document text is read by
1538 * the language's parser() a 'word' at a time, which
1539 * ultimately means a byte at a time.
1541 printf (catgets (dtsearch_catd, MS_cborodin, 1108,
1542 "%s: Beginning Pass 1, reading records from '%s'.\n"
1543 " Each dot = %d records.\n"),
1544 aa_argv0, fname_input, recs_per_dot);
1547 while (!feof(instream)) {
1549 /* 1. Read and discard the FZKEY line.
1550 * 2. Read and discard the ABSTRACT line.
1551 * 3. Read the UNIQUE KEY for the record.
1552 * Do some record initialization steps here.
1553 * 4. Read and discard the DATE line.
1554 * 5. Let the parser read and parse rest of record, ie doc text...
1557 /*----- READ LINE #1, fzkey -----*/
1558 if (fgets (inbuf, inbufsz, instream) == NULL)
1560 inbuf [inbufsz] = 0; /* just to be sure */
/* NOTE(review): message 164 is requested here with four arguments,
 * while fill_data1() requests the same message number with a default
 * text that takes three. A single catalog entry cannot match both
 * argument lists -- confirm the catalog text.
 */
1563 printf (catgets (dtsearch_catd, MS_cborodin, 164,
1564 "\n%s: %s Abort due to signal %d. Database %s\n"
1565 " possibly corrupted. Restore backup database.\n"),
1566 aa_argv0, PROGNAME"1299", shutdown_now, dicname);
1570 /* Silently skip null records just like dtsrload */
1571 if (strcmp (inbuf, parg.etxdelim) == 0)
1576 /*----- READ LINE #2, abstract -----*/
1577 if (fgets (inbuf, inbufsz, instream) == NULL) {
1579 printf (catgets (dtsearch_catd, MS_cborodin, 1129,
1580 "%s: %s Invalid .fzk file format.\n"),
1581 fname_input, PROGNAME"1129");
1584 inbuf[inbufsz] = 0; /* just to be sure */
1586 /*--- READ LINE #3, unique database key ---*/
1587 if (fgets (inbuf, inbufsz, instream) == NULL)
1588 goto INVALID_FZK_FORMAT;
1589 inbuf[inbufsz] = 0; /* just to be sure */
1591 if ((cptr = strtok (inbuf, " \t\n")) == NULL)
1592 goto INVALID_FZK_FORMAT;
1594 /* If necessary, discard long keys exactly like cravel */
1595 if (strlen (cptr) >= DtSrMAX_DB_KEYSIZE) {
1596 printf (catgets (dtsearch_catd, MS_cborodin, 659,
1597 "\n%s: %s Discarding record, key too long:\n '%s'.\n"),
1598 aa_argv0, PROGNAME"659", cptr);
1599 discard_to_ETX (&parg);
1602 strcpy (db_key, cptr);
1604 /* Skip duplicate record ids in same order as dtsrload */
1605 i = is_duprec (db_key);
1606 if (i == 2) { /* out of memory */
1607 printf (catgets (dtsearch_catd, MS_cborodin, 374, msg_374),
1611 else if (i == 1) { /* duplicate record id */
1615 printf (catgets (dtsearch_catd, MS_cborodin, 1402,
1616 "%s: Discarded duplicate rec #%lu '%s'.\n"),
1617 aa_argv0, record_count, db_key);
1618 discard_to_ETX (&parg);
1622 /****** FFFFFFFFFFFFFFFFFFFFF **********/
1623 /* Convert database address (slot #) to 'record number',
1624 * what dba would have been if all records took up
1625 * only one slot and there were no dbrec at top of file.
1626 * Record numbers on d99, like dba's, start at #1,
1627 * but rec numbers here (in bit vector) start at #0.
1629 KEYFIND (PROGNAME "222", OR_OBJKEY, (char *) db_key, 0);
1630 if (db_status != S_OKAY) {
1631 normal_retncode = 1; /* = 'warning' */
1634 printf (catgets (dtsearch_catd, MS_cborodin, 1168,
1635 "%s: %s Discarded '%s', key not in database.\n"),
1636 aa_argv0, PROGNAME"1168", displayable(db_key));
1637 discard_to_ETX (&parg);
1641 CRGET (PROGNAME "224", &temp_dba, 0); /* vista already byte swapped */
1642 temp_dba &= 0x00FFFFFF; /* = slot# */
1643 dba = (temp_dba + dba_offset) / or_recslots; /* = rec#, base 1 */
1645 * Don't change this 'dba'!--eventually it goes
1646 * into d99 in this exact format! It will also
1647 * be used as an index into dbas_word_count[] in
1648 * load_into_bintree() so do a sanity check
1649 * to make sure that it hasn't exceeded the size
1650 * of that array. (The count increments have been
1651 * reported as as 'uninitialized memory reads'
1652 * by a debugger). This might happen for example
1653 * if user failed to run dtsrload before dtsrindex?
1655 if (dba < 1 || dba > or_maxdba) {
1656 printf ( catgets(dtsearch_catd, MS_cborodin, 21,
1657 "\n%s '%s' record overflows word counter array.\n"
1658 "Record number %ld > maxdba %ld, dba=%ld, "
1659 "recslots=%ld, offs=%d.\n") ,
1660 PROGNAME"1526", displayable(db_key),
1661 (long)dba, (long)or_maxdba, (long)temp_dba,
1662 (long)or_recslots, (int)dba_offset);
1665 temp_dba = dba - 1; /* = rec# starting at 0 */
1666 cur_byte = temp_dba >> 3; /* bits to bytes: div by 8 */
1667 if (cur_byte >= bit_vector_size) {
1668 printf ( catgets(dtsearch_catd, MS_cborodin, 22,
1669 "\n%s '%s' record in database (dba=%ld)\n"
1670 " overflows bitvector allocation (%ld >= %ld).\n") ,
1671 PROGNAME"1475", displayable(db_key), (long)dba,
1672 (long)cur_byte, (long)bit_vector_size);
/* Mark this record in the batch bit vector:
 * byte index = rec# / 8, bit within the byte = rec# % 8.
 */
1675 dbas_bits_batch[cur_byte] |= 1 << (temp_dba % 8);
1677 /* Print occasional progress dots and msgs */
1678 if (!(record_count % recs_per_dot)) {
1681 if (!(dotcount % 10))
1683 if (dotcount >= 50) {
1685 bytes_in = ftell (instream);
1686 seconds_left = (unsigned long)
1687 (((float) fstat_input.st_size /
1688 (float) bytes_in - 1.) *
1689 (float) (time (NULL) - timestart));
1690 printf (catgets (dtsearch_catd, MS_cborodin, 1190,
1691 "\n%s: Rec #%lu, %.0f%% done. "
1692 "Est %lum %02lus to end Pass 1.\n"),
1695 (float) bytes_in / (float) fstat_input.st_size * 100.0,
1696 seconds_left / 60UL,
1697 seconds_left % 60UL);
1702 /*----- READ LINE #4, date -----*/
1703 if (fgets (inbuf, inbufsz, instream) == NULL)
1704 goto INVALID_FZK_FORMAT;
1705 inbuf[inbufsz] = 0; /* just to be sure */
1707 /* PARSE LOOP FOR CURRENT TEXT BLOCK.
1708 * We must be in the middle of a record ('lines' #5 and beyond).
1709 * From here to ETX, which is either the record delimiter string
1710 * or the end of file, read the file a 'word' at a time
1711 * using the parse() function for the language specified
1713 * Load_into_bintree() stores each token into
1714 * inverted index binary tree.
1715 * Note: dba here MUST still be rec#, base 1.
1716 * It's stored as is by load_into_bintree(),
1717 * and will be moved as is into d99 file in Pass 2.
1719 if (debugging & DEBUG_P)
1720 printf ("\nRecord #%lu '%s'\n"
1721 "Offset Word---- Stem----\n",
1722 record_count, db_key);
1723 for ( cptr = dblk.parser (&parg);
1725 cptr = dblk.parser (NULL)) {
1727 if (debugging & DEBUG_P) {
1728 printf ("%6ld %s %n", word_offset, cptr, &i);
1729 if (!(debugging & DEBUG_I))
1733 load_into_bintree (cptr, FALSE, dba);
1734 cptr = dblk.stemmer (cptr, &dblk);
1735 if (debugging & DEBUG_P) {
1736 printf ("%s\n", cptr);
1739 load_into_bintree (cptr, TRUE, dba);
1742 } /* end of PASS 1 Main read loop */
1744 elapsed = time(NULL) - timestart;
1749 if (duplicate_recids > 0L) {
1750 normal_retncode = 1; /* 'warning' */
1751 sprintf (buf, catgets (dtsearch_catd, MS_cborodin, 40,
1752 "Ignored %ld duplicate records"),
1756 strcpy (buf, catgets (dtsearch_catd, MS_cborodin, 41,
1757 "No duplicate records found"));
1758 printf (catgets (dtsearch_catd, MS_cborodin, 1225,
1759 "%s: Pass 1 completed in %lum %lus, read %lu records.\n"
1760 " %s, parsed %lu words.\n"),
1761 aa_argv0, elapsed / 60L, elapsed % 60L, record_count,
1762 buf, num_of_diff_words);
1763 if (record_count > batch_size) {
1764 printf (catgets (dtsearch_catd, MS_cborodin, 33,
1765 "\n%s Number of incoming records exceeded %d.\n"
1766 " This will usually result in 'Out of Paging Space' "
1768 " and corruption of database. Either split the incoming file to\n"
1769 " reduce record count or use the -b option, and rerun.\n"),
1770 PROGNAME"33", (int)batch_size);
1774 /*----------------- PASS 2: -----------------
1775 * Traverse completed binary tree and write it to d99 file.
1777 printf (catgets (dtsearch_catd, MS_cborodin, 1233,
1778 "%s: Beginning Pass 2: batch index traversal and database update.\n"
1779 " Each dot = %d words.\n"),
1780 aa_argv0, words_per_dot);
1783 traverse_tree (); /* actual Pass 2 */
1789 /* Write header information to the d99 file */
1790 if (!fwrite_d99_header (&fl_hdr, dtbs_addr_fp)) {
1791 printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
1792 PROGNAME"1723", strerror(errno));
1796 fclose (dtbs_addr_fp);
1798 elapsed = time (NULL) - timestart;
1799 printf (catgets (dtsearch_catd, MS_cborodin, 1246,
1800 "%s: Pass 2 completed in %lum %lus, updated %lu words.\n"),
1801 aa_argv0, elapsed / 60L, elapsed % 60L, count_word_ii);
1802 if (normal_retncode == 1)
1803 printf (catgets (dtsearch_catd, MS_cborodin, 2,
1804 "%s: Warnings were detected.\n"), aa_argv0);
1805 DtSearchExit (normal_retncode);
1809 /*************************** DTSRINDEX.C ****************************/