cde/programs/dtsr/dtsrindex.c

   1 /*
   2  *   COMPONENT_NAME: austext
   3  *
   4  *   FUNCTIONS: descend_tree
   5  *              displayable
   6  *              fill_data1
   7  *              load_into_bintree
   8  *              main
   9  *              print_exit_code
  10  *              print_usage_msg
  11  *              put_addrs_2_dtbs_addr_file
  12  *              segregate_dicname
  13  *              traverse_tree
  14  *              user_args_processor
  15  *              write_2_dtbs_addr_file
  16  *              write_new_word_2_dtbs
  17  *              write_to_file
  18  *
  19  *   ORIGINS: 27
  20  *
  21  *
  22  *   (C) COPYRIGHT International Business Machines Corp. 1992,1996
  23  *   All Rights Reserved
  24  *   Licensed Materials - Property of IBM
  25  *   US Government Users Restricted Rights - Use, duplication or
  26  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  27  */
  28 /************************ DTSRINDEX.C *******************************
  29  * $XConsortium: dtsrindex.c /main/10 1996/09/23 21:02:54 cde-ibm $
  30  * CDE version of borodin.c
  31  * Formerly dtsrindex.c was cborodin.c.
  32  *
  33  * INPUT FORMAT:
  34  * Text file in FZK format.
  35  * Each record contains 4 formatted 'lines' (text strings ending in \n):
  36  * 1. fzkey (not used in this program).
  37  * 2. abstract (not used in this program).
  38  * 3. unique database key for the record.  Used to find the database
  39  *    address of the record which is the reference for the inverted index.
  40  * 4. The record's date (not used in this program).
  41  *
  42  * The rest of the record is unformatted text (not necessarily organized
  43  * into 'lines').  It is read a character at a time and parsed into
  44  * individual words by the parser function for the database's language.
  45  * Each record ends with a delimiter string specified by command line arg.
  46  *
  47  * $Log$
  48  * Revision 2.8  1996/04/10  19:50:38  miker
  49  * Deleted dangerous and unnecessary -a option.
  50  *
  51  * Revision 2.7  1996/03/25  18:54:15  miker
  52  * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  53  *
  54  * Revision 2.6  1996/02/01  18:25:44  miker
  55  * AusText 2.1.11, DtSearch 0.3.  Pass 1 changed to accommodate
  56  * new single-character reading parser/stemmers.
  57  *
  58  * Revision 2.5  1995/12/29  17:16:04  miker
  59  * Bug fix: Opened wrong msg catalog.
  60  *
  61  * Revision 2.4  1995/12/27  21:18:40  miker
  62  * Msg bug: 'percent done' was negative number.
  63  *
  64  * Revision 2.3  1995/12/01  16:15:44  miker
  65  * Deleted unnecessary log2 var, conflict with Solaris function.
  66  * Added -r command line arg.
  67  *
  68  * Revision 2.2  1995/10/26  15:26:53  miker
  69  * Added prolog.
  70  *
  71  * Revision 2.1  1995/09/22  19:29:53  miker
  72  * Freeze DtSearch 0.1, AusText 2.1.8
  73  *
  74  * Revision 1.3  1995/09/05  21:08:54  miker
  75  * Fixed bug: appeared as if 1 and 2 char 'words' were being indexed.
  76  * Added DEBUG_P switch.
  77  *
  78  * Revision 1.2  1995/09/01  22:17:02  miker
  79  * Fixed solaris segfault: too many args to printf in print_usage().
  80  *
  81  * Revision 1.1  1995/08/31  20:51:08  miker
  82  * Initial revision of dtsrindex.c, copied from cborodin.c.
  83  *
  84  * Log: cborodin.c,v
  85  * Revision 1.18  1995/05/30  18:58:54  miker
  86  * Correct bug introduced by previous fix (2.1.5c).
  87  *
  88  * Revision 1.17  1995/05/18  22:54:08  miker
  89  * 2.1.5b cborodin bug.  Segfault due to overflowing bitvector
  90  * after many deletions and no mrclean.
  91  */
  92 #include "SearchP.h"
  93 #include <limits.h>
  94 #include <stdlib.h>
  95 #include <unistd.h>
  96 #include <string.h>
  97 #include <ctype.h>
  98 #include <time.h>
  99 #include <errno.h>
 100 #include <math.h>
 101 #include <sys/stat.h>
 102 #include <locale.h>
 103 #include "vista.h"
 104
/* Prototypes declared ahead of use.
 * fill_data1() is called by write_to_file() at every Pass 2 visit
 * point with the node's word string.  The other three are not called
 * in the part of the file reviewed here; going by their names and
 * args they look up, read and write word records -- TODO confirm.
 */
extern void     find_keyword (char *cur_word, int vista_num);
extern void     read_wordstr (struct or_hwordrec * glob_word, int vista_num);
extern void     write_wordstr (struct or_hwordrec * glob_word, int vista_num);
extern void     fill_data1 (char *ch);
 109
/* Program id.  A message number is pasted onto it to build the
 * location strings printed in messages (eg PROGNAME"280").
 */
#define PROGNAME        "DTSRINDEX"

#define BATCH_SIZE      10000L  /* default max batch size quoted for -b in usage msg */
#define WORDS_PER_DOT   500     /* initial value of 'words_per_dot' */
#define RECS_PER_DOT    20      /* default Pass 1 records-per-dot (-r) */
#define INBUFSZ         1024    /* default input text header line size */
/* Message catalog set numbers.  MS_cborodin is the set id passed
 * to the catgets() calls in this file.
 */
#define MS_misc         1
#define MS_cborodin     14
 118
 119 /******************* BIT VECTORS *****************/
DB_ADDR        *word_addrs_ii;          /* fread buf for d99 (= tot # dbas) */
/* Indexed by record number in write_to_file(), where it supplies
 * the total word count of a document for the weight calculation.
 */
DtSrINT32       *dbas_word_count;
char           *dbas_bits_batch;
/* Array of reformatted dbas (rec# in hi 3 bytes, weight in low byte)
 * built by write_to_file() for one word node at a time.
 */
DB_ADDR        *record_addr_word;
DtSrINT32       num_addrs_for_word;     /* curr number of addrs in record_addr_word[] */
DtSrINT32       or_reccount;
DtSrINT32       bit_vector_size;
 127
/*-------------------------- GLOBALS ----------------------------*/
/* batch_size also used by fileman.c for allocating unused holes
 * in order to not go past end of 'record_addr_word' array.
 * Changed by the -b command line arg.
 */
extern DtSrINT32  batch_size;

char            buf[1024];
static int      cache_size =            CACHE_SIZE;     /* -c arg */
static int      check_existing_addrs =  TRUE;
long            count_word_ii =         0L;
long            dbkey_seqno =           0L;
DBLK            dblk;           /* name and path loaded from -d arg */
DBREC           dbrec;
static int      debugging =             0;      /* bit flags set by -D arg: */
  #define DEBUG_I       0x01    /* P1 tree insertions */
  #define DEBUG_P       0x10    /* P1 parser/stemmer */
  #define DEBUG_T       0x02    /* P2 tree dump (words) */
  #define DEBUG_N       0x04    /* P2 NEW words, vista */
  #define DEBUG_O       0x08    /* P2 OLD words, vista */
  #define DEBUG_t       0x20    /* P2 tree dump (dbas) */
  #define DEBUG_n       0x40    /* P2 NEW d99 for new words */
  #define DEBUG_o       0x80    /* P2 OLD d99 updates for old words */
/* Quoted in the usage msg as the default for the -h arg */
static unsigned long
                default_hashsize;
char            dicname [10];                   /* database name from -d arg */
char            dicpath [_POSIX_PATH_MAX];      /* path prefix from -d arg */
/* When nonzero, print_exit_code() first prints a newline and zeros it */
static int      dotcount =              0;
char            dtbs_addr_file [_POSIX_PATH_MAX];
FILE           *dtbs_addr_fp;   /* stream to which d99 addrs are written */
long            dtbs_size_records =     0L;
static long     duplicate_recids =      0L;
struct stat     fstat_input;
FILE_HEADER     fl_hdr;         /* passed to the d99 free space functions */
static char     fname_input [_POSIX_PATH_MAX];  /* input file name from cmd line */
/* Word record whose or_hwaddrs, or_hwfree and or_hwoffset
 * fields are maintained when d99 addrs are written.
 */
struct or_hwordrec
                got_word;
static FILE    *instream;
char            *inbuf;
int             inbuf_overflowed =      FALSE;
size_t          inbufsz =               INBUFSZ;        /* -i arg */
int             is_pmr;
static DtSrINT32
                or_maxdba =             0;
static char     msg_374[] =     "\n%s Out of Memory!\n"
                                "  Split the incoming file into several "
                                "smaller files and try again.\n";
static char     msg_776[] =     "\n%s Write Failure d99 file: %s\n";
char            new_dtbs_file =         FALSE;
long            num_of_diff_words =     0L;
int             normal_retncode =       0;
static PARG     parg;           /* parg.etxdelim is loaded from -t arg */
int             parsep_char =           END_RETAIN_PAGE;
char            rec_type;
unsigned long   record_count =          0UL;
int             record_lines;
static int      recs_per_dot =          RECS_PER_DOT;   /* -r arg */
static unsigned long
                seconds_left;
extern int      shutdown_now;
static DtSrINT32
                or_recslots;
char            *sprintbuffer =         NULL;
char            *temp =                 NULL;
extern int      debugging_teskey;
time_t          timestart =             0;
/* Replaced by total elapsed seconds in print_exit_code() if > 0 */
time_t          totalstart =            0;
static int      words_per_dot =         WORDS_PER_DOT;
 195
 196 /************************************************/
 197 /*                                              */
 198 /*                   DBALIST                    */
 199 /*                                              */
 200 /************************************************/
/* One member of the singly linked list of documents hanging
 * off each tree node (TREENODE.dba_list).
 */
typedef struct dba_str {
    DB_ADDR             dba;            /* record number; shifted into hi bytes by write_to_file() */
    DtSrINT32           w_c;            /* number of times the node's word occurs in that doc */
    struct dba_str      *next_dba;      /* next list member, NULL at end of list */
}               DBALIST;
 206
 207 /************************************************/
 208 /*                                              */
 209 /*                   TREENODE                   */
 210 /*                                              */
 211 /************************************************/
/* One node of the binary tree walked in Pass 2 by traverse_tree()
 * and descend_tree().  Each node is a word or stem plus a linked
 * list of dbas.  The traversal temporarily overwrites the link
 * fields, and 'word' is what gets displayed and passed to fill_data1().
 */
typedef struct _treen_ {
    char           *word;       /* ptr to word in stop list */
    struct _treen_ *llink;      /* left link in binary tree */
    struct _treen_ *rlink;      /* ptr to right link in binary tree */
    DBALIST        *dba_list;   /* head of this word's list of docs */
}               TREENODE;
 218
/* Tree root; NULL means empty tree, which traverse_tree() rejects */
static TREENODE *root_node =            NULL;
/* State shared by the coroutines traverse_tree() and descend_tree().
 * 'stack' chains spare leaf nodes through their link fields and
 * 'top_of_stack' is the tree node most recently pushed.
 */
static TREENODE *top_of_stack;
static TREENODE *stack;
static TREENODE *pres;          /* node currently being examined */
static TREENODE *prev;          /* node 'pres' was reached from */
static TREENODE *next;          /* scratch ptr used while swapping links */
static TREENODE *avail_node;    /* last leaf visited by descend_tree() */
 226
 227
 228
 229 /************************************************/
 230 /*                                              */
 231 /*                 displayable                  */
 232 /*                                              */
 233 /************************************************/
 234 /* Returns static string same as passed string except nonprintable
 235  * and nonascii chars replaced by '^' for display.
 236  */
 237 static char     *displayable (char *passed_string)
 238 {
 239     static char         *buf =          NULL;
 240     static size_t       buflen =        0;
 241     size_t              passed_len =    strlen (passed_string);
 242     char                *targ, *src;
 243     if (buflen < passed_len) {
 244         if (buf)
 245             free (buf);
 246         buflen = passed_len;
 247         buf = austext_malloc (buflen + 4, PROGNAME"158", NULL);
 248     }
 249     targ = buf;
 250     for (src = passed_string;  *src != 0;  src++) {
 251         if (*src >= 32  && *src < 127)
 252             *targ++ = *src;
 253         else
 254             *targ++ = '^';
 255     }
 256     *targ = 0;
 257     return buf;
 258 } /* displayable() */
 259
 260
 261 /************************************************/
 262 /*                                              */
 263 /*               print_exit_code                */
 264 /*                                              */
 265 /************************************************/
 266 /* Called from inside DtSearchExit() at (*austext_exit_last)() */
 267 static void     print_exit_code (int exit_code)
 268 {
 269     if(dotcount) {
 270         putchar ('\n');
 271         dotcount = 0;
 272     }
 273     /* Put total seconds into totalstart */
 274     if (totalstart > 0)
 275         totalstart = time (NULL) - totalstart;
 276     printf (catgets (dtsearch_catd, MS_cborodin, 206,
 277         "%s: Exit Code = %d, Total elapsed time %ldm %lds.\n"),
 278         aa_argv0, exit_code, totalstart / 60L, totalstart % 60L);
 279     return;
 280 }       /* print_exit_code() */
 281
 282
 283 /****************************************/
 284 /*                                      */
 285 /*           write_to_file()            */
 286 /*                                      */
 287 /****************************************/
 288 /* This is the 'visit node' point for the tree traversal
 289  * functions of Pass 2 (traverse_tree() and descend_tree()).
 290  *
 291  * Each tree node = word or stem + linked list of dbas.
 292  * When called, each dba list member just contains the number
 293  * of times the token appears in that document.  This function
 294  * chains through the list, builds a statistical 'weight'
 295  * for each doc/word pair, and stores it as a reformatted 'dba'
 296  * in array 'record_addr_word[]', in 'host' byte swap order.
 297  * The count of the current number of addrs
 298  * in the array is stored in 'num_addrs_for_word'.
 299  * Fill_data1() is then called to update or write a new
 300  * vista record and d99 data for the token.
 301  *
 302  * The weight stored for each doc-word instance is 1 byte.
 303  * It's the ratio of log of number of times given word occurs in doc,
 304  * divided by log of total count of all words in doc,
 305  * scaled to range 0 to 255.
 306  * Fundamentally it's a word count of that word in the doc,
 307  * but adjusted as follows:
 308  * 1) Large occurrances in small documents weigh more than
 309  *    the same number of occurrances in large documents.
 310  * 2) Taking the log skews the ratio to be more linear,
 311  *    ie take advantage of higher ranges of the 'weight'.
 312  *    For example a word that occurs in 10% of the document,
 313  *    will have a weight of .5 (50%).
 314  * 3) The scaling changes the ratio, a float between 0. and .9999,
 315  *    to an integer between 0 and 255.
 316  */
 317 void            write_to_file (TREENODE * output_node)
 318 {
 319     DBALIST     *print_dba;
 320     DB_ADDR     mydba;
 321
 322     /* 'record_addr_word[]' was permanently allocated
 323      * with a size = max batch size so it can hold
 324      * all the addrs for a single word node in the tree.
 325      * In effect it will replace the dba linked list.
 326      * Note: word_addrs_ii (io buffer for d99 file) != record_addr_word[].
 327      */
 328
 329     if (debugging & (DEBUG_T | DEBUG_t)) {      /* Print out tree node */
 330         printf (" node '%s' %c%c%c",
 331             displayable(output_node->word),
 332             (output_node->llink)? 'L' : '.',
 333             (output_node->rlink)? 'R' : '.',
 334             (debugging & DEBUG_t)? '\n' : ' ');
 335     }
 336
 337     num_addrs_for_word = 0;     /* DtSrINT32 */
 338     print_dba = output_node->dba_list;
 339     while (print_dba != NULL) {
 340
 341         mydba = print_dba->dba;
 342         if (debugging & DEBUG_t)
 343             printf ("    dba #%ld: node adr=%ld cnt=%ld",
 344                 (long)num_addrs_for_word, (long)mydba, (long)print_dba->w_c);
 345
 346         record_addr_word [num_addrs_for_word] =
 347             mydba << 8;  /* rec# in hi 3 bytes */
 348         record_addr_word [num_addrs_for_word] +=
 349             (log ((double) (print_dba->w_c) + 0.5) /
 350             log ((double) (dbas_word_count[mydba] + 1))) * 256;
 351
 352         if (debugging & DEBUG_t)
 353             printf ("  -> x%lx (%ld:%ld)\n",
 354                 (long)record_addr_word [num_addrs_for_word],
 355                 (long)record_addr_word [num_addrs_for_word] >> 8,
 356                 (long)record_addr_word [num_addrs_for_word] & 0xffL);
 357
 358         print_dba = print_dba->next_dba;
 359         num_addrs_for_word++;
 360         if (num_addrs_for_word >= batch_size) {
 361             printf (catgets (dtsearch_catd, MS_cborodin, 280,
 362                 "\n%s num_addrs_for_word (%ld) >= batchsz (%ld).\n"),
 363                 PROGNAME"280", (long)num_addrs_for_word, (long)batch_size);
 364             DtSearchExit (91);
 365         }
 366     }
 367     if ((debugging & DEBUG_T)  && !(debugging & DEBUG_t))
 368         printf (" dbacnt=%ld\n", (long)num_addrs_for_word);
 369
 370     fill_data1 (output_node->word);
 371
 372     return;
 373 } /* write_to_file() */
 374
 375
 376 /****************************************/
 377 /*                                      */
 378 /*           descend_tree()             */
 379 /*                                      */
 380 /****************************************/
 381 /* Coroutine of traverse_tree(), Pass 2 Robson tree traversal.
 382  * The write_to_file() function is the 'preorder visit' point.
 383  */
 384 void            descend_tree (void)
 385 {
 386     int             not_done = TRUE;
 387
 388     while (not_done) {
 389         if ((pres->llink == NULL) && (pres->rlink == NULL)) {
 390             write_to_file (pres);
 391             avail_node = pres;
 392             return;
 393         }
 394         if (pres->llink != NULL) {
 395             next = pres->llink;
 396             pres->llink = prev;
 397             prev = pres;
 398             pres = next;
 399         }
 400         else {
 401             write_to_file (pres);
 402             next = pres->rlink;
 403             pres->rlink = prev;
 404             prev = pres;
 405             pres = next;
 406         }
 407     }
 408     return;
 409 } /* descend_tree() */
 410
 411
 412 /********************************/
 413 /*                              */
 414 /*        traverse_tree         */
 415 /*                              */
 416 /********************************/
 417 /* This is the actual Pass 2 function, a tree traversal
 418  * of Pass 1's word-dba binary tree.
 419  * The algorithm is based on the J. M. ROBSON link inversion traversal
 420  * algorithm for binary trees. Ref. Thomas A. STANDISH  pp. 77-78.
 421  * The write_to_file() function is the 'preorder visit' point.
 422  */
 423 void            traverse_tree (void)
 424 {
 425     int             not_done = TRUE;
 426     int             descend = TRUE;
 427
 428     /* Dheck for the empty tree */
 429     if (root_node == NULL) {
 430         printf (catgets (dtsearch_catd, MS_cborodin, 288,
 431             "%s Abort. There are no words in the input file %s.\n"),
 432             PROGNAME"288", fname_input);
 433         DtSearchExit (34);
 434     }
 435     /* Initialize the variables */
 436     pres = root_node;
 437     prev = pres;
 438     top_of_stack = NULL;
 439     stack = NULL;
 440
 441     while (not_done) {
 442         if (descend) {
 443             descend_tree ();
 444         }
 445         if (pres == root_node) {
 446             return;
 447         }
 448         if (prev->rlink == NULL) {
 449             write_to_file (prev);
 450             next = prev->llink;
 451             prev->llink = pres;
 452             pres = prev;
 453             prev = next;
 454             descend = FALSE;
 455         }
 456         else {
 457             if (prev->llink == NULL) {
 458                 next = prev->rlink;
 459                 prev->rlink = pres;
 460                 pres = prev;
 461                 prev = next;
 462                 descend = FALSE;
 463             }
 464             else {
 465                 if (prev == top_of_stack) {
 466                     next = stack;
 467                     top_of_stack = stack->rlink;
 468                     stack = stack->llink;
 469                     next->llink = NULL;
 470                     next->rlink = NULL;
 471                     next = prev->llink;
 472                     prev->llink = prev->rlink;
 473                     prev->rlink = pres;
 474                     pres = prev;
 475                     prev = next;
 476                     descend = FALSE;
 477                 }
 478                 else {
 479                     write_to_file (prev);
 480                     avail_node->llink = stack;
 481                     avail_node->rlink = top_of_stack;
 482                     stack = avail_node;
 483                     top_of_stack = prev;
 484                     next = prev->rlink;
 485                     prev->rlink = pres;
 486                     pres = next;
 487                     descend = TRUE;
 488                 }
 489             }
 490         }
 491     }
 492 } /* traverse_tree() */
 493
 494
 495
 496 /********************************************************/
 497 /*                                                      */
 498 /*                 print_usage_msg                      */
 499 /*                                                      */
 500 /********************************************************/
 501 static void     print_usage_msg (void)
 502 {
 503                     printf (catgets (dtsearch_catd, MS_cborodin, 17,
 504 "\n"
 505 "USAGE: %s -d<dbname> [options] <infile>\n"
 506 "       Listed default file name extensions can be overridden.\n"
 507 "  -d<dbname>  1 - 8 character database name, include optional path prefix.\n"
 508 "  -t<etxstr>  End of text document delimiter string.  Default '\\f\\n'.\n"
 509 "  -r<N>       Change Pass 1 records-per-dot from %d to <N>.\n"
 510 "  -b<N>       Change max batch size from %ld to <N>.\n"
 511 "  -c<N>       Change database paging cache from %ld 1K pages to <N> 1K pages.\n"
 512 "              <N> >= 16 by powers of 2.  Initially try only small changes.\n"
 513 "  -i<N>       Change (i)nput buffer size from default %d to <N>.\n"
 514 "  -h<N>       Change duplicate record id hash table size from %ld to <N>.\n"
 515 "              -h0 means there are no duplicates, do not check for them.\n"
 516 "  <infile>    Input [path]file name.  Default extension %s.\n"),
 517         aa_argv0,
 518         (int) RECS_PER_DOT,
 519         (long) BATCH_SIZE,  (long) CACHE_SIZE,
 520         (int) INBUFSZ,  default_hashsize,  EXT_FZKEY);
 521     return;
 522 } /* print_usage_msg() */
 523
 524
 525 /********************************************************/
 526 /*                                                      */
 527 /*                segregate_dicname                     */
 528 /*                                                      */
 529 /********************************************************/
 530 /* Separates dictionary name from pathname and loads
 531  * them into the globals 'dicname' and 'dicpath'.
 532  * Returns TRUE if dicname is valid, else returns FALSE.
 533  */
 534 static int      segregate_dicname (char *string)
 535 {
 536     char            mybuf[_POSIX_PATH_MAX];
 537     char           *ptr;
 538     int             i;
 539
 540     strncpy (mybuf, string, sizeof (mybuf));
 541     mybuf[sizeof (mybuf) - 1] = 0;
 542
 543     /*
 544      * Set 'ptr' to just the 8 char dictionary name by moving
 545      * it backwards until first non-alphanumeric character
 546      * (such as a ":" in the dos drive id or a slash between directories),
 547      * or to the beginning of string.
 548      */
 549     for (ptr = mybuf + strlen (mybuf) - 1; ptr >= mybuf; ptr--)
 550         if (!isalnum (*ptr)) {
 551             ptr++;
 552             break;
 553         }
 554     if (ptr < mybuf)
 555         ptr = mybuf;
 556
 557     /* test for valid dictionary name */
 558     i = strlen (ptr);
 559     if (i < 1 || i > 8)
 560         return FALSE;
 561
 562     strcpy (dicname, ptr);
 563     *ptr = 0;
 564     strncpy (dicpath, mybuf, sizeof (dicpath));
 565     dicpath[sizeof (dicpath) - 1] = 0;
 566     return TRUE;
 567 } /* segregate_dicname() */
 568
 569
 570 /********************************************************/
 571 /*                                                      */
 572 /*                 USER_ARGS_PROCESSOR                  */
 573 /*                                                      */
 574 /********************************************************/
 575 /* handles command line arguments for 'main' */
 576 void            user_args_processor (int argc, char **argv)
 577 {
 578     char           *argptr;
 579     char           *targ, *src;
 580     int             i;
 581
 582     if (argc <= 1) {
 583         print_usage_msg ();
 584         DtSearchExit (2);
 585     }
 586     /* Initialize some variables prior to parsing command line */
 587     dicname[0] = 0;
 588     dicpath[0] = 0;
 589
 590     /* Each pass grabs new parm of "-xxx" format */
 591     while (--argc > 0 && (*++argv)[0] == '-') {
 592         argptr = argv[0];
 593         switch (argptr[1]) {
 594
 595             case 't':           /* ETX delimiter string */
 596                 /* Replace any "\n" string with real linefeed */
 597                 targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
 598                 src = argptr + 2;
 599                 while (*src) {
 600                     if (src[0] == '\\' && src[1] == 'n') {
 601                         *targ++ = '\n';
 602                         src += 2;
 603                     }
 604                     else
 605                         *targ++ = *src++;
 606                 }
 607                 *targ = 0;
 608                 break;
 609
 610             case 'r':
 611                 if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
 612                     printf (catgets (dtsearch_catd, MS_cborodin, 577,
 613                         "%s Invalid arg '%s'.  Using default -r%d.\n"),
 614                         PROGNAME"577", argptr, RECS_PER_DOT);
 615                     recs_per_dot = RECS_PER_DOT;
 616                 }
 617                 break;
 618
 619             case 'h':
 620                 duprec_hashsize = atol (argptr + 2);
 621                 if (duprec_hashsize == 0UL)
 622                     printf (catgets (dtsearch_catd, MS_cborodin, 539,
 623                         "%s Duplicate record id checking disabled.\n"),
 624                         PROGNAME"539");
 625                 break;
 626
 627             case 'b':
 628                 batch_size = atol (argptr + 2);
 629                 if (batch_size <= 0L) {
 630                     printf (catgets (dtsearch_catd, MS_cborodin, 595,
 631                         "%s Invalid batch size argument '%s'.\n"),
 632                         PROGNAME"595", argptr);
 633                     goto BADPARM;
 634                 }
 635                 break;
 636
 637             case 'c':
 638                 cache_size = atoi (argptr + 2);
 639                 if (cache_size < 16) {
 640                     /* minimum size is 16 */
 641                     if (cache_size > 0)
 642                         cache_size = 16;
 643                     /* on error reset size to default */
 644                     else
 645                         cache_size = CACHE_SIZE;
 646 CACHE_ADJUSTED:
 647                     printf (catgets (dtsearch_catd, MS_cborodin, 600,
 648                             "%sCache size readjusted to %d.\n"),
 649                         PROGNAME "600 ", cache_size);
 650                     break;
 651                 }
 652                 /* If necessary, round up to nearest power of 2 */
 653                 for (i = 4; i < 12; i++)
 654                     if (1 << i >= cache_size)
 655                         break;
 656                 i = 1 << i;
 657                 if (i != cache_size) {
 658                     cache_size = i;
 659                     goto CACHE_ADJUSTED;
 660                 }
 661                 break;
 662
 663             case 'D':           /* unadvertised debugging feature */
 664                 for (i = 2;  argptr[i] != 0;  i++) {
 665                     switch (argptr[i]) {
 666                         case 'I':       debugging |= DEBUG_I;  break;
 667                         case 'P':       debugging |= DEBUG_P;
 668                                 /******* debugging_teskey = TRUE; ******/
 669                                         break;
 670                         case 'N':       debugging |= DEBUG_N;  break;
 671                         case 'n':       debugging |= DEBUG_n;  break;
 672                         case 'O':       debugging |= DEBUG_O;  break;
 673                         case 'o':       debugging |= DEBUG_o;  break;
 674                         case 'T':       debugging |= DEBUG_T;  break;
 675                         case 't':       debugging |= DEBUG_t;  break;
 676                         default:        goto BADPARM;
 677                     }
 678                 }
 679                 break;
 680
 681             case 'd':
 682                 /* May include both dicname and dicpath */
 683                 if (!segregate_dicname (argptr + 2)) {
 684                     printf (catgets (dtsearch_catd, MS_cborodin, 550,
 685                         "%s '%s' is invalid path/database name.\n"),
 686                         PROGNAME"550", argptr);
 687                     goto BADPARM;
 688                 }
 689                 break;
 690
 691             case 'i':           /* (I)nput buffer size */
 692                 if ((inbufsz = atol (argptr + 2)) <= 0) {
 693                     printf (catgets (dtsearch_catd, MS_cborodin, 558,
 694                         "%s Invalid input buffer size '%s'.\n"),
 695                         PROGNAME"558", argptr);
 696                     goto BADPARM;
 697                 }
 698                 break;
 699
 700             default:
 701                 printf (catgets (dtsearch_catd, MS_cborodin, 567,
 702                     "%s Unknown command line argument '%s'.\n"),
 703                     PROGNAME"567", argptr);
 704 BADPARM:
 705                 print_usage_msg ();
 706                 DtSearchExit (2);       /* abort */
 707
 708         }                       /* endswitch */
 709     }                           /* endwhile for cmd line '-'processing */
 710
 711     /* Validate input file name */
 712     if (argc-- <= 0) {
 713         printf (catgets (dtsearch_catd, MS_cborodin, 580,
 714             "%s Missing required input file name.\n"),
 715             PROGNAME"580");
 716         goto BADPARM;
 717     }
 718     /* Don't incr argv yet--save input file name */
 719     else
 720         append_ext (fname_input, _POSIX_PATH_MAX, argv[0], EXT_FZKEY);
 721
 722     /* Check for missing database name */
 723     if (dicname[0] == 0) {
 724         printf (catgets (dtsearch_catd, MS_cborodin, 589,
 725             "%s No database name specified (-d argument).\a\n"),
 726             PROGNAME"589");
 727         goto BADPARM;
 728     }
 729     strcpy (dblk.name, dicname);
 730     dblk.path = dicpath;
 731     return;
 732 } /* user_args_processor() */
 733
 734
 735 /****************************************/
 736 /*                                      */
 737 /*      put_addrs_2_dtbs_addr_file      */
 738 /*                                      */
 739 /****************************************/
/* Subroutine of write_2_dtbs_addr_file() from Pass 2.
 741  * That function has used a bit vector to determine
 742  * the total change in old d99 addrs for preexisting words,
 743  * and prepared for writing an array of old dbas that
 744  * are not in the current words tree node (globally named
 745  * word_addrs_ii [num_addrs]).
 746  * The addrs that ARE in the Pass 1 node fzk file were previously
 747  * prepared in a similar array of dbas, globally named
 748  * record_addr_word [num_addrs_for_word] but passed here as
 749  * 'addrs_array' and 'nitems'.
 750  * Both arrays will be byte swapped from 'host' to
 751  * 'network' order in this function.
 752  * This function does the actual fwrite of both arrays to the d99.
 753  * If the number of new addrs can fit in the available free slots,
 754  * it rewrites to original offset, otherwise appends to end of d99.
 755  */
 756 static void     put_addrs_2_dtbs_addr_file (
 757                     DB_ADDR     *addrs_array,
 758                     DtSrINT32   nitems)
 759 {
 760     FREE_SPACE_STR      *free_slot;
 761     FREE_SPACE_STR      del_rec;
 762     DtSrINT32           int32;
 763     DtSrINT32           num_writes;
 764     DtSrINT32           num_addrs;
 765
 766     if (nitems >= batch_size) {
 767         printf ( catgets(dtsearch_catd, MS_cborodin, 6,
 768             "put_addrs_2_dtbs_addr_file() nitems=%d, batchsz=%ld\n") ,
 769             (int)nitems, (long)batch_size);
 770         DtSearchExit (58);
 771     }
 772
 773     num_addrs = got_word.or_hwaddrs;
 774     got_word.or_hwaddrs += nitems;  /** somehow, this can exceed total
 775         **** num addrs in database by 1 (!?) ******/
 776         /* (...only if prev 'overlay/compression' didn't delete all) */
 777
 778 #ifdef BYTE_SWAP
 779         /* Put both arrays in 'network' byte order */
 780         for (int32 = 0;  int32 < nitems;  int32++)
 781             HTONL (addrs_array[int32]);
 782         for (int32 = 0;  int32 < num_addrs;  int32++)
 783             HTONL (word_addrs_ii[int32]);
 784 #endif
 785
 786     /*
 787      * If number of new addresses greater than number of free holes,
 788      * find new free slot that is big enough to hold the data .
 789      */
 790     if (nitems > got_word.or_hwfree) {
 791         /* Discard old slot, find new one. */
 792         del_rec.hole_size = num_addrs + got_word.or_hwfree;
 793         del_rec.offset = got_word.or_hwoffset;
 794         free_slot = find_free_space (got_word.or_hwaddrs, &fl_hdr);
 795         add_free_space (&del_rec, &fl_hdr);
 796         if (free_slot == NULL) {
 797             fseek (dtbs_addr_fp, 0L, SEEK_END);
 798             got_word.or_hwoffset = ftell (dtbs_addr_fp);
 799             got_word.or_hwfree = 0;
 800         }
 801         else {
 802             fseek (dtbs_addr_fp, free_slot->offset, SEEK_SET);
 803             got_word.or_hwoffset = free_slot->offset;
 804             got_word.or_hwfree = free_slot->hole_size -
 805                 got_word.or_hwaddrs;
 806         }
 807         /*----- Write new database addresses to a file -----*/
 808         num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
 809                 (size_t)nitems, dtbs_addr_fp);
 810         if (num_writes != nitems) {
 811             DtSearchExit (98);
 812         }
 813
 814         /* Copy the old addresses immediately after the new ones */
 815         num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR), (size_t)num_addrs,
 816             dtbs_addr_fp);
 817         if (num_writes != num_addrs) {
 818             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 819                 PROGNAME"776", strerror(errno));
 820             DtSearchExit (76);
 821         }
 822
 823         /* Write foxes to the free holes, if any, no byte swap */
 824         for (int32 = 0;  int32 < got_word.or_hwfree;  int32++)
 825             addrs_array [int32] = 0xFFFFFFFF;
 826         num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
 827             (size_t)got_word.or_hwfree, dtbs_addr_fp);
 828         if (num_writes != got_word.or_hwfree) {
 829             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 830                 PROGNAME"786", strerror(errno));
 831             DtSearchExit (86);
 832         }
 833     } /* end if (nitems > got_word.or_hwfree), had to get bigger slot */
 834
 835     /* Else can reuse existing slot.
 836      * Write the new addresses into free holes.
 837      * The remaining free holes should already have foxes. (?)
 838      */
 839     else {
 840         fseek (dtbs_addr_fp, got_word.or_hwoffset, SEEK_SET);
 841         num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
 842                 (size_t)nitems, dtbs_addr_fp);
 843         if (num_writes != nitems) {
 844             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 845                 PROGNAME"798", strerror(errno));
 846             DtSearchExit (87);
 847         }
 848         /* Copy the old addresses immediately after the new ones */
 849         num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR),
 850                 (size_t)num_addrs, dtbs_addr_fp);
 851         if (num_writes != num_addrs) {
 852             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 853                 PROGNAME"889", strerror(errno));
 854             DtSearchExit (89);
 855         }
 856         got_word.or_hwfree -= nitems;
 857     }
 858 } /* put_addrs_2_dtbs_addr_file() */
 859
 860
 861 /****************************************/
 862 /*                                      */
 863 /*       write_2_dtbs_addr_file         */
 864 /*                                      */
 865 /****************************************/
 866 /* Subroutine of fill_data1() from Pass 2.
 867  * Updates OLD (preexisting) word's d99 file.
 868  *
 869  * The vista word rec has already been read into global 'got_word'.
 870  * record_addr_word [num_addrs_for_word] is the array of dba's
 871  * for docs from this batch that contain the current word (built by
 872  * fill_data1 from the dba_list for the word's Pass 1 binary tree node,
 873  * and still in 'host' byte swap order).
 874  * This function freads all the old addresses for that word from
 875  * the d99 file.  It then deletes(!) d99 addrs that
 876  * are in the word's Pass 1 tree node.  It then calls
 877  * put_addrs_2_dtbs_addr_file() to fwrite out the
 878  * dba's in the tree, which are either brand new,
 879  * or are 'updating' the deleted addrs.
 880  * Then it writes the modified old addrs.
 881  * Then rewrites vista word rec with new data.
 882  *
 883  * The bit vector dbas_bits_batch contains a 1 bit
 884  * for every dba for every doc in the fzk file.
 885  * got_word structure:
 886  * .or_hwordkey - the word. (always in a 'huge' word buffer).
 887  * .or_hwoffset - offset in a d99 inverted index file for
 888  *                    a given word. the first address starts
 889  *                    at this position.
 890  * .or_hwaddrs - total number of addresses for a given word.
 891  * .or_hwfree - number of free slots in a database
 892  *                       addresses file for a given word.
 893  */
 894 void            write_2_dtbs_addr_file (void)
 895 {
 896     DtSrINT32           num_addrs_ii;
 897     DtSrINT32           num_reads;
 898     DtSrINT32           i_start, k, cur_ind;
 899     DtSrINT32           num_delete_addrs = 0;
 900     char                addrs_removed = FALSE;
 901     register DtSrINT32  i;
 902     register DtSrINT32  cur_byte;
 903     register char       bit_addrs;
 904     register DB_ADDR    temp1;
 905
 906     if (debugging & DEBUG_O)
 907         printf ("  old vis '%s' ofs=%ld adr=%ld fre=%ld\n",
 908             displayable(got_word.or_hwordkey),
 909             (long) got_word.or_hwoffset,
 910             (long) got_word.or_hwaddrs,
 911             (long) got_word.or_hwfree);
 912
 913     num_addrs_ii = got_word.or_hwaddrs;
 914     if (num_addrs_ii > or_reccount) {
 915         printf (catgets (dtsearch_catd, MS_cborodin, 713,
 916             "\n%s Word '%s' occurs in %ld records,\n"
 917             "  but there are only %ld records in database!\n"
 918             "  (This may be a good candidate for the stoplist).\n"),
 919             PROGNAME"713",
 920             (long) got_word.or_hwordkey,
 921             (long) num_addrs_ii,
 922             (long) or_reccount);
 923         DtSearchExit (68);
 924     }
 925
 926     if (fseek (dtbs_addr_fp, (long) got_word.or_hwoffset, SEEK_SET) != 0)
 927         {
 928         printf (catgets (dtsearch_catd, MS_cborodin, 875,
 929             "\n%s Could not fseek d99 file to offset %ld.\n"),
 930             PROGNAME"875", got_word.or_hwoffset);
 931         DtSearchExit (98);
 932         }
 933     num_reads = fread (word_addrs_ii, sizeof(DB_ADDR),
 934         (size_t)num_addrs_ii, dtbs_addr_fp);
 935     if (num_reads != num_addrs_ii) {
 936         printf (catgets (dtsearch_catd, MS_cborodin, 848,
 937             "\n%s Could not fread %ld bytes (%ld dba's) of d99 file\n"
 938             "  at offset %ld.  Number of dba's read (return code) = %ld.\n"),
 939             PROGNAME"848", sizeof(DB_ADDR) * num_addrs_ii, (long)num_addrs_ii,
 940             (long)got_word.or_hwoffset, (long)num_reads);
 941         DtSearchExit (98);
 942     }
 943 #ifdef BYTE_SWAP
 944     for (i = 0; i < num_addrs_ii; i++)
 945         NTOHL (word_addrs_ii[i]);
 946     /* Now both addr arrays are in 'host' byte swap order */
 947 #endif
 948
 949     /* If there are only new docs,
 950      * this switch will prevent the checking for updates.
 951      */
 952     if (check_existing_addrs) {
 953         i_start = 0;
 954
 955         /* Loop on every preexisting dba for word as read from d99 */
 956         for (i = 0; i < num_addrs_ii; i++) {
 957             if (debugging & DEBUG_o)
 958                 printf ("  old d99 %ld: x%lx(%ld:%ld)",
 959                     (long) i,
 960                     (long) word_addrs_ii[i],
 961                     (long) word_addrs_ii[i] >> 8,
 962                     (long) word_addrs_ii[i] & 0xffL);
 963
 964             /* Get 'record number' by shifting hi 3 bytes 1 byte (8 bits)
 965              * to right over stat wt byte.  D99 rec#'s start at 1,
 966              * so subtract 1 to start at 0 for bit vector.
 967              */
 968             temp1 = (*(word_addrs_ii + i) >> 8) - 1;    /* = rec#, base 0 */
 969             cur_byte = temp1 >> 3;      /* get matching byte# in bit vector */
 970             if (cur_byte >= bit_vector_size) {
 971                 printf ( catgets(dtsearch_catd, MS_cborodin, 9,
 972                     "\n%s Corrupted d99 file for word '%s',\n"
 973                     " database address %ld @ file position %ld => bitvector[%ld],"
 974                     " but max bitvector allocation = %ld.\n") ,
 975                     PROGNAME"727", displayable(got_word.or_hwordkey),
 976                     (long)temp1, (long)i,
 977                     (long)cur_byte, (long)bit_vector_size);
 978                 DtSearchExit (69);
 979             }
 980             bit_addrs = 0;
 981             bit_addrs |= 1 << (temp1 % 8);      /* bit mask */
 982             /*
 983              * If this dba, which is on the current word's old d99
 984              * addrs list, is also a doc in the fzk file (dbas_bits_batch),
 985              * delete it from the d99 list by writing subsequent dba's
 986              * over it.  Boy this recursive nested loop has gotta be slow.
 987              * Faster algorithm?  Add 'good' addrs to the end of
 988              * record_addr_word[].  No nested overlay loop, only one write!
 989              */
 990             if (bit_addrs & (*(dbas_bits_batch + cur_byte))) {
 991                 addrs_removed = TRUE;
 992                 num_delete_addrs++;
 993                 if (i_start == 0) {
 994                     cur_ind = i;
 995                     i_start = i + 1;
 996                 }
 997                 else {
 998                     if (i_start < i) {
 999                         /* compress: move good addrs over
1000                          * space of deleted ones */
1001                         for (k = i_start; k < i; k++) {
1002                             word_addrs_ii[cur_ind] = word_addrs_ii[k];
1003                             cur_ind++;
1004                         }
1005                     }
1006                     i_start = i + 1;
1007                 }
1008             } /* end if where dba is on both fzk list and curr d99 */
1009         } /* end loop on every d99 addr for this word */
1010
1011         if (addrs_removed) {    /* final overlay compression */
1012             if (i_start < i) {
1013                 /* compress: move good addrs over
1014                  * space of deleted ones */
1015                 for (k = i_start; k < i; k++) {
1016                     word_addrs_ii[cur_ind] = word_addrs_ii[k];
1017                     cur_ind++;
1018                 }
1019             }
1020         }
1021     } /* end if (check_existing_addrs) */
1022
1023     got_word.or_hwaddrs -= num_delete_addrs;
1024     got_word.or_hwfree += num_delete_addrs;
1025
1026     /* The old dba array word_addrs_ii[] is now 'compressed',
1027      * it contains only addrs not in fzk file.
1028      * And the vista rec 'got_word' now matches it.
1029      * And record_addr_word[] still contains
1030      * the new/updated addrs from the fzk file.
1031      * Now Efim calls a func to write them both back out to d99 file.
1032      */
1033     put_addrs_2_dtbs_addr_file (record_addr_word, num_addrs_for_word);
1034     write_wordstr (&got_word, 0);       /* update vista WORD rec */
1035
1036     return;
1037 } /*  write_2_dtbs_addr_file() */
1038
1039
1040 /********************************/
1041 /*                              */
1042 /*      write_new_word_2_dtbs   */
1043 /*                              */
1044 /********************************/
1045 /* Subroutine of fill_data1() in Pass 2 for a NEW word.
1046  * Writes d99 data, and updates (empty) got_word vista record.
1047  * record_addr_word [num_addrs_for_word] is the array of addrs
1048  * for docs from this batch that contain the current word (built by
1049  * fill_data1 from the dba_list for the word's Pass 1 binary tree node).
1050  * It will be byte swapped from 'host' to 'network' order in this function.
1051  */
1052 void            write_new_word_2_dtbs (void)
1053 {
1054     FREE_SPACE_STR *free_slot;
1055     DtSrINT32   num_writes;
1056     int             ret_fseek;
1057     DtSrINT32   int32;
1058
1059     if (debugging & (DEBUG_n  | DEBUG_N))
1060         printf ("  new word '%s', adrs=%ld,",
1061             got_word.or_hwordkey, (long)num_addrs_for_word);
1062
1063     free_slot = find_free_space (num_addrs_for_word, &fl_hdr);
1064     if (free_slot == NULL) {
1065         /* append addrs to end of d99 file */
1066         ret_fseek = fseek (dtbs_addr_fp, 0L, SEEK_END);
1067         got_word.or_hwoffset = ftell (dtbs_addr_fp);
1068         got_word.or_hwfree = 0;
1069         if (debugging & (DEBUG_n  | DEBUG_N))
1070             printf ("APPEND ofs=%ld, fre=0\n", got_word.or_hwoffset);
1071     }
1072     else {
1073         ret_fseek = fseek (dtbs_addr_fp,
1074                 (long)free_slot->offset, SEEK_SET);
1075         got_word.or_hwoffset = free_slot->offset;
1076         got_word.or_hwfree = free_slot->hole_size -
1077             num_addrs_for_word;
1078         if (debugging & (DEBUG_n  | DEBUG_N))
1079             printf (" REUSE slot ofs=%ld, fre=%ld\n",
1080                 got_word.or_hwoffset, got_word.or_hwfree);
1081     }
1082
1083     /***** Write new database addresses to d99 file *********/
1084     if (debugging & DEBUG_n) {
1085         for (int32 = 0;  int32 < num_addrs_for_word;  int32++) {
1086             printf ("     dba #%ld: x%lx(%ld:%ld)\n",
1087                 (long)int32,
1088                 (long)record_addr_word[int32],
1089                 (long)record_addr_word[int32] >> 8,
1090                 (long)record_addr_word[int32] & 0xffL);
1091         }
1092     }
1093 #ifdef BYTE_SWAP
1094         /* Put addr array in 'network' byte order */
1095         for (int32 = 0;  int32 < num_addrs_for_word;  int32++)
1096             HTONL (record_addr_word[int32]);
1097 #endif
1098     num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
1099         (size_t)num_addrs_for_word, dtbs_addr_fp);
1100     if (num_writes != num_addrs_for_word)
1101         DtSearchExit (97);
1102
1103     got_word.or_hwaddrs = num_addrs_for_word;
1104
1105     if (got_word.or_hwfree != 0) {
1106         /* Fill unused free holes with foxes for debugging.
1107          * Note that byte swap is unnecessary for foxes.
1108          * Note that record_addr_word is now available for this action.
1109          */
1110         for (int32 = 0;  int32 < got_word.or_hwfree;  int32++)
1111             *(record_addr_word + int32) = 0xFFFFFFFF;
1112         num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
1113             (size_t)got_word.or_hwfree, dtbs_addr_fp);
1114         if (num_writes != got_word.or_hwfree) {
1115             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
1116                 PROGNAME"960", strerror(errno));
1117             DtSearchExit (96);
1118         }
1119     }
1120
1121     /* Save changed word_info structure back to the vista database! */
1122     write_wordstr (&got_word, 0);
1123     return;
1124 } /* write_new_word_2_dtbs() */
1125
1126
1127 /************************/
1128 /*                      */
1129 /*      fill_data1      */
1130 /*                      */
1131 /************************/
1132 /* Called from write_to_file() in Pass 2.
1133  * Write_to_file() is 'visit node' function of tree traversal.
1134  * It has converted dbalist in each word node in tree to
1135  * array of dbas (record_addr_word [num_addrs_for_word])
1136  * with correct statistical weighting, still in 'host' byte swap order.
1137  * This function seeks word key in database.  If word is new,
1138  * it calls functions to write new vista rec and d99 data.
1139  * If word is old it calls functions to read word rec and update d99.
1140  */
1141 void            fill_data1 (char *node_word)
1142 {
1143     char            miker[1024];
1144     strcpy (miker, node_word);
1145
1146     count_word_ii++;
1147     if (shutdown_now) {
1148         printf (catgets (dtsearch_catd, MS_cborodin, 164,
1149             "\n%s Abort due to signal %d.  Database %s\n"
1150             "  probably corrupted.  Restore backup database.\n"),
1151             PROGNAME"164", shutdown_now, dicname);
1152         DtSearchExit (10);
1153     }
1154
1155     /* print occasional progress dots and msgs */
1156     if (!(count_word_ii % words_per_dot)) {
1157         putchar ('.');
1158         dotcount++;
1159         if (!(dotcount % 10))
1160             putchar (' ');
1161         if (dotcount >= 50) {
1162             dotcount = 0;
1163             seconds_left = (unsigned long)
1164                 (((float) num_of_diff_words /
1165                     (float) count_word_ii - 1.) *
1166                 (float) (time (NULL) - timestart));
1167             printf (catgets (dtsearch_catd, MS_cborodin, 849,
1168                 "\n%s: Word #%ld, %.0f%% done.  Est %lum %02lus "
1169                 "to completion.\n"),
1170                 aa_argv0, count_word_ii,
1171                 (float) count_word_ii / (float) num_of_diff_words * 100.0,
1172                 /***(count_word_ii * 100L) / num_of_diff_words,***/
1173                 seconds_left / 60L, seconds_left % 60L);
1174         }
1175         else
1176             fflush (stdout);
1177     }   /* endif for progress dots and msgs */
1178
1179     strncpy (got_word.or_hwordkey, node_word, DtSrMAXWIDTH_HWORD);
1180     got_word.or_hwordkey[DtSrMAXWIDTH_HWORD - 1] = 0;
1181     find_keyword (miker, 0);    /* vista KEYFIND for word rec */
1182     if (db_status == S_NOTFOUND) {      /* this is a NEW word */
1183         got_word.or_hwoffset = 0;
1184         got_word.or_hwfree = 0;
1185         got_word.or_hwaddrs = 0;
1186         fillnew_wordrec (&got_word, 0); /* write (empty) vista word rec */
1187         if (db_status != S_OKAY)
1188             vista_abort (PROGNAME"981");
1189         write_new_word_2_dtbs();        /* write NEW word's d99 entries
1190                                          * and update vista word rec */
1191         return;
1192     }
1193
1194     /* update previously existing word */
1195     read_wordstr (&got_word, 0);        /* read OLD word rec into got_word */
1196     if (db_status == S_OKAY)
1197         write_2_dtbs_addr_file();       /* update OLD word's d99 entries
1198                                          * and update vista word rec */
1199     return;
1200 }       /* fill_data1() */
1201
1202
1203 /************************************************/
1204 /*                                              */
1205 /*              load_into_bintree               */
1206 /*                                              */
1207 /************************************************/
1208 /* Pass 1 function.
1209  * Loads parsed word token or stem token into
1210  * inverted index binary tree along with passed dba.
1211  * Token is allowed to be empty, ie first byte is \0.
1212  * Derived from Efim's original 'teskey_parse()'
1213  * and bin_tree() functions.
1214  * Variables static for speeeeeeed.
1215  */
1216 static void     load_into_bintree (
1217                         char    *parser_token,
1218                         int     token_is_stem,
1219                         DB_ADDR dba)
1220 {
1221     static DtSrINT16    or_maxwordsz;
1222     static char         *cptr;
1223     static int          i;
1224     static TREENODE     **this_link;
1225     static TREENODE     *newnode;
1226     static DBALIST      *newdba;
1227     static char         *tokbuf =       NULL;
1228
1229     if (*parser_token == 0) {
1230         if (debugging & DEBUG_I)
1231             printf (" bintr=<empty> dba=%ld\n", (long)dba);
1232         return;
1233     }
1234
1235     /* Copy token to a buffer.
1236      * Stems have a special prefix character
1237      * to distinguish them from words.
1238      * Also increment total dba word count.
1239      */
1240     if (tokbuf == NULL) {
1241         or_maxwordsz = dblk.dbrec.or_maxwordsz;
1242         tokbuf = austext_malloc ((size_t) or_maxwordsz + 4,
1243             PROGNAME"1152", NULL);
1244     }
1245     if (token_is_stem) {
1246         tokbuf[0] = STEM_CH;
1247         strncpy (tokbuf + 1, parser_token, (size_t)or_maxwordsz);
1248         dbas_word_count[dba]++;
1249     }
1250     else
1251         strncpy (tokbuf, parser_token, (size_t)or_maxwordsz);
1252     tokbuf [or_maxwordsz] = 0;
1253     if (debugging & DEBUG_I)
1254         printf (" bintr='%s' dba=%ld ", displayable(tokbuf), (long)dba);
1255
1256     /* TREE TRAVERSAL.  Search binary tree to find either
1257      * insertion point or identical preexisting token.
1258      */
1259     for (this_link = &root_node; *this_link != NULL; ) {
1260         i = strcmp (tokbuf, (*this_link)->word);
1261
1262         /* If identical word/stem token already exists... */
1263         if (i == 0) {
1264             /* If token appears more than once in current
1265              * document (dba already exists at top of dba list),
1266              * just increment the word count in the list.
1267              */
1268             if ((*this_link)->dba_list->dba == dba)
1269                 (*this_link)->dba_list->w_c++;
1270
1271             /* If this is first appearance of token for this doc
1272              * (dba is not at start of token's dba list),
1273              * insert dba at start of token's dba list.
1274              */
1275             else {
1276                 if ((newdba = malloc (sizeof(DBALIST))) == NULL) {
1277                     printf (catgets (dtsearch_catd, MS_cborodin, 374,
1278                         msg_374), PROGNAME"1150");
1279                     DtSearchExit (26);
1280                 }
1281                 newdba->dba =             dba;
1282                 newdba->w_c =             1;
1283                 newdba->next_dba =        (*this_link)->dba_list;
1284                 (*this_link)->dba_list =  newdba;
1285             }
1286             if (debugging & DEBUG_I)
1287                 printf (" Old %ld=%ld\n",
1288                     (long)((*this_link)->dba_list->dba),
1289                     (long)((*this_link)->dba_list->w_c));
1290             return;     /* done with token */
1291
1292         } /* endif where token was found in binary tree */
1293
1294         /* Increment link ptr by descending to correct subtree */
1295         if (i < 0) {
1296             this_link = &(*this_link)->llink;
1297             if (debugging & DEBUG_I)
1298                 putchar ('L');
1299         }
1300         else {
1301             this_link = &(*this_link)->rlink;
1302             if (debugging & DEBUG_I)
1303                 putchar ('R');
1304         }
1305     } /* end tree traversal */
1306
1307     /* Tree traversal never found a preexisting token node.
1308      * Create a new node and insert it at the point
1309      * indicated by link ptr.
1310      */
1311     newnode = austext_malloc (sizeof(TREENODE) + strlen(tokbuf) + 4,
1312         PROGNAME"1234", NULL);
1313     newnode->llink =    NULL;
1314     newnode->rlink =    NULL;
1315     newnode->word = (char *) (newnode + 1);     /* use mem at end of node */
1316     strcpy (newnode->word, tokbuf);
1317
1318     newdba = austext_malloc (sizeof(DBALIST), PROGNAME"1235", NULL);
1319     newnode->dba_list = newdba;
1320     newdba->dba =       dba;
1321     newdba->w_c =       1;
1322     newdba->next_dba =  NULL;
1323
1324     *this_link =        newnode;
1325     num_of_diff_words++;
1326
1327     if (debugging & DEBUG_I)
1328         printf (" New %ld=%ld\n",
1329             (long)((*this_link)->dba_list->dba),
1330             (long)((*this_link)->dba_list->w_c));
1331     return;
1332 } /* load_into_bintree() */
1333
1334
1335 /**********************************************/
1336 /*                                            */
1337 /*                    MAIN                    */
1338 /*                                            */
1339 /**********************************************/
1340 main (int argc, char **argv)
1341 {
1342     int                 i;
1343     long                word_offset;    /* <-- PARG.offsetp */
1344     long                bytes_in;       /* ftell() */
1345     DtSrINT32           dba_offset;
1346     int                 got_ETX;
1347     char                *cptr, *src;
1348     char                temp_buf[40];
1349     char                db_key [DtSrMAX_DB_KEYSIZE + 2];
1350     int                 oops = FALSE;
1351     register DtSrINT32  cur_byte;
1352     struct tm           *tmptr;
1353     DB_ADDR             dba, temp_dba;
1354     time_t              elapsed;
1355     size_t              mallocsz;
1356     char                *parsebufp, *stembufp;
1357
1358     /******************* INITIALIZE ******************/
1359     setlocale (LC_ALL, "");
1360     dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
1361
1362     aa_argv0 = strdup (argv[0]);
1363     time (&elapsed);
1364     tmptr = localtime (&elapsed);
1365     strftime (buf, sizeof(buf),
1366         catgets (dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
1367         tmptr);
1368     printf (catgets (dtsearch_catd, MS_cborodin, 1, "%s.  Run %s.\n"),
1369         aa_argv0, buf);
1370     austext_exit_last = print_exit_code;
1371     batch_size = BATCH_SIZE;
1372     init_user_interrupt ();
1373     default_hashsize = duprec_hashsize;
1374
1375     memset (&dblk, 0, sizeof(DBLK));
1376
1377     memset (&parg, 0, sizeof(PARG));
1378     parg.dblk =         &dblk;
1379     parg.etxdelim =     ETXDELIM;       /* default, can be changed */
1380     parg.offsetp =      &word_offset;
1381     parg.flags |=       PA_INDEXING;    /* do compounding, if parser can */
1382
1383     /* Read user specified command line arguments */
1384     user_args_processor (argc, argv);
1385
1386     /* Finish init now that we know final values */
1387     inbuf = austext_malloc (inbufsz + 16, PROGNAME"1349", NULL);
1388     temp = austext_malloc (inbufsz + 16, PROGNAME"1285", NULL);
1389     sprintbuffer = austext_malloc (inbufsz + _POSIX_PATH_MAX + 16,
1390         PROGNAME"1286", NULL);
1391     record_addr_word = austext_malloc ((sizeof(DB_ADDR) * batch_size) + 16,
1392         PROGNAME "1133", NULL);
1393
1394     /* Save dicname and path in dblk.  Save full name of d99 file. */
1395     strcpy (dblk.name, dicname);
1396     dblk.path = dicpath;
1397     strcpy (dtbs_addr_file, dicpath);
1398     strcat (dtbs_addr_file, dicname);
1399     strcat (dtbs_addr_file, EXT_DTBS);
1400
1401     /* Open the database */
1402     if (!austext_dopen (dicname, dicpath, NULL, cache_size, &dbrec)) {
1403         fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
1404         DtSearchExit (3);
1405     }
1406     memcpy (&dblk.dbrec, &dbrec, sizeof(DBREC));
1407
1408     /* Load database's parser, stemmer, and linguistic files into dblk. */
1409     if (!load_language (&dblk, NULL)) {
1410         puts (DtSearchGetMessages());
1411         printf (catgets (dtsearch_catd, MS_cborodin, 1097,
1412             "%s Aborting due to errors in loading language files.\n"),
1413             PROGNAME"1097");
1414         DtSearchExit(3);
1415     }
1416
1417     RECFRST (PROGNAME "1067", OR_OBJREC, 0);
1418     CRGET (PROGNAME "1069", &dba, 0);  /* byte swap already done in vista */
1419
1420     or_reccount = dbrec.or_reccount;    /* DtSrINT32 */
1421     or_recslots = dbrec.or_recslots;    /* promoted to DtSrINT32 */
1422     or_maxdba = dbrec.or_maxdba;        /* DtSrINT32 lim of dbas_word_count */
1423     bit_vector_size = ((or_maxdba / or_recslots + 1) >> 3) + 1; /* DtSrINT32 */
1424     dba_offset = or_recslots - (dba & 0x00FFFFFF);      /* DtSrINT32 */
1425
1426     if (debugging)
1427         printf (PROGNAME"1286 "
1428             "realnumrec=%ld recslots=%ld bitvecsz=%ld"
1429             " dbaoffset=%d maxdba=%ld\n",
1430             (long)or_reccount, (long)or_recslots, (long)bit_vector_size,
1431             (int)dba_offset, (long)or_maxdba);
1432
1433     /* Allocate memory space for the arrays.
1434      * dbas_bits_batch = 'bit vector', one bit for every possible rec#.
1435      *   the 1 bits = only the dba's that are in this fzk batch.
1436      * word_addrs_ii = fread buffer for d99 file.
1437      * dbas_word_count = summing bkts for word count statistics.
1438      */
1439     dbas_bits_batch = (char *) austext_malloc ((size_t)bit_vector_size + 48,
1440         PROGNAME "1150", NULL);
1441     word_addrs_ii = (DB_ADDR *) austext_malloc (
1442         sizeof (DB_ADDR) * (or_reccount + 1) + 48,
1443         PROGNAME "1152", NULL);
1444     mallocsz = sizeof(DtSrINT32) * (or_maxdba + 1) + 48;
1445     dbas_word_count = (DtSrINT32 *) austext_malloc (mallocsz,
1446         PROGNAME "1154", NULL);
1447     memset (dbas_bits_batch, 0, (size_t)bit_vector_size + 48);
1448     memset (dbas_word_count, 0, mallocsz);
1449
1450     root_node = NULL;
1451
1452    /* Open the d99 file that contains database addresses.
1453     * If the file doesn't exist, it means the database
1454     * for keyword search is empty - open a new file.
1455     */
1456     if ((dtbs_addr_fp = fopen (dtbs_addr_file, "r+b")) == NULL) {
1457         dtbs_addr_fp = fopen (dtbs_addr_file, "w+b");
1458         check_existing_addrs = FALSE;
1459         new_dtbs_file = TRUE;
1460         if (dtbs_addr_fp == NULL) {
1461             /* msg 1068 used multiple places */
1462             printf (catgets (dtsearch_catd, MS_cborodin, 1068,
1463                 "%s Can't open new inverted index file '%s': %s\n"),
1464                 PROGNAME"1068", dtbs_addr_file, strerror(errno));
1465             DtSearchExit (13);
1466         }
1467         /* write New Header Information to a file */
1468         init_header (dtbs_addr_fp, &fl_hdr);
1469     }
1470     else {
1471         /* read Header Information from d99 file */
1472         if (!fread_d99_header (&fl_hdr, dtbs_addr_fp)) {
1473             /* msg 1068 used multiple places */
1474             printf (catgets (dtsearch_catd, MS_cborodin, 1068,
1475                 "%s Can't read header data for '%s': %s\n"),
1476                 PROGNAME"1422", dtbs_addr_file, strerror(errno));
1477             DtSearchExit (13);
1478         }
1479     }
1480
1481     /* open input .fzk file */
1482     src = getcwd (sprintbuffer, _POSIX_PATH_MAX);
1483     if (!src && debugging)
1484         printf (PROGNAME"1336 Can't getcwd: %s.\n", strerror(errno));
1485     if (!src)
1486         src = getenv ("PWD");
1487     printf (catgets (dtsearch_catd, MS_misc, 24,
1488         "%s: current working directory = '%s', .fzk file = '%s'\n"),
1489         aa_argv0,
1490         (src) ? src : catgets (dtsearch_catd, MS_misc, 6, "<unknown>"),
1491         fname_input);
1492     if ((instream = fopen (fname_input, "rt")) == NULL) {
1493 BAD_INPUT_FILE:
1494         printf (catgets (dtsearch_catd, MS_cborodin, 1083,
1495             "%s Can't read input file '%s': %s\n"),
1496             PROGNAME"1083", fname_input, strerror(errno));
1497         DtSearchExit (14);
1498     }
1499     if (fstat (fileno (instream), &fstat_input) == -1)
1500         goto BAD_INPUT_FILE;
1501     parg.ftext = instream;      /* for readchar_ftext(), discard_to_ETX() */
1502
1503     time (&totalstart);         /* for total elapsed time */
1504     timestart = totalstart;     /* for Pass 1 elapsed time */
1505
1506     /*------------ PASS 1:  ------------
1507      * Main Read Loop.  For each text record in input file,
1508      * parse and stem words, store them into binary tree
1509      * inverted index in memory.
1510      * The first few lines are database administrative values.
1511      * They are presumed ascii and read with fgets() as
1512      * 'lines' terminated with \n.  The text of the document
1513      * itself is presumed to be in the appropriate database
1514      * 'language', so it is *not* presumed to be lines
1515      * terminated with \n.  The document text is read by
1516      * the language's parser() a 'word' at a time, which
1517      * ultimately means a byte at a time.
1518      */
1519     printf (catgets (dtsearch_catd, MS_cborodin, 1108,
1520         "%s: Beginning Pass 1, reading records from '%s'.\n"
1521         "   Each dot = %d records.\n"),
1522         aa_argv0, fname_input, recs_per_dot);
1523     dotcount = 0;
1524
1525     while (!feof(instream)) {
1526
1527         /* 1. Read and discard the FZKEY line.
1528          * 2. Read and discard the ABSTRACT line.
1529          * 3. Read the UNIQUE KEY for the record.
1530          *    Do some record initialization steps here.
1531          * 4. Read and discard the DATE line.
1532          * 5. Let the parser read and parse rest of record, ie doc text...
1533          */
1534
1535         /*----- READ LINE #1, fzkey -----*/
1536         if (fgets (inbuf, inbufsz, instream) == NULL)
1537             break;
1538         inbuf [inbufsz] = 0;    /* just to be sure */
1539
1540         if (shutdown_now) {
1541             printf (catgets (dtsearch_catd, MS_cborodin, 164,
1542                 "\n%s: %s Abort due to signal %d.  Database %s\n"
1543                 "  possibly corrupted.  Restore backup database.\n"),
1544                 aa_argv0, PROGNAME"1299", shutdown_now, dicname);
1545             DtSearchExit (11);
1546         }
1547
1548         /* Silently skip null records just like dtsrload */
1549         if (strcmp (inbuf, parg.etxdelim) == 0)
1550             continue;
1551
1552         record_count++;
1553
1554         /*----- READ LINE #2, abstract -----*/
1555         if (fgets (inbuf, inbufsz, instream) == NULL) {
1556 INVALID_FZK_FORMAT:
1557             printf (catgets (dtsearch_catd, MS_cborodin, 1129,
1558                 "%s: %s Invalid .fzk file format.\n"),
1559                 fname_input, PROGNAME"1129");
1560             DtSearchExit (22);
1561         }
1562         inbuf[inbufsz] = 0;     /* just to be sure */
1563
1564         /*--- READ LINE #3, unique database key ---*/
1565         if (fgets (inbuf, inbufsz, instream) == NULL)
1566             goto INVALID_FZK_FORMAT;
1567         inbuf[inbufsz] = 0;     /* just to be sure */
1568
1569         if ((cptr = strtok (inbuf, " \t\n")) == NULL)
1570             goto INVALID_FZK_FORMAT;
1571
1572         /* If necessary, discard long keys exactly like cravel */
1573         if (strlen (cptr) >= DtSrMAX_DB_KEYSIZE) {
1574             printf (catgets (dtsearch_catd, MS_cborodin, 659,
1575                 "\n%s: %s Discarding record, key too long:\n  '%s'.\n"),
1576                 aa_argv0, PROGNAME"659", cptr);
1577             discard_to_ETX (&parg);
1578             continue;
1579         }
1580         strcpy (db_key, cptr);
1581
1582         /* Skip duplicate record ids in same order as dtsrload */
1583         i = is_duprec (db_key);
1584         if (i == 2) {   /* out of memory */
1585             printf (catgets (dtsearch_catd, MS_cborodin, 374, msg_374),
1586                     PROGNAME"1317");
1587             DtSearchExit (57);
1588         }
1589         else if (i == 1) {      /* duplicate record id */
1590             duplicate_recids++;
1591             if (dotcount > 0)
1592                     putchar ('\n');
1593             printf (catgets (dtsearch_catd, MS_cborodin, 1402,
1594                 "%s: Discarded duplicate rec #%lu '%s'.\n"),
1595                 aa_argv0, record_count, db_key);
1596             discard_to_ETX (&parg);
1597             continue;
1598         }
1599
1600         /****** FFFFFFFFFFFFFFFFFFFFF **********/
1601         /* Convert database address (slot #) to 'record number',
1602          * what dba would have been if all records took up
1603          * only one slot and there were no dbrec at top of file.
1604          * Record numbers on d99, like dba's, start at #1,
1605          * but rec numbers here (in bit vector) start at #0.
1606          */
1607         KEYFIND (PROGNAME "222", OR_OBJKEY, (char *) db_key, 0);
1608         if (db_status != S_OKAY) {
1609             normal_retncode = 1;        /* = 'warning' */
1610             if (dotcount > 0)
1611                 putchar ('\n');
1612             printf (catgets (dtsearch_catd, MS_cborodin, 1168,
1613                 "%s: %s Discarded '%s', key not in database.\n"),
1614                 aa_argv0, PROGNAME"1168", displayable(db_key));
1615             discard_to_ETX (&parg);
1616             continue;
1617         }
1618
1619         CRGET (PROGNAME "224", &temp_dba, 0); /* vista already byte swapped */
1620         temp_dba &= 0x00FFFFFF; /* = slot# */
1621         dba = (temp_dba + dba_offset) / or_recslots; /* = rec#, base 1 */
1622         /*
1623          * Don't change this 'dba'!--eventually it goes
1624          * into d99 in this exact format!  It will also
1625          * be used as an index into dbas_word_count[] in
1626          * load_into_bintree() so do a sanity check
1627          * to make sure that it hasn't exceeded the size
1628          * of that array.  (The count increments have been
1629          * reported as 'uninitialized memory reads'
1630          * by a debugger).  This might happen for example
1631          * if user failed to run dtsrload before dtsrindex?
1632          */
1633         if (dba < 1  ||  dba > or_maxdba) {
1634             printf ( catgets(dtsearch_catd, MS_cborodin, 21,
1635                 "\n%s '%s' record overflows word counter array.\n"
1636                 "Record number %ld > maxdba %ld, dba=%ld, "
1637                 "recslots=%ld, offs=%d.\n") ,
1638                 PROGNAME"1526", displayable(db_key),
1639                 (long)dba, (long)or_maxdba, (long)temp_dba,
1640                 (long)or_recslots, (int)dba_offset);
1641             DtSearchExit (68);
1642         }
1643         temp_dba = dba - 1;     /* = rec# starting at 0 */
1644         cur_byte = temp_dba >> 3;       /* bits to bytes: div by 8 */
1645         if (cur_byte >= bit_vector_size) {
1646             printf ( catgets(dtsearch_catd, MS_cborodin, 22,
1647                 "\n%s '%s' record in database (dba=%ld)\n"
1648                 "  overflows bitvector allocation (%ld >= %ld).\n") ,
1649                 PROGNAME"1475", displayable(db_key), (long)dba,
1650                 (long)cur_byte, (long)bit_vector_size);
1651             DtSearchExit (69);
1652         }
1653         dbas_bits_batch[cur_byte] |= 1 << (temp_dba % 8);
1654
1655         /* Print occasional progress dots and msgs */
1656         if (!(record_count % recs_per_dot)) {
1657             putchar ('.');
1658             dotcount++;
1659             if (!(dotcount % 10))
1660                 putchar (' ');
1661             if (dotcount >= 50) {
1662                 dotcount = 0;
1663                 bytes_in = ftell (instream);
1664                 seconds_left = (unsigned long)
1665                     (((float) fstat_input.st_size /
1666                     (float) bytes_in - 1.) *
1667                     (float) (time (NULL) - timestart));
1668                 printf (catgets (dtsearch_catd, MS_cborodin, 1190,
1669                     "\n%s: Rec #%lu, %.0f%% done.  "
1670                     "Est %lum %02lus to end Pass 1.\n"),
1671                     aa_argv0,
1672                     record_count,
1673                     (float) bytes_in / (float) fstat_input.st_size * 100.0,
1674                     seconds_left / 60UL,
1675                     seconds_left % 60UL);
1676             }
1677             fflush (stdout);
1678         }
1679
1680         /*----- READ LINE #4, date -----*/
1681         if (fgets (inbuf, inbufsz, instream) == NULL)
1682             goto INVALID_FZK_FORMAT;
1683         inbuf[inbufsz] = 0;     /* just to be sure */
1684
1685         /* PARSE LOOP FOR CURRENT TEXT BLOCK.
1686          * We must be in the middle of a record ('lines' #5 and beyond).
1687          * From here to ETX, which is either the record delimiter string
1688          * or the end of file, read the file a 'word' at a time
1689          * using the parse() function for the language specified
1690          * for the database.
1691          * Load_into_bintree() stores each token into
1692          * inverted index binary tree.
1693          * Note: dba here MUST still be rec#, base 1.
1694          * It's stored as is by load_into_bintree(),
1695          * and will be moved as is into d99 file in Pass 2.
1696          */
1697         if (debugging & DEBUG_P)
1698             printf ("\nRecord #%lu '%s'\n"
1699                     "Offset Word----               Stem----\n",
1700                 record_count, db_key);
1701         for (   cptr = dblk.parser (&parg);
1702                 cptr;
1703                 cptr = dblk.parser (NULL)) {
1704
1705             if (debugging & DEBUG_P) {
1706                 printf ("%6ld %s %n", word_offset, cptr, &i);
1707                 if (!(debugging & DEBUG_I))
1708                     while (i++ < 30)
1709                         putchar (' ');
1710             }
1711             load_into_bintree (cptr, FALSE, dba);
1712             cptr = dblk.stemmer (cptr, &dblk);
1713             if (debugging & DEBUG_P) {
1714                 printf ("%s\n", cptr);
1715                 fflush (stdout);
1716             }
1717             load_into_bintree (cptr, TRUE, dba);
1718         }
1719
1720     } /* end of PASS 1 Main read loop */
1721
1722     elapsed = time(NULL) - timestart;
1723     if (dotcount > 0) {
1724         putchar ('\n');
1725         dotcount = 0;
1726     }
1727     if (duplicate_recids > 0L) {
1728         normal_retncode = 1;    /* 'warning' */
1729         sprintf (buf, catgets (dtsearch_catd, MS_cborodin, 40,
1730             "Ignored %ld duplicate records"),
1731             duplicate_recids);
1732     }
1733     else
1734         strcpy (buf, catgets (dtsearch_catd, MS_cborodin, 41,
1735             "No duplicate records found"));
1736     printf (catgets (dtsearch_catd, MS_cborodin, 1225,
1737         "%s: Pass 1 completed in %lum %lus, read %lu records.\n"
1738         "  %s, parsed %lu words.\n"),
1739         aa_argv0, elapsed / 60L, elapsed % 60L, record_count,
1740         buf, num_of_diff_words);
1741     if (record_count > batch_size) {
1742         printf (catgets (dtsearch_catd, MS_cborodin, 33,
1743             "\n%s Number of incoming records exceeded %d.\n"
1744             "  This will usually result in 'Out of Paging Space' "
1745             "error in Pass 2\n"
1746             "  and corruption of database.  Either split the incoming file to\n"
1747             "  reduce record count or use the -b option, and rerun.\n"),
1748             PROGNAME"33", (int)batch_size);
1749         DtSearchExit (33);
1750     }
1751
1752     /*----------------- PASS 2:  -----------------
1753      * Traverse completed binary tree and write it to d99 file.
1754      */
1755     printf (catgets (dtsearch_catd, MS_cborodin, 1233,
1756         "%s: Beginning Pass 2: batch index traversal and database update.\n"
1757         "  Each dot = %d words.\n"),
1758         aa_argv0, words_per_dot);
1759     dotcount = 0;
1760     time (&timestart);
1761     traverse_tree ();   /* actual Pass 2 */
1762     if (dotcount) {
1763         putchar ('\n');
1764         dotcount = 0;
1765     }
1766
1767     /* Write header information to the d99 file */
1768     if (!fwrite_d99_header (&fl_hdr, dtbs_addr_fp)) {
1769         printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
1770             PROGNAME"1723", strerror(errno));
1771         DtSearchExit (13);
1772     }
1773     d_close ();
1774     fclose (dtbs_addr_fp);
1775
1776     elapsed = time (NULL) - timestart;
1777     printf (catgets (dtsearch_catd, MS_cborodin, 1246,
1778         "%s: Pass 2 completed in %lum %lus, updated %lu words.\n"),
1779         aa_argv0, elapsed / 60L, elapsed % 60L, count_word_ii);
1780     if (normal_retncode == 1)
1781         printf (catgets (dtsearch_catd, MS_cborodin, 2,
1782             "%s: Warnings were detected.\n"), aa_argv0);
1783     DtSearchExit (normal_retncode);
1784
1785 } /* main() */
1786
1787 /*************************** DTSRINDEX.C ****************************/
1788