cde/programs/dtsr/dtsrindex.c

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these librararies and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /*
  24  *   COMPONENT_NAME: austext
  25  *
  26  *   FUNCTIONS: descend_tree
  27  *              displayable
  28  *              fill_data1
  29  *              load_into_bintree
  30  *              main
  31  *              print_exit_code
  32  *              print_usage_msg
  33  *              put_addrs_2_dtbs_addr_file
  34  *              segregate_dicname
  35  *              traverse_tree
  36  *              user_args_processor
  37  *              write_2_dtbs_addr_file
  38  *              write_new_word_2_dtbs
  39  *              write_to_file
  40  *
  41  *   ORIGINS: 27
  42  *
  43  *
  44  *   (C) COPYRIGHT International Business Machines Corp. 1992,1996
  45  *   All Rights Reserved
  46  *   Licensed Materials - Property of IBM
  47  *   US Government Users Restricted Rights - Use, duplication or
  48  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  49  */
  50 /************************ DTSRINDEX.C *******************************
  51  * $XConsortium: dtsrindex.c /main/10 1996/09/23 21:02:54 cde-ibm $
  52  * CDE version of borodin.c
  53  * Formerly dtsrindex.c was cborodin.c.
  54  *
  55  * INPUT FORMAT:
  56  * Text file in FZK format.
  57  * Each record contains 4 formatted 'lines' (text strings ending in \n):
  58  * 1. fzkey (not used in this program).
  59  * 2. abstract (not used in this program).
  60  * 3. unique database key for the record.  Used to find the database
  61  *    address of the record which is the reference for the inverted index.
  62  * 4. The record's date (not used in this program).
  63  *
  64  * The rest of the record is unformatted text (not necessarily organized
  65  * into 'lines').  It is read a character at a time and parsed into
  66  * individual words by the parser function for the database's language.
  67  * Each record ends with a delimiter string specified by command line arg.
  68  *
  69  * $Log$
  70  * Revision 2.8  1996/04/10  19:50:38  miker
  71  * Deleted dangerous and unnecessary -a option.
  72  *
  73  * Revision 2.7  1996/03/25  18:54:15  miker
  74  * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  75  *
  76  * Revision 2.6  1996/02/01  18:25:44  miker
  77  * AusText 2.1.11, DtSearch 0.3.  Pass 1 changed to accommodate
  78  * new single-character reading parser/stemmers.
  79  *
  80  * Revision 2.5  1995/12/29  17:16:04  miker
  81  * Bug fix: Opened wrong msg catalog.
  82  *
  83  * Revision 2.4  1995/12/27  21:18:40  miker
  84  * Msg bug: 'percent done' was negative number.
  85  *
  86  * Revision 2.3  1995/12/01  16:15:44  miker
  87  * Deleted unnecessary log2 var, conflict with Solaris function.
  88  * Added -r command line arg.
  89  *
  90  * Revision 2.2  1995/10/26  15:26:53  miker
  91  * Added prolog.
  92  *
  93  * Revision 2.1  1995/09/22  19:29:53  miker
  94  * Freeze DtSearch 0.1, AusText 2.1.8
  95  *
  96  * Revision 1.3  1995/09/05  21:08:54  miker
  97  * Fixed bug: appeared as if 1 and 2 char 'words' were being indexed.
  98  * Added DEBUG_P switch.
  99  *
 100  * Revision 1.2  1995/09/01  22:17:02  miker
 101  * Fixed solaris segfault: too many args to printf in print_usage().
 102  *
 103  * Revision 1.1  1995/08/31  20:51:08  miker
 104  * Initial revision of dtsrindex.c, copied from cborodin.c.
 105  *
 106  * Log: cborodin.c,v
 107  * Revision 1.18  1995/05/30  18:58:54  miker
 108  * Correct bug introduced by previous fix (2.1.5c).
 109  *
 110  * Revision 1.17  1995/05/18  22:54:08  miker
 111  * 2.1.5b cborodin bug.  Segfault due to overflowing bitvector
 112  * after many deletions and no mrclean.
 113  */
 114 #include "SearchP.h"
 115 #include <limits.h>
 116 #include <stdlib.h>
 117 #include <unistd.h>
 118 #include <string.h>
 119 #include <ctype.h>
 120 #include <time.h>
 121 #include <errno.h>
 122 #include <math.h>
 123 #include <sys/stat.h>
 124 #include <locale.h>
 125 #include "vista.h"
 126
/* Functions referenced below whose definitions are not in the visible
 * part of this file.
 * NOTE(review): fill_data1() is named in this file's prolog FUNCTIONS
 * list, so it is presumably defined further down -- confirm.
 */
extern void     find_keyword (char *cur_word, int vista_num);
extern void     read_wordstr (struct or_hwordrec * glob_word, int vista_num);
extern void     write_wordstr (struct or_hwordrec * glob_word, int vista_num);
extern void     fill_data1 (char *ch);

/* Program id; concatenated with a number to tag diagnostic messages */
#define PROGNAME        "DTSRINDEX"

#define BATCH_SIZE      10000L  /* default max batch size shown for -b */
#define WORDS_PER_DOT   500     /* initial value of words_per_dot */
#define RECS_PER_DOT    20      /* default Pass 1 records-per-dot (-r) */
#define INBUFSZ         1024    /* default input text header line size */
/* Message catalog set numbers (second argument of catgets()) */
#define MS_misc         1
#define MS_cborodin     14
 140
/******************* BIT VECTORS *****************/
DB_ADDR        *word_addrs_ii;          /* fread buf for d99 (= tot # dbas) */
DtSrINT32       *dbas_word_count;       /* indexed by dba in write_to_file();
                                         * supplies divisor of word weight */
char           *dbas_bits_batch;
DB_ADDR        *record_addr_word;       /* dba+weight entries for one word,
                                         * filled in by write_to_file() */
DtSrINT32       num_addrs_for_word;     /* # entries in record_addr_word[] */
DtSrINT32       or_reccount;
DtSrINT32       bit_vector_size;
 149
/*-------------------------- GLOBALS ----------------------------*/
/* batch_size also used by fileman.c for allocating unused holes
 * in order to not go past end of 'record_addr_word' array.
 * It can be changed from the command line with -b.
 */
extern DtSrINT32  batch_size;

char            buf[1024];
static int      cache_size =            CACHE_SIZE;     /* -c argument */
static int      check_existing_addrs =  TRUE;
long            count_word_ii =         0L;
long            dbkey_seqno =           0L;
DBLK            dblk;           /* name and path set by user_args_processor() */
DBREC           dbrec;
/* Bit mask of the DEBUG_x switches below, set by the -D argument */
static int      debugging =             0;
  #define DEBUG_I       0x01    /* P1 tree insertions */
  #define DEBUG_P       0x10    /* P1 parser/stemmer */
  #define DEBUG_T       0x02    /* P2 tree dump (words) */
  #define DEBUG_N       0x04    /* P2 NEW words, vista */
  #define DEBUG_O       0x08    /* P2 OLD words, vista)  */
  #define DEBUG_t       0x20    /* P2 tree dump (dbas) */
  #define DEBUG_n       0x40    /* P2 NEW d99 for new words */
  #define DEBUG_o       0x80    /* P2 OLD d99 updates for old words */
static unsigned long
                default_hashsize;       /* printed in usage text for -h */
char            dicname [10];   /* 1 - 8 char database name from -d */
char            dicpath [_POSIX_PATH_MAX];      /* path prefix from -d */
/* Nonzero makes print_exit_code() emit a newline before its message */
static int      dotcount =              0;
char            dtbs_addr_file [_POSIX_PATH_MAX];
FILE           *dtbs_addr_fp;
long            dtbs_size_records =     0L;
static long     duplicate_recids =      0L;
struct stat     fstat_input;
FILE_HEADER     fl_hdr;
/* Input file name with extension, built by user_args_processor() */
static char     fname_input [_POSIX_PATH_MAX];
struct or_hwordrec
                got_word;
static FILE    *instream;
char            *inbuf;
int             inbuf_overflowed =      FALSE;
size_t          inbufsz =               INBUFSZ;        /* -i argument */
int             is_pmr;
static DtSrINT32
                or_maxdba =             0;
/* Default texts of numbered diagnostic messages */
static char     msg_374[] =     "\n%s Out of Memory!\n"
                                "  Split the incoming file into several "
                                "smaller files and try again.\n";
static char     msg_776[] =     "\n%s Write Failure d99 file: %s\n";
char            new_dtbs_file =         FALSE;
long            num_of_diff_words =     0L;
int             normal_retncode =       0;
static PARG     parg;           /* parg.etxdelim is set by the -t argument */
int             parsep_char =           END_RETAIN_PAGE;
char            rec_type;
unsigned long   record_count =          0UL;
int             record_lines;
static int      recs_per_dot =          RECS_PER_DOT;   /* -r argument */
static unsigned long
                seconds_left;
extern int      shutdown_now;
static DtSrINT32
                or_recslots;
char            *sprintbuffer =         NULL;
char            *temp =                 NULL;
extern int      debugging_teskey;
time_t          timestart =             0;
/* Start time; print_exit_code() overwrites it with elapsed seconds */
time_t          totalstart =            0;
static int      words_per_dot =         WORDS_PER_DOT;
 217
/************************************************/
/*                                              */
/*                   DBALIST                    */
/*                                              */
/************************************************/
/* One member of the singly linked list of documents that hangs
 * off each word tree node (TREENODE.dba_list).
 */
typedef struct dba_str {
    DB_ADDR             dba;        /* document address; also used as the
                                     * index into dbas_word_count[] */
    DtSrINT32           w_c;        /* count for this word in this document;
                                     * numerator of weight in write_to_file() */
    struct dba_str      *next_dba;  /* next list member, NULL at end */
}               DBALIST;
 228
/************************************************/
/*                                              */
/*                   TREENODE                   */
/*                                              */
/************************************************/
/* Node of the binary tree that Pass 2 traverses.
 * write_to_file() treats each node as a word (or stem)
 * plus the linked list of documents it occurs in.
 */
typedef struct _treen_ {
    char           *word;       /* ptr to word in stop list */
    struct _treen_ *llink;      /* left link in binary tree */
    struct _treen_ *rlink;      /* ptr to right link in binary tree */
    DBALIST        *dba_list;   /* head of this word's document list */
}               TREENODE;

static TREENODE *root_node =            NULL;   /* NULL means empty tree */
/* Working state shared by traverse_tree() and descend_tree().
 * Links of nodes on the current path are temporarily overwritten
 * during the traversal, so these must not be disturbed mid-walk.
 */
static TREENODE *top_of_stack;
static TREENODE *stack;
static TREENODE *pres;          /* node currently being examined */
static TREENODE *prev;          /* node the walk arrived from */
static TREENODE *next;          /* scratch used while swapping links */
static TREENODE *avail_node;    /* last leaf reached by descend_tree();
                                 * traverse_tree() reuses it as a stack cell */
 248
 249
 250
 251 /************************************************/
 252 /*                                              */
 253 /*                 displayable                  */
 254 /*                                              */
 255 /************************************************/
 256 /* Returns static string same as passed string except nonprintable
 257  * and nonascii chars replaced by '^' for display.
 258  */
 259 static char     *displayable (char *passed_string)
 260 {
 261     static char         *buf =          NULL;
 262     static size_t       buflen =        0;
 263     size_t              passed_len =    strlen (passed_string);
 264     char                *targ, *src;
 265     if (buflen < passed_len) {
 266         if (buf)
 267             free (buf);
 268         buflen = passed_len;
 269         buf = austext_malloc (buflen + 4, PROGNAME"158", NULL);
 270     }
 271     targ = buf;
 272     for (src = passed_string;  *src != 0;  src++) {
 273         if (*src >= 32  && *src < 127)
 274             *targ++ = *src;
 275         else
 276             *targ++ = '^';
 277     }
 278     *targ = 0;
 279     return buf;
 280 } /* displayable() */
 281
 282
 283 /************************************************/
 284 /*                                              */
 285 /*               print_exit_code                */
 286 /*                                              */
 287 /************************************************/
 288 /* Called from inside DtSearchExit() at (*austext_exit_last)() */
 289 static void     print_exit_code (int exit_code)
 290 {
 291     if(dotcount) {
 292         putchar ('\n');
 293         dotcount = 0;
 294     }
 295     /* Put total seconds into totalstart */
 296     if (totalstart > 0)
 297         totalstart = time (NULL) - totalstart;
 298     printf (catgets (dtsearch_catd, MS_cborodin, 206,
 299         "%s: Exit Code = %d, Total elapsed time %ldm %lds.\n"),
 300         aa_argv0, exit_code, totalstart / 60L, totalstart % 60L);
 301     return;
 302 }       /* print_exit_code() */
 303
 304
 305 /****************************************/
 306 /*                                      */
 307 /*           write_to_file()            */
 308 /*                                      */
 309 /****************************************/
 310 /* This is the 'visit node' point for the tree traversal
 311  * functions of Pass 2 (traverse_tree() and descend_tree()).
 312  *
 313  * Each tree node = word or stem + linked list of dbas.
 314  * When called, each dba list member just contains the number
 315  * of times the token appears in that document.  This function
 316  * chains through the list, builds a statistical 'weight'
 317  * for each doc/word pair, and stores it as a reformatted 'dba'
 318  * in array 'record_addr_word[]', in 'host' byte swap order.
 319  * The count of the current number of addrs
 320  * in the array is stored in 'num_addrs_for_word'.
 321  * Fill_data1() is then called to update or write a new
 322  * vista record and d99 data for the token.
 323  *
 324  * The weight stored for each doc-word instance is 1 byte.
 325  * It's the ratio of log of number of times given word occurs in doc,
 326  * divided by log of total count of all words in doc,
 327  * scaled to range 0 to 255.
 328  * Fundamentally it's a word count of that word in the doc,
 329  * but adjusted as follows:
 330  * 1) Large occurrances in small documents weigh more than
 331  *    the same number of occurrances in large documents.
 332  * 2) Taking the log skews the ratio to be more linear,
 333  *    ie take advantage of higher ranges of the 'weight'.
 334  *    For example a word that occurs in 10% of the document,
 335  *    will have a weight of .5 (50%).
 336  * 3) The scaling changes the ratio, a float between 0. and .9999,
 337  *    to an integer between 0 and 255.
 338  */
 339 void            write_to_file (TREENODE * output_node)
 340 {
 341     DBALIST     *print_dba;
 342     DB_ADDR     mydba;
 343
 344     /* 'record_addr_word[]' was permanently allocated
 345      * with a size = max batch size so it can hold
 346      * all the addrs for a single word node in the tree.
 347      * In effect it will replace the dba linked list.
 348      * Note: word_addrs_ii (io buffer for d99 file) != record_addr_word[].
 349      */
 350
 351     if (debugging & (DEBUG_T | DEBUG_t)) {      /* Print out tree node */
 352         printf (" node '%s' %c%c%c",
 353             displayable(output_node->word),
 354             (output_node->llink)? 'L' : '.',
 355             (output_node->rlink)? 'R' : '.',
 356             (debugging & DEBUG_t)? '\n' : ' ');
 357     }
 358
 359     num_addrs_for_word = 0;     /* DtSrINT32 */
 360     print_dba = output_node->dba_list;
 361     while (print_dba != NULL) {
 362
 363         mydba = print_dba->dba;
 364         if (debugging & DEBUG_t)
 365             printf ("    dba #%ld: node adr=%ld cnt=%ld",
 366                 (long)num_addrs_for_word, (long)mydba, (long)print_dba->w_c);
 367
 368         record_addr_word [num_addrs_for_word] =
 369             mydba << 8;  /* rec# in hi 3 bytes */
 370         record_addr_word [num_addrs_for_word] +=
 371             (log ((double) (print_dba->w_c) + 0.5) /
 372             log ((double) (dbas_word_count[mydba] + 1))) * 256;
 373
 374         if (debugging & DEBUG_t)
 375             printf ("  -> x%lx (%ld:%ld)\n",
 376                 (long)record_addr_word [num_addrs_for_word],
 377                 (long)record_addr_word [num_addrs_for_word] >> 8,
 378                 (long)record_addr_word [num_addrs_for_word] & 0xffL);
 379
 380         print_dba = print_dba->next_dba;
 381         num_addrs_for_word++;
 382         if (num_addrs_for_word >= batch_size) {
 383             printf (catgets (dtsearch_catd, MS_cborodin, 280,
 384                 "\n%s num_addrs_for_word (%ld) >= batchsz (%ld).\n"),
 385                 PROGNAME"280", (long)num_addrs_for_word, (long)batch_size);
 386             DtSearchExit (91);
 387         }
 388     }
 389     if ((debugging & DEBUG_T)  && !(debugging & DEBUG_t))
 390         printf (" dbacnt=%ld\n", (long)num_addrs_for_word);
 391
 392     fill_data1 (output_node->word);
 393
 394     return;
 395 } /* write_to_file() */
 396
 397
 398 /****************************************/
 399 /*                                      */
 400 /*           descend_tree()             */
 401 /*                                      */
 402 /****************************************/
 403 /* Coroutine of traverse_tree(), Pass 2 Robson tree traversal.
 404  * The write_to_file() function is the 'preorder visit' point.
 405  */
 406 void            descend_tree (void)
 407 {
 408     int             not_done = TRUE;
 409
 410     while (not_done) {
 411         if ((pres->llink == NULL) && (pres->rlink == NULL)) {
 412             write_to_file (pres);
 413             avail_node = pres;
 414             return;
 415         }
 416         if (pres->llink != NULL) {
 417             next = pres->llink;
 418             pres->llink = prev;
 419             prev = pres;
 420             pres = next;
 421         }
 422         else {
 423             write_to_file (pres);
 424             next = pres->rlink;
 425             pres->rlink = prev;
 426             prev = pres;
 427             pres = next;
 428         }
 429     }
 430     return;
 431 } /* descend_tree() */
 432
 433
 434 /********************************/
 435 /*                              */
 436 /*        traverse_tree         */
 437 /*                              */
 438 /********************************/
 439 /* This is the actual Pass 2 function, a tree traversal
 440  * of Pass 1's word-dba binary tree.
 441  * The algorithm is based on the J. M. ROBSON link inversion traversal
 442  * algorithm for binary trees. Ref. Thomas A. STANDISH  pp. 77-78.
 443  * The write_to_file() function is the 'preorder visit' point.
 444  */
 445 void            traverse_tree (void)
 446 {
 447     int             not_done = TRUE;
 448     int             descend = TRUE;
 449
 450     /* Dheck for the empty tree */
 451     if (root_node == NULL) {
 452         printf (catgets (dtsearch_catd, MS_cborodin, 288,
 453             "%s Abort. There are no words in the input file %s.\n"),
 454             PROGNAME"288", fname_input);
 455         DtSearchExit (34);
 456     }
 457     /* Initialize the variables */
 458     pres = root_node;
 459     prev = pres;
 460     top_of_stack = NULL;
 461     stack = NULL;
 462
 463     while (not_done) {
 464         if (descend) {
 465             descend_tree ();
 466         }
 467         if (pres == root_node) {
 468             return;
 469         }
 470         if (prev->rlink == NULL) {
 471             write_to_file (prev);
 472             next = prev->llink;
 473             prev->llink = pres;
 474             pres = prev;
 475             prev = next;
 476             descend = FALSE;
 477         }
 478         else {
 479             if (prev->llink == NULL) {
 480                 next = prev->rlink;
 481                 prev->rlink = pres;
 482                 pres = prev;
 483                 prev = next;
 484                 descend = FALSE;
 485             }
 486             else {
 487                 if (prev == top_of_stack) {
 488                     next = stack;
 489                     top_of_stack = stack->rlink;
 490                     stack = stack->llink;
 491                     next->llink = NULL;
 492                     next->rlink = NULL;
 493                     next = prev->llink;
 494                     prev->llink = prev->rlink;
 495                     prev->rlink = pres;
 496                     pres = prev;
 497                     prev = next;
 498                     descend = FALSE;
 499                 }
 500                 else {
 501                     write_to_file (prev);
 502                     avail_node->llink = stack;
 503                     avail_node->rlink = top_of_stack;
 504                     stack = avail_node;
 505                     top_of_stack = prev;
 506                     next = prev->rlink;
 507                     prev->rlink = pres;
 508                     pres = next;
 509                     descend = TRUE;
 510                 }
 511             }
 512         }
 513     }
 514 } /* traverse_tree() */
 515
 516
 517
 518 /********************************************************/
 519 /*                                                      */
 520 /*                 print_usage_msg                      */
 521 /*                                                      */
 522 /********************************************************/
 523 static void     print_usage_msg (void)
 524 {
 525                     printf (catgets (dtsearch_catd, MS_cborodin, 17,
 526 "\n"
 527 "USAGE: %s -d<dbname> [options] <infile>\n"
 528 "       Listed default file name extensions can be overridden.\n"
 529 "  -d<dbname>  1 - 8 character database name, include optional path prefix.\n"
 530 "  -t<etxstr>  End of text document delimiter string.  Default '\\f\\n'.\n"
 531 "  -r<N>       Change Pass 1 records-per-dot from %d to <N>.\n"
 532 "  -b<N>       Change max batch size from %ld to <N>.\n"
 533 "  -c<N>       Change database paging cache from %ld 1K pages to <N> 1K pages.\n"
 534 "              <N> >= 16 by powers of 2.  Initially try only small changes.\n"
 535 "  -i<N>       Change (i)nput buffer size from default %d to <N>.\n"
 536 "  -h<N>       Change duplicate record id hash table size from %ld to <N>.\n"
 537 "              -h0 means there are no duplicates, do not check for them.\n"
 538 "  <infile>    Input [path]file name.  Default extension %s.\n"),
 539         aa_argv0,
 540         (int) RECS_PER_DOT,
 541         (long) BATCH_SIZE,  (long) CACHE_SIZE,
 542         (int) INBUFSZ,  default_hashsize,  EXT_FZKEY);
 543     return;
 544 } /* print_usage_msg() */
 545
 546
 547 /********************************************************/
 548 /*                                                      */
 549 /*                segregate_dicname                     */
 550 /*                                                      */
 551 /********************************************************/
 552 /* Separates dictionary name from pathname and loads
 553  * them into the globals 'dicname' and 'dicpath'.
 554  * Returns TRUE if dicname is valid, else returns FALSE.
 555  */
 556 static int      segregate_dicname (char *string)
 557 {
 558     char            mybuf[_POSIX_PATH_MAX];
 559     char           *ptr;
 560     int             i;
 561
 562     strncpy (mybuf, string, sizeof (mybuf));
 563     mybuf[sizeof (mybuf) - 1] = 0;
 564
 565     /*
 566      * Set 'ptr' to just the 8 char dictionary name by moving
 567      * it backwards until first non-alphanumeric character
 568      * (such as a ":" in the dos drive id or a slash between directories),
 569      * or to the beginning of string.
 570      */
 571     for (ptr = mybuf + strlen (mybuf) - 1; ptr >= mybuf; ptr--)
 572         if (!isalnum (*ptr)) {
 573             ptr++;
 574             break;
 575         }
 576     if (ptr < mybuf)
 577         ptr = mybuf;
 578
 579     /* test for valid dictionary name */
 580     i = strlen (ptr);
 581     if (i < 1 || i > 8)
 582         return FALSE;
 583
 584     strcpy (dicname, ptr);
 585     *ptr = 0;
 586     strncpy (dicpath, mybuf, sizeof (dicpath));
 587     dicpath[sizeof (dicpath) - 1] = 0;
 588     return TRUE;
 589 } /* segregate_dicname() */
 590
 591
 592 /********************************************************/
 593 /*                                                      */
 594 /*                 USER_ARGS_PROCESSOR                  */
 595 /*                                                      */
 596 /********************************************************/
 597 /* handles command line arguments for 'main' */
 598 void            user_args_processor (int argc, char **argv)
 599 {
 600     char           *argptr;
 601     char           *targ, *src;
 602     int             i;
 603
 604     if (argc <= 1) {
 605         print_usage_msg ();
 606         DtSearchExit (2);
 607     }
 608     /* Initialize some variables prior to parsing command line */
 609     dicname[0] = 0;
 610     dicpath[0] = 0;
 611
 612     /* Each pass grabs new parm of "-xxx" format */
 613     while (--argc > 0 && (*++argv)[0] == '-') {
 614         argptr = argv[0];
 615         switch (argptr[1]) {
 616
 617             case 't':           /* ETX delimiter string */
 618                 /* Replace any "\n" string with real linefeed */
 619                 targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
 620                 src = argptr + 2;
 621                 while (*src) {
 622                     if (src[0] == '\\' && src[1] == 'n') {
 623                         *targ++ = '\n';
 624                         src += 2;
 625                     }
 626                     else
 627                         *targ++ = *src++;
 628                 }
 629                 *targ = 0;
 630                 break;
 631
 632             case 'r':
 633                 if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
 634                     printf (catgets (dtsearch_catd, MS_cborodin, 577,
 635                         "%s Invalid arg '%s'.  Using default -r%d.\n"),
 636                         PROGNAME"577", argptr, RECS_PER_DOT);
 637                     recs_per_dot = RECS_PER_DOT;
 638                 }
 639                 break;
 640
 641             case 'h':
 642                 duprec_hashsize = atol (argptr + 2);
 643                 if (duprec_hashsize == 0UL)
 644                     printf (catgets (dtsearch_catd, MS_cborodin, 539,
 645                         "%s Duplicate record id checking disabled.\n"),
 646                         PROGNAME"539");
 647                 break;
 648
 649             case 'b':
 650                 batch_size = atol (argptr + 2);
 651                 if (batch_size <= 0L) {
 652                     printf (catgets (dtsearch_catd, MS_cborodin, 595,
 653                         "%s Invalid batch size argument '%s'.\n"),
 654                         PROGNAME"595", argptr);
 655                     goto BADPARM;
 656                 }
 657                 break;
 658
 659             case 'c':
 660                 cache_size = atoi (argptr + 2);
 661                 if (cache_size < 16) {
 662                     /* minimum size is 16 */
 663                     if (cache_size > 0)
 664                         cache_size = 16;
 665                     /* on error reset size to default */
 666                     else
 667                         cache_size = CACHE_SIZE;
 668 CACHE_ADJUSTED:
 669                     printf (catgets (dtsearch_catd, MS_cborodin, 600,
 670                             "%sCache size readjusted to %d.\n"),
 671                         PROGNAME "600 ", cache_size);
 672                     break;
 673                 }
 674                 /* If necessary, round up to nearest power of 2 */
 675                 for (i = 4; i < 12; i++)
 676                     if (1 << i >= cache_size)
 677                         break;
 678                 i = 1 << i;
 679                 if (i != cache_size) {
 680                     cache_size = i;
 681                     goto CACHE_ADJUSTED;
 682                 }
 683                 break;
 684
 685             case 'D':           /* unadvertised debugging feature */
 686                 for (i = 2;  argptr[i] != 0;  i++) {
 687                     switch (argptr[i]) {
 688                         case 'I':       debugging |= DEBUG_I;  break;
 689                         case 'P':       debugging |= DEBUG_P;
 690                                 /******* debugging_teskey = TRUE; ******/
 691                                         break;
 692                         case 'N':       debugging |= DEBUG_N;  break;
 693                         case 'n':       debugging |= DEBUG_n;  break;
 694                         case 'O':       debugging |= DEBUG_O;  break;
 695                         case 'o':       debugging |= DEBUG_o;  break;
 696                         case 'T':       debugging |= DEBUG_T;  break;
 697                         case 't':       debugging |= DEBUG_t;  break;
 698                         default:        goto BADPARM;
 699                     }
 700                 }
 701                 break;
 702
 703             case 'd':
 704                 /* May include both dicname and dicpath */
 705                 if (!segregate_dicname (argptr + 2)) {
 706                     printf (catgets (dtsearch_catd, MS_cborodin, 550,
 707                         "%s '%s' is invalid path/database name.\n"),
 708                         PROGNAME"550", argptr);
 709                     goto BADPARM;
 710                 }
 711                 break;
 712
 713             case 'i':           /* (I)nput buffer size */
 714                 if ((inbufsz = atol (argptr + 2)) <= 0) {
 715                     printf (catgets (dtsearch_catd, MS_cborodin, 558,
 716                         "%s Invalid input buffer size '%s'.\n"),
 717                         PROGNAME"558", argptr);
 718                     goto BADPARM;
 719                 }
 720                 break;
 721
 722             default:
 723                 printf (catgets (dtsearch_catd, MS_cborodin, 567,
 724                     "%s Unknown command line argument '%s'.\n"),
 725                     PROGNAME"567", argptr);
 726 BADPARM:
 727                 print_usage_msg ();
 728                 DtSearchExit (2);       /* abort */
 729
 730         }                       /* endswitch */
 731     }                           /* endwhile for cmd line '-'processing */
 732
 733     /* Validate input file name */
 734     if (argc-- <= 0) {
 735         printf (catgets (dtsearch_catd, MS_cborodin, 580,
 736             "%s Missing required input file name.\n"),
 737             PROGNAME"580");
 738         goto BADPARM;
 739     }
 740     /* Don't incr argv yet--save input file name */
 741     else
 742         append_ext (fname_input, _POSIX_PATH_MAX, argv[0], EXT_FZKEY);
 743
 744     /* Check for missing database name */
 745     if (dicname[0] == 0) {
 746         printf (catgets (dtsearch_catd, MS_cborodin, 589,
 747             "%s No database name specified (-d argument).\a\n"),
 748             PROGNAME"589");
 749         goto BADPARM;
 750     }
 751     strcpy (dblk.name, dicname);
 752     dblk.path = dicpath;
 753     return;
 754 } /* user_args_processor() */
 755
 756
 757 /****************************************/
 758 /*                                      */
 759 /*      put_addrs_2_dtbs_addr_file      */
 760 /*                                      */
 761 /****************************************/
  762 /* Subroutine of write_2_dtbs_addr_file() from Pass 2.
 763  * That function has used a bit vector to determine
 764  * the total change in old d99 addrs for preexisting words,
 765  * and prepared for writing an array of old dbas that
 766  * are not in the current words tree node (globally named
 767  * word_addrs_ii [num_addrs]).
 768  * The addrs that ARE in the Pass 1 node fzk file were previously
 769  * prepared in a similar array of dbas, globally named
 770  * record_addr_word [num_addrs_for_word] but passed here as
 771  * 'addrs_array' and 'nitems'.
 772  * Both arrays will be byte swapped from 'host' to
 773  * 'network' order in this function.
 774  * This function does the actual fwrite of both arrays to the d99.
 775  * If the number of new addrs can fit in the available free slots,
 776  * it rewrites to original offset, otherwise appends to end of d99.
 777  */
 778 static void     put_addrs_2_dtbs_addr_file (
 779                     DB_ADDR     *addrs_array,
 780                     DtSrINT32   nitems)
 781 {
 782     FREE_SPACE_STR      *free_slot;
 783     FREE_SPACE_STR      del_rec;
 784     DtSrINT32           int32;
 785     DtSrINT32           num_writes;
 786     DtSrINT32           num_addrs;
 787
 788     if (nitems >= batch_size) {
 789         printf ( catgets(dtsearch_catd, MS_cborodin, 6,
 790             "put_addrs_2_dtbs_addr_file() nitems=%d, batchsz=%ld\n") ,
 791             (int)nitems, (long)batch_size);
 792         DtSearchExit (58);
 793     }
 794
 795     num_addrs = got_word.or_hwaddrs;
 796     got_word.or_hwaddrs += nitems;  /** somehow, this can exceed total
 797         **** num addrs in database by 1 (!?) ******/
 798         /* (...only if prev 'overlay/compression' didn't delete all) */
 799
 800 #ifdef BYTE_SWAP
 801         /* Put both arrays in 'network' byte order */
 802         for (int32 = 0;  int32 < nitems;  int32++)
 803             HTONL (addrs_array[int32]);
 804         for (int32 = 0;  int32 < num_addrs;  int32++)
 805             HTONL (word_addrs_ii[int32]);
 806 #endif
 807
 808     /*
 809      * If number of new addresses greater than number of free holes,
 810      * find new free slot that is big enough to hold the data .
 811      */
 812     if (nitems > got_word.or_hwfree) {
 813         /* Discard old slot, find new one. */
 814         del_rec.hole_size = num_addrs + got_word.or_hwfree;
 815         del_rec.offset = got_word.or_hwoffset;
 816         free_slot = find_free_space (got_word.or_hwaddrs, &fl_hdr);
 817         add_free_space (&del_rec, &fl_hdr);
 818         if (free_slot == NULL) {
 819             fseek (dtbs_addr_fp, 0L, SEEK_END);
 820             got_word.or_hwoffset = ftell (dtbs_addr_fp);
 821             got_word.or_hwfree = 0;
 822         }
 823         else {
 824             fseek (dtbs_addr_fp, free_slot->offset, SEEK_SET);
 825             got_word.or_hwoffset = free_slot->offset;
 826             got_word.or_hwfree = free_slot->hole_size -
 827                 got_word.or_hwaddrs;
 828         }
 829         /*----- Write new database addresses to a file -----*/
 830         num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
 831                 (size_t)nitems, dtbs_addr_fp);
 832         if (num_writes != nitems) {
 833             DtSearchExit (98);
 834         }
 835
 836         /* Copy the old addresses immediately after the new ones */
 837         num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR), (size_t)num_addrs,
 838             dtbs_addr_fp);
 839         if (num_writes != num_addrs) {
 840             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 841                 PROGNAME"776", strerror(errno));
 842             DtSearchExit (76);
 843         }
 844
 845         /* Write foxes to the free holes, if any, no byte swap */
 846         for (int32 = 0;  int32 < got_word.or_hwfree;  int32++)
 847             addrs_array [int32] = 0xFFFFFFFF;
 848         num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
 849             (size_t)got_word.or_hwfree, dtbs_addr_fp);
 850         if (num_writes != got_word.or_hwfree) {
 851             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 852                 PROGNAME"786", strerror(errno));
 853             DtSearchExit (86);
 854         }
 855     } /* end if (nitems > got_word.or_hwfree), had to get bigger slot */
 856
 857     /* Else can reuse existing slot.
 858      * Write the new addresses into free holes.
 859      * The remaining free holes should already have foxes. (?)
 860      */
 861     else {
 862         fseek (dtbs_addr_fp, got_word.or_hwoffset, SEEK_SET);
 863         num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
 864                 (size_t)nitems, dtbs_addr_fp);
 865         if (num_writes != nitems) {
 866             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 867                 PROGNAME"798", strerror(errno));
 868             DtSearchExit (87);
 869         }
 870         /* Copy the old addresses immediately after the new ones */
 871         num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR),
 872                 (size_t)num_addrs, dtbs_addr_fp);
 873         if (num_writes != num_addrs) {
 874             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
 875                 PROGNAME"889", strerror(errno));
 876             DtSearchExit (89);
 877         }
 878         got_word.or_hwfree -= nitems;
 879     }
 880 } /* put_addrs_2_dtbs_addr_file() */
 881
 882
 883 /****************************************/
 884 /*                                      */
 885 /*       write_2_dtbs_addr_file         */
 886 /*                                      */
 887 /****************************************/
 888 /* Subroutine of fill_data1() from Pass 2.
 889  * Updates OLD (preexisting) word's d99 file.
 890  *
 891  * The vista word rec has already been read into global 'got_word'.
 892  * record_addr_word [num_addrs_for_word] is the array of dba's
 893  * for docs from this batch that contain the current word (built by
 894  * fill_data1 from the dba_list for the word's Pass 1 binary tree node,
 895  * and still in 'host' byte swap order).
 896  * This function freads all the old addresses for that word from
 897  * the d99 file.  It then deletes(!) d99 addrs that
 898  * are in the word's Pass 1 tree node.  It then calls
 899  * put_addrs_2_dtbs_addr_file() to fwrite out the
 900  * dba's in the tree, which are either brand new,
 901  * or are 'updating' the deleted addrs.
 902  * Then it writes the modified old addrs.
 903  * Then rewrites vista word rec with new data.
 904  *
 905  * The bit vector dbas_bits_batch contains a 1 bit
 906  * for every dba for every doc in the fzk file.
 907  * got_word structure:
 908  * .or_hwordkey - the word. (always in a 'huge' word buffer).
 909  * .or_hwoffset - offset in a d99 inverted index file for
 910  *                    a given word. the first address starts
 911  *                    at this position.
 912  * .or_hwaddrs - total number of addresses for a given word.
 913  * .or_hwfree - number of free slots in a database
 914  *                       addresses file for a given word.
 915  */
 916 void            write_2_dtbs_addr_file (void)
 917 {
 918     DtSrINT32           num_addrs_ii;
 919     DtSrINT32           num_reads;
 920     DtSrINT32           i_start, k, cur_ind;
 921     DtSrINT32           num_delete_addrs = 0;
 922     char                addrs_removed = FALSE;
 923     register DtSrINT32  i;
 924     register DtSrINT32  cur_byte;
 925     register char       bit_addrs;
 926     register DB_ADDR    temp1;
 927
 928     if (debugging & DEBUG_O)
 929         printf ("  old vis '%s' ofs=%ld adr=%ld fre=%ld\n",
 930             displayable(got_word.or_hwordkey),
 931             (long) got_word.or_hwoffset,
 932             (long) got_word.or_hwaddrs,
 933             (long) got_word.or_hwfree);
 934
 935     num_addrs_ii = got_word.or_hwaddrs;
 936     if (num_addrs_ii > or_reccount) {
 937         printf (catgets (dtsearch_catd, MS_cborodin, 713,
 938             "\n%s Word '%s' occurs in %ld records,\n"
 939             "  but there are only %ld records in database!\n"
 940             "  (This may be a good candidate for the stoplist).\n"),
 941             PROGNAME"713",
 942             (long) got_word.or_hwordkey,
 943             (long) num_addrs_ii,
 944             (long) or_reccount);
 945         DtSearchExit (68);
 946     }
 947
 948     if (fseek (dtbs_addr_fp, (long) got_word.or_hwoffset, SEEK_SET) != 0)
 949         {
 950         printf (catgets (dtsearch_catd, MS_cborodin, 875,
 951             "\n%s Could not fseek d99 file to offset %ld.\n"),
 952             PROGNAME"875", got_word.or_hwoffset);
 953         DtSearchExit (98);
 954         }
 955     num_reads = fread (word_addrs_ii, sizeof(DB_ADDR),
 956         (size_t)num_addrs_ii, dtbs_addr_fp);
 957     if (num_reads != num_addrs_ii) {
 958         printf (catgets (dtsearch_catd, MS_cborodin, 848,
 959             "\n%s Could not fread %ld bytes (%ld dba's) of d99 file\n"
 960             "  at offset %ld.  Number of dba's read (return code) = %ld.\n"),
 961             PROGNAME"848", sizeof(DB_ADDR) * num_addrs_ii, (long)num_addrs_ii,
 962             (long)got_word.or_hwoffset, (long)num_reads);
 963         DtSearchExit (98);
 964     }
 965 #ifdef BYTE_SWAP
 966     for (i = 0; i < num_addrs_ii; i++)
 967         NTOHL (word_addrs_ii[i]);
 968     /* Now both addr arrays are in 'host' byte swap order */
 969 #endif
 970
 971     /* If there are only new docs,
 972      * this switch will prevent the checking for updates.
 973      */
 974     if (check_existing_addrs) {
 975         i_start = 0;
 976
 977         /* Loop on every preexisting dba for word as read from d99 */
 978         for (i = 0; i < num_addrs_ii; i++) {
 979             if (debugging & DEBUG_o)
 980                 printf ("  old d99 %ld: x%lx(%ld:%ld)",
 981                     (long) i,
 982                     (long) word_addrs_ii[i],
 983                     (long) word_addrs_ii[i] >> 8,
 984                     (long) word_addrs_ii[i] & 0xffL);
 985
 986             /* Get 'record number' by shifting hi 3 bytes 1 byte (8 bits)
 987              * to right over stat wt byte.  D99 rec#'s start at 1,
 988              * so subtract 1 to start at 0 for bit vector.
 989              */
 990             temp1 = (*(word_addrs_ii + i) >> 8) - 1;    /* = rec#, base 0 */
 991             cur_byte = temp1 >> 3;      /* get matching byte# in bit vector */
 992             if (cur_byte >= bit_vector_size) {
 993                 printf ( catgets(dtsearch_catd, MS_cborodin, 9,
 994                     "\n%s Corrupted d99 file for word '%s',\n"
 995                     " database address %ld @ file position %ld => bitvector[%ld],"
 996                     " but max bitvector allocation = %ld.\n") ,
 997                     PROGNAME"727", displayable(got_word.or_hwordkey),
 998                     (long)temp1, (long)i,
 999                     (long)cur_byte, (long)bit_vector_size);
1000                 DtSearchExit (69);
1001             }
1002             bit_addrs = 0;
1003             bit_addrs |= 1 << (temp1 % 8);      /* bit mask */
1004             /*
1005              * If this dba, which is on the current word's old d99
1006              * addrs list, is also a doc in the fzk file (dbas_bits_batch),
1007              * delete it from the d99 list by writing subsequent dba's
1008              * over it.  Boy this recursive nested loop has gotta be slow.
1009              * Faster algorithm?  Add 'good' addrs to the end of
1010              * record_addr_word[].  No nested overlay loop, only one write!
1011              */
1012             if (bit_addrs & (*(dbas_bits_batch + cur_byte))) {
1013                 addrs_removed = TRUE;
1014                 num_delete_addrs++;
1015                 if (i_start == 0) {
1016                     cur_ind = i;
1017                     i_start = i + 1;
1018                 }
1019                 else {
1020                     if (i_start < i) {
1021                         /* compress: move good addrs over
1022                          * space of deleted ones */
1023                         for (k = i_start; k < i; k++) {
1024                             word_addrs_ii[cur_ind] = word_addrs_ii[k];
1025                             cur_ind++;
1026                         }
1027                     }
1028                     i_start = i + 1;
1029                 }
1030             } /* end if where dba is on both fzk list and curr d99 */
1031         } /* end loop on every d99 addr for this word */
1032
1033         if (addrs_removed) {    /* final overlay compression */
1034             if (i_start < i) {
1035                 /* compress: move good addrs over
1036                  * space of deleted ones */
1037                 for (k = i_start; k < i; k++) {
1038                     word_addrs_ii[cur_ind] = word_addrs_ii[k];
1039                     cur_ind++;
1040                 }
1041             }
1042         }
1043     } /* end if (check_existing_addrs) */
1044
1045     got_word.or_hwaddrs -= num_delete_addrs;
1046     got_word.or_hwfree += num_delete_addrs;
1047
1048     /* The old dba array word_addrs_ii[] is now 'compressed',
1049      * it contains only addrs not in fzk file.
1050      * And the vista rec 'got_word' now matches it.
1051      * And record_addr_word[] still contains
1052      * the new/updated addrs from the fzk file.
1053      * Now Efim calls a func to write them both back out to d99 file.
1054      */
1055     put_addrs_2_dtbs_addr_file (record_addr_word, num_addrs_for_word);
1056     write_wordstr (&got_word, 0);       /* update vista WORD rec */
1057
1058     return;
1059 } /*  write_2_dtbs_addr_file() */
1060
1061
1062 /********************************/
1063 /*                              */
1064 /*      write_new_word_2_dtbs   */
1065 /*                              */
1066 /********************************/
1067 /* Subroutine of fill_data1() in Pass 2 for a NEW word.
1068  * Writes d99 data, and updates (empty) got_word vista record.
1069  * record_addr_word [num_addrs_for_word] is the array of addrs
1070  * for docs from this batch that contain the current word (built by
1071  * fill_data1 from the dba_list for the word's Pass 1 binary tree node).
1072  * It will be byte swapped from 'host' to 'network' order in this function.
1073  */
1074 void            write_new_word_2_dtbs (void)
1075 {
1076     FREE_SPACE_STR *free_slot;
1077     DtSrINT32   num_writes;
1078     int             ret_fseek;
1079     DtSrINT32   int32;
1080
1081     if (debugging & (DEBUG_n  | DEBUG_N))
1082         printf ("  new word '%s', adrs=%ld,",
1083             got_word.or_hwordkey, (long)num_addrs_for_word);
1084
1085     free_slot = find_free_space (num_addrs_for_word, &fl_hdr);
1086     if (free_slot == NULL) {
1087         /* append addrs to end of d99 file */
1088         ret_fseek = fseek (dtbs_addr_fp, 0L, SEEK_END);
1089         got_word.or_hwoffset = ftell (dtbs_addr_fp);
1090         got_word.or_hwfree = 0;
1091         if (debugging & (DEBUG_n  | DEBUG_N))
1092             printf ("APPEND ofs=%ld, fre=0\n", got_word.or_hwoffset);
1093     }
1094     else {
1095         ret_fseek = fseek (dtbs_addr_fp,
1096                 (long)free_slot->offset, SEEK_SET);
1097         got_word.or_hwoffset = free_slot->offset;
1098         got_word.or_hwfree = free_slot->hole_size -
1099             num_addrs_for_word;
1100         if (debugging & (DEBUG_n  | DEBUG_N))
1101             printf (" REUSE slot ofs=%ld, fre=%ld\n",
1102                 got_word.or_hwoffset, got_word.or_hwfree);
1103     }
1104
1105     /***** Write new database addresses to d99 file *********/
1106     if (debugging & DEBUG_n) {
1107         for (int32 = 0;  int32 < num_addrs_for_word;  int32++) {
1108             printf ("     dba #%ld: x%lx(%ld:%ld)\n",
1109                 (long)int32,
1110                 (long)record_addr_word[int32],
1111                 (long)record_addr_word[int32] >> 8,
1112                 (long)record_addr_word[int32] & 0xffL);
1113         }
1114     }
1115 #ifdef BYTE_SWAP
1116         /* Put addr array in 'network' byte order */
1117         for (int32 = 0;  int32 < num_addrs_for_word;  int32++)
1118             HTONL (record_addr_word[int32]);
1119 #endif
1120     num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
1121         (size_t)num_addrs_for_word, dtbs_addr_fp);
1122     if (num_writes != num_addrs_for_word)
1123         DtSearchExit (97);
1124
1125     got_word.or_hwaddrs = num_addrs_for_word;
1126
1127     if (got_word.or_hwfree != 0) {
1128         /* Fill unused free holes with foxes for debugging.
1129          * Note that byte swap is unnecessary for foxes.
1130          * Note that record_addr_word is now available for this action.
1131          */
1132         for (int32 = 0;  int32 < got_word.or_hwfree;  int32++)
1133             *(record_addr_word + int32) = 0xFFFFFFFF;
1134         num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
1135             (size_t)got_word.or_hwfree, dtbs_addr_fp);
1136         if (num_writes != got_word.or_hwfree) {
1137             printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
1138                 PROGNAME"960", strerror(errno));
1139             DtSearchExit (96);
1140         }
1141     }
1142
1143     /* Save changed word_info structure back to the vista database! */
1144     write_wordstr (&got_word, 0);
1145     return;
1146 } /* write_new_word_2_dtbs() */
1147
1148
1149 /************************/
1150 /*                      */
1151 /*      fill_data1      */
1152 /*                      */
1153 /************************/
1154 /* Called from write_to_file() in Pass 2.
1155  * Write_to_file() is 'visit node' function of tree traversal.
1156  * It has converted dbalist in each word node in tree to
1157  * array of dbas (record_addr_word [num_addrs_for_word])
1158  * with correct statistical weighting, still in 'host' byte swap order.
1159  * This function seeks word key in database.  If word is new,
1160  * it calls functions to write new vista rec and d99 data.
1161  * If word is old it calls functions to read word rec and update d99.
1162  */
1163 void            fill_data1 (char *node_word)
1164 {
1165     char            miker[1024];
1166     strcpy (miker, node_word);
1167
1168     count_word_ii++;
1169     if (shutdown_now) {
1170         printf (catgets (dtsearch_catd, MS_cborodin, 164,
1171             "\n%s Abort due to signal %d.  Database %s\n"
1172             "  probably corrupted.  Restore backup database.\n"),
1173             PROGNAME"164", shutdown_now, dicname);
1174         DtSearchExit (10);
1175     }
1176
1177     /* print occasional progress dots and msgs */
1178     if (!(count_word_ii % words_per_dot)) {
1179         putchar ('.');
1180         dotcount++;
1181         if (!(dotcount % 10))
1182             putchar (' ');
1183         if (dotcount >= 50) {
1184             dotcount = 0;
1185             seconds_left = (unsigned long)
1186                 (((float) num_of_diff_words /
1187                     (float) count_word_ii - 1.) *
1188                 (float) (time (NULL) - timestart));
1189             printf (catgets (dtsearch_catd, MS_cborodin, 849,
1190                 "\n%s: Word #%ld, %.0f%% done.  Est %lum %02lus "
1191                 "to completion.\n"),
1192                 aa_argv0, count_word_ii,
1193                 (float) count_word_ii / (float) num_of_diff_words * 100.0,
1194                 /***(count_word_ii * 100L) / num_of_diff_words,***/
1195                 seconds_left / 60L, seconds_left % 60L);
1196         }
1197         else
1198             fflush (stdout);
1199     }   /* endif for progress dots and msgs */
1200
1201     strncpy (got_word.or_hwordkey, node_word, DtSrMAXWIDTH_HWORD);
1202     got_word.or_hwordkey[DtSrMAXWIDTH_HWORD - 1] = 0;
1203     find_keyword (miker, 0);    /* vista KEYFIND for word rec */
1204     if (db_status == S_NOTFOUND) {      /* this is a NEW word */
1205         got_word.or_hwoffset = 0;
1206         got_word.or_hwfree = 0;
1207         got_word.or_hwaddrs = 0;
1208         fillnew_wordrec (&got_word, 0); /* write (empty) vista word rec */
1209         if (db_status != S_OKAY)
1210             vista_abort (PROGNAME"981");
1211         write_new_word_2_dtbs();        /* write NEW word's d99 entries
1212                                          * and update vista word rec */
1213         return;
1214     }
1215
1216     /* update previously existing word */
1217     read_wordstr (&got_word, 0);        /* read OLD word rec into got_word */
1218     if (db_status == S_OKAY)
1219         write_2_dtbs_addr_file();       /* update OLD word's d99 entries
1220                                          * and update vista word rec */
1221     return;
1222 }       /* fill_data1() */
1223
1224
1225 /************************************************/
1226 /*                                              */
1227 /*              load_into_bintree               */
1228 /*                                              */
1229 /************************************************/
1230 /* Pass 1 function.
1231  * Loads parsed word token or stem token into
1232  * inverted index binary tree along with passed dba.
1233  * Token is allowed to be empty, ie first byte is \0.
1234  * Derived from Efim's original 'teskey_parse()'
1235  * and bin_tree() functions.
1236  * Variables static for speeeeeeed.
1237  */
1238 static void     load_into_bintree (
1239                         char    *parser_token,
1240                         int     token_is_stem,
1241                         DB_ADDR dba)
1242 {
1243     static DtSrINT16    or_maxwordsz;
1244     static char         *cptr;
1245     static int          i;
1246     static TREENODE     **this_link;
1247     static TREENODE     *newnode;
1248     static DBALIST      *newdba;
1249     static char         *tokbuf =       NULL;
1250
1251     if (*parser_token == 0) {
1252         if (debugging & DEBUG_I)
1253             printf (" bintr=<empty> dba=%ld\n", (long)dba);
1254         return;
1255     }
1256
1257     /* Copy token to a buffer.
1258      * Stems have a special prefix character
1259      * to distinguish them from words.
1260      * Also increment total dba word count.
1261      */
1262     if (tokbuf == NULL) {
1263         or_maxwordsz = dblk.dbrec.or_maxwordsz;
1264         tokbuf = austext_malloc ((size_t) or_maxwordsz + 4,
1265             PROGNAME"1152", NULL);
1266     }
1267     if (token_is_stem) {
1268         tokbuf[0] = STEM_CH;
1269         strncpy (tokbuf + 1, parser_token, (size_t)or_maxwordsz);
1270         dbas_word_count[dba]++;
1271     }
1272     else
1273         strncpy (tokbuf, parser_token, (size_t)or_maxwordsz);
1274     tokbuf [or_maxwordsz] = 0;
1275     if (debugging & DEBUG_I)
1276         printf (" bintr='%s' dba=%ld ", displayable(tokbuf), (long)dba);
1277
1278     /* TREE TRAVERSAL.  Search binary tree to find either
1279      * insertion point or identical preexisting token.
1280      */
1281     for (this_link = &root_node; *this_link != NULL; ) {
1282         i = strcmp (tokbuf, (*this_link)->word);
1283
1284         /* If identical word/stem token already exists... */
1285         if (i == 0) {
1286             /* If token appears more than once in current
1287              * document (dba already exists at top of dba list),
1288              * just increment the word count in the list.
1289              */
1290             if ((*this_link)->dba_list->dba == dba)
1291                 (*this_link)->dba_list->w_c++;
1292
1293             /* If this is first appearance of token for this doc
1294              * (dba is not at start of token's dba list),
1295              * insert dba at start of token's dba list.
1296              */
1297             else {
1298                 if ((newdba = malloc (sizeof(DBALIST))) == NULL) {
1299                     printf (catgets (dtsearch_catd, MS_cborodin, 374,
1300                         msg_374), PROGNAME"1150");
1301                     DtSearchExit (26);
1302                 }
1303                 newdba->dba =             dba;
1304                 newdba->w_c =             1;
1305                 newdba->next_dba =        (*this_link)->dba_list;
1306                 (*this_link)->dba_list =  newdba;
1307             }
1308             if (debugging & DEBUG_I)
1309                 printf (" Old %ld=%ld\n",
1310                     (long)((*this_link)->dba_list->dba),
1311                     (long)((*this_link)->dba_list->w_c));
1312             return;     /* done with token */
1313
1314         } /* endif where token was found in binary tree */
1315
1316         /* Increment link ptr by descending to correct subtree */
1317         if (i < 0) {
1318             this_link = &(*this_link)->llink;
1319             if (debugging & DEBUG_I)
1320                 putchar ('L');
1321         }
1322         else {
1323             this_link = &(*this_link)->rlink;
1324             if (debugging & DEBUG_I)
1325                 putchar ('R');
1326         }
1327     } /* end tree traversal */
1328
1329     /* Tree traversal never found a preexisting token node.
1330      * Create a new node and insert it at the point
1331      * indicated by link ptr.
1332      */
1333     newnode = austext_malloc (sizeof(TREENODE) + strlen(tokbuf) + 4,
1334         PROGNAME"1234", NULL);
1335     newnode->llink =    NULL;
1336     newnode->rlink =    NULL;
1337     newnode->word = (char *) (newnode + 1);     /* use mem at end of node */
1338     strcpy (newnode->word, tokbuf);
1339
1340     newdba = austext_malloc (sizeof(DBALIST), PROGNAME"1235", NULL);
1341     newnode->dba_list = newdba;
1342     newdba->dba =       dba;
1343     newdba->w_c =       1;
1344     newdba->next_dba =  NULL;
1345
1346     *this_link =        newnode;
1347     num_of_diff_words++;
1348
1349     if (debugging & DEBUG_I)
1350         printf (" New %ld=%ld\n",
1351             (long)((*this_link)->dba_list->dba),
1352             (long)((*this_link)->dba_list->w_c));
1353     return;
1354 } /* load_into_bintree() */
1355
1356
1357 /**********************************************/
1358 /*                                            */
1359 /*                    MAIN                    */
1360 /*                                            */
1361 /**********************************************/
1362 main (int argc, char **argv)
1363 {
1364     int                 i;
1365     long                word_offset;    /* <-- PARG.offsetp */
1366     long                bytes_in;       /* ftell() */
1367     DtSrINT32           dba_offset;
1368     int                 got_ETX;
1369     char                *cptr, *src;
1370     char                temp_buf[40];
1371     char                db_key [DtSrMAX_DB_KEYSIZE + 2];
1372     int                 oops = FALSE;
1373     register DtSrINT32  cur_byte;
1374     struct tm           *tmptr;
1375     DB_ADDR             dba, temp_dba;
1376     time_t              elapsed;
1377     size_t              mallocsz;
1378     char                *parsebufp, *stembufp;
1379
1380     /******************* INITIALIZE ******************/
1381     setlocale (LC_ALL, "");
1382     dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
1383
1384     aa_argv0 = strdup (argv[0]);
1385     time (&elapsed);
1386     tmptr = localtime (&elapsed);
1387     strftime (buf, sizeof(buf),
1388         catgets (dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
1389         tmptr);
1390     printf (catgets (dtsearch_catd, MS_cborodin, 1, "%s.  Run %s.\n"),
1391         aa_argv0, buf);
1392     austext_exit_last = print_exit_code;
1393     batch_size = BATCH_SIZE;
1394     init_user_interrupt ();
1395     default_hashsize = duprec_hashsize;
1396
1397     memset (&dblk, 0, sizeof(DBLK));
1398
1399     memset (&parg, 0, sizeof(PARG));
1400     parg.dblk =         &dblk;
1401     parg.etxdelim =     ETXDELIM;       /* default, can be changed */
1402     parg.offsetp =      &word_offset;
1403     parg.flags |=       PA_INDEXING;    /* do compounding, if parser can */
1404
1405     /* Read user specified command line arguments */
1406     user_args_processor (argc, argv);
1407
1408     /* Finish init now that we know final values */
1409     inbuf = austext_malloc (inbufsz + 16, PROGNAME"1349", NULL);
1410     temp = austext_malloc (inbufsz + 16, PROGNAME"1285", NULL);
1411     sprintbuffer = austext_malloc (inbufsz + _POSIX_PATH_MAX + 16,
1412         PROGNAME"1286", NULL);
1413     record_addr_word = austext_malloc ((sizeof(DB_ADDR) * batch_size) + 16,
1414         PROGNAME "1133", NULL);
1415
1416     /* Save dicname and path in dblk.  Save full name of d99 file. */
1417     strcpy (dblk.name, dicname);
1418     dblk.path = dicpath;
1419     strcpy (dtbs_addr_file, dicpath);
1420     strcat (dtbs_addr_file, dicname);
1421     strcat (dtbs_addr_file, EXT_DTBS);
1422
1423     /* Open the database */
1424     if (!austext_dopen (dicname, dicpath, NULL, cache_size, &dbrec)) {
1425         fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
1426         DtSearchExit (3);
1427     }
1428     memcpy (&dblk.dbrec, &dbrec, sizeof(DBREC));
1429
1430     /* Load database's parser, stemmer, and linguistic files into dblk. */
1431     if (!load_language (&dblk, NULL)) {
1432         puts (DtSearchGetMessages());
1433         printf (catgets (dtsearch_catd, MS_cborodin, 1097,
1434             "%s Aborting due to errors in loading language files.\n"),
1435             PROGNAME"1097");
1436         DtSearchExit(3);
1437     }
1438
1439     RECFRST (PROGNAME "1067", OR_OBJREC, 0);
1440     CRGET (PROGNAME "1069", &dba, 0);  /* byte swap already done in vista */
1441
1442     or_reccount = dbrec.or_reccount;    /* DtSrINT32 */
1443     or_recslots = dbrec.or_recslots;    /* promoted to DtSrINT32 */
1444     or_maxdba = dbrec.or_maxdba;        /* DtSrINT32 lim of dbas_word_count */
1445     bit_vector_size = ((or_maxdba / or_recslots + 1) >> 3) + 1; /* DtSrINT32 */
1446     dba_offset = or_recslots - (dba & 0x00FFFFFF);      /* DtSrINT32 */
1447
1448     if (debugging)
1449         printf (PROGNAME"1286 "
1450             "realnumrec=%ld recslots=%ld bitvecsz=%ld"
1451             " dbaoffset=%d maxdba=%ld\n",
1452             (long)or_reccount, (long)or_recslots, (long)bit_vector_size,
1453             (int)dba_offset, (long)or_maxdba);
1454
1455     /* Allocate memory space for the arrays.
1456      * dbas_bits_batch = 'bit vector', one bit for every possible rec#.
1457      *   the 1 bits = only the dba's that are in this fzk batch.
1458      * word_addrs_ii = fread buffer for d99 file.
1459      * dbas_word_count = summing bkts for word count statistics.
1460      */
1461     dbas_bits_batch = (char *) austext_malloc ((size_t)bit_vector_size + 48,
1462         PROGNAME "1150", NULL);
1463     word_addrs_ii = (DB_ADDR *) austext_malloc (
1464         sizeof (DB_ADDR) * (or_reccount + 1) + 48,
1465         PROGNAME "1152", NULL);
1466     mallocsz = sizeof(DtSrINT32) * (or_maxdba + 1) + 48;
1467     dbas_word_count = (DtSrINT32 *) austext_malloc (mallocsz,
1468         PROGNAME "1154", NULL);
1469     memset (dbas_bits_batch, 0, (size_t)bit_vector_size + 48);
1470     memset (dbas_word_count, 0, mallocsz);
1471
1472     root_node = NULL;
1473
1474    /* Open the d99 file that contains database addresses.
1475     * If the file doesn't exist, it means the database
1476     * for keyword search is empty - open a new file.
1477     */
1478     if ((dtbs_addr_fp = fopen (dtbs_addr_file, "r+b")) == NULL) {
1479         dtbs_addr_fp = fopen (dtbs_addr_file, "w+b");
1480         check_existing_addrs = FALSE;
1481         new_dtbs_file = TRUE;
1482         if (dtbs_addr_fp == NULL) {
1483             /* msg 1068 used multiple places */
1484             printf (catgets (dtsearch_catd, MS_cborodin, 1068,
1485                 "%s Can't open new inverted index file '%s': %s\n"),
1486                 PROGNAME"1068", dtbs_addr_file, strerror(errno));
1487             DtSearchExit (13);
1488         }
1489         /* write New Header Information to a file */
1490         init_header (dtbs_addr_fp, &fl_hdr);
1491     }
1492     else {
1493         /* read Header Information from d99 file */
1494         if (!fread_d99_header (&fl_hdr, dtbs_addr_fp)) {
1495             /* msg 1068 used multiple places */
1496             printf (catgets (dtsearch_catd, MS_cborodin, 1068,
1497                 "%s Can't read header data for '%s': %s\n"),
1498                 PROGNAME"1422", dtbs_addr_file, strerror(errno));
1499             DtSearchExit (13);
1500         }
1501     }
1502
1503     /* open input .fzk file */
1504     src = getcwd (sprintbuffer, _POSIX_PATH_MAX);
1505     if (!src && debugging)
1506         printf (PROGNAME"1336 Can't getcwd: %s.\n", strerror(errno));
1507     if (!src)
1508         src = getenv ("PWD");
1509     printf (catgets (dtsearch_catd, MS_misc, 24,
1510         "%s: current working directory = '%s', .fzk file = '%s'\n"),
1511         aa_argv0,
1512         (src) ? src : catgets (dtsearch_catd, MS_misc, 6, "<unknown>"),
1513         fname_input);
1514     if ((instream = fopen (fname_input, "rt")) == NULL) {
1515 BAD_INPUT_FILE:
1516         printf (catgets (dtsearch_catd, MS_cborodin, 1083,
1517             "%s Can't read input file '%s': %s\n"),
1518             PROGNAME"1083", fname_input, strerror(errno));
1519         DtSearchExit (14);
1520     }
1521     if (fstat (fileno (instream), &fstat_input) == -1)
1522         goto BAD_INPUT_FILE;
1523     parg.ftext = instream;      /* for readchar_ftext(), discard_to_ETX() */
1524
1525     time (&totalstart);         /* for total elapsed time */
1526     timestart = totalstart;     /* for Pass 1 elapsed time */
1527
1528     /*------------ PASS 1:  ------------
1529      * Main Read Loop.  For each text record in input file,
1530      * parse and stem words, store them into binary tree
1531      * inverted index in memory.
1532      * The first few lines are database administrative values.
1533      * They are presumed ascii and read with fgets() as
1534      * 'lines' terminated with \n.  The text of the document
1535      * itself is presumed to be in the appropriate database
1536      * 'language', so it is *not* presumed to be lines
1537      * terminated with \n.  The document text is read by
1538      * the language's parser() a 'word' at a time, which
1539      * ultimately means a byte at a time.
1540      */
1541     printf (catgets (dtsearch_catd, MS_cborodin, 1108,
1542         "%s: Beginning Pass 1, reading records from '%s'.\n"
1543         "   Each dot = %d records.\n"),
1544         aa_argv0, fname_input, recs_per_dot);
1545     dotcount = 0;
1546
1547     while (!feof(instream)) {
1548
1549         /* 1. Read and discard the FZKEY line.
1550          * 2. Read and discard the ABSTRACT line.
1551          * 3. Read the UNIQUE KEY for the record.
1552          *    Do some record initialization steps here.
1553          * 4. Read and discard the DATE line.
1554          * 5. Let the parser read and parse rest of record, ie doc text...
1555          */
1556
1557         /*----- READ LINE #1, fzkey -----*/
1558         if (fgets (inbuf, inbufsz, instream) == NULL)
1559             break;
1560         inbuf [inbufsz] = 0;    /* just to be sure */
1561
1562         if (shutdown_now) {
1563             printf (catgets (dtsearch_catd, MS_cborodin, 164,
1564                 "\n%s: %s Abort due to signal %d.  Database %s\n"
1565                 "  possibly corrupted.  Restore backup database.\n"),
1566                 aa_argv0, PROGNAME"1299", shutdown_now, dicname);
1567             DtSearchExit (11);
1568         }
1569
1570         /* Silently skip null records just like dtsrload */
1571         if (strcmp (inbuf, parg.etxdelim) == 0)
1572             continue;
1573
1574         record_count++;
1575
1576         /*----- READ LINE #2, abstract -----*/
1577         if (fgets (inbuf, inbufsz, instream) == NULL) {
1578 INVALID_FZK_FORMAT:
1579             printf (catgets (dtsearch_catd, MS_cborodin, 1129,
1580                 "%s: %s Invalid .fzk file format.\n"),
1581                 fname_input, PROGNAME"1129");
1582             DtSearchExit (22);
1583         }
1584         inbuf[inbufsz] = 0;     /* just to be sure */
1585
1586         /*--- READ LINE #3, unique database key ---*/
1587         if (fgets (inbuf, inbufsz, instream) == NULL)
1588             goto INVALID_FZK_FORMAT;
1589         inbuf[inbufsz] = 0;     /* just to be sure */
1590
1591         if ((cptr = strtok (inbuf, " \t\n")) == NULL)
1592             goto INVALID_FZK_FORMAT;
1593
1594         /* If necessary, discard long keys exactly like cravel */
1595         if (strlen (cptr) >= DtSrMAX_DB_KEYSIZE) {
1596             printf (catgets (dtsearch_catd, MS_cborodin, 659,
1597                 "\n%s: %s Discarding record, key too long:\n  '%s'.\n"),
1598                 aa_argv0, PROGNAME"659", cptr);
1599             discard_to_ETX (&parg);
1600             continue;
1601         }
1602         strcpy (db_key, cptr);
1603
1604         /* Skip duplicate record ids in same order as dtsrload */
1605         i = is_duprec (db_key);
1606         if (i == 2) {   /* out of memory */
1607             printf (catgets (dtsearch_catd, MS_cborodin, 374, msg_374),
1608                     PROGNAME"1317");
1609             DtSearchExit (57);
1610         }
1611         else if (i == 1) {      /* duplicate record id */
1612             duplicate_recids++;
1613             if (dotcount > 0)
1614                     putchar ('\n');
1615             printf (catgets (dtsearch_catd, MS_cborodin, 1402,
1616                 "%s: Discarded duplicate rec #%lu '%s'.\n"),
1617                 aa_argv0, record_count, db_key);
1618             discard_to_ETX (&parg);
1619             continue;
1620         }
1621
1622         /****** FFFFFFFFFFFFFFFFFFFFF **********/
1623         /* Convert database address (slot #) to 'record number',
1624          * what dba would have been if all records took up
1625          * only one slot and there were no dbrec at top of file.
1626          * Record numbers on d99, like dba's, start at #1,
1627          * but rec numbers here (in bit vector) start at #0.
1628          */
1629         KEYFIND (PROGNAME "222", OR_OBJKEY, (char *) db_key, 0);
1630         if (db_status != S_OKAY) {
1631             normal_retncode = 1;        /* = 'warning' */
1632             if (dotcount > 0)
1633                 putchar ('\n');
1634             printf (catgets (dtsearch_catd, MS_cborodin, 1168,
1635                 "%s: %s Discarded '%s', key not in database.\n"),
1636                 aa_argv0, PROGNAME"1168", displayable(db_key));
1637             discard_to_ETX (&parg);
1638             continue;
1639         }
1640
1641         CRGET (PROGNAME "224", &temp_dba, 0); /* vista already byte swapped */
1642         temp_dba &= 0x00FFFFFF; /* = slot# */
1643         dba = (temp_dba + dba_offset) / or_recslots; /* = rec#, base 1 */
1644         /*
1645          * Don't change this 'dba'!--eventually it goes
1646          * into d99 in this exact format!  It will also
1647          * be used as an index into dbas_word_count[] in
1648          * load_into_bintree() so do a sanity check
1649          * to make sure that it hasn't exceeded the size
1650          * of that array.  (The count increments have been
1651          * reported as 'uninitialized memory reads'
1652          * by a debugger).  This might happen for example
1653          * if user failed to run dtsrload before dtsrindex?
1654          */
1655         if (dba < 1  ||  dba > or_maxdba) {
1656             printf ( catgets(dtsearch_catd, MS_cborodin, 21,
1657                 "\n%s '%s' record overflows word counter array.\n"
1658                 "Record number %ld > maxdba %ld, dba=%ld, "
1659                 "recslots=%ld, offs=%d.\n") ,
1660                 PROGNAME"1526", displayable(db_key),
1661                 (long)dba, (long)or_maxdba, (long)temp_dba,
1662                 (long)or_recslots, (int)dba_offset);
1663             DtSearchExit (68);
1664         }
1665         temp_dba = dba - 1;     /* = rec# starting at 0 */
1666         cur_byte = temp_dba >> 3;       /* bits to bytes: div by 8 */
1667         if (cur_byte >= bit_vector_size) {
1668             printf ( catgets(dtsearch_catd, MS_cborodin, 22,
1669                 "\n%s '%s' record in database (dba=%ld)\n"
1670                 "  overflows bitvector allocation (%ld >= %ld).\n") ,
1671                 PROGNAME"1475", displayable(db_key), (long)dba,
1672                 (long)cur_byte, (long)bit_vector_size);
1673             DtSearchExit (69);
1674         }
1675         dbas_bits_batch[cur_byte] |= 1 << (temp_dba % 8);
1676
1677         /* Print occasional progress dots and msgs */
1678         if (!(record_count % recs_per_dot)) {
1679             putchar ('.');
1680             dotcount++;
1681             if (!(dotcount % 10))
1682                 putchar (' ');
1683             if (dotcount >= 50) {
1684                 dotcount = 0;
1685                 bytes_in = ftell (instream);
1686                 seconds_left = (unsigned long)
1687                     (((float) fstat_input.st_size /
1688                     (float) bytes_in - 1.) *
1689                     (float) (time (NULL) - timestart));
1690                 printf (catgets (dtsearch_catd, MS_cborodin, 1190,
1691                     "\n%s: Rec #%lu, %.0f%% done.  "
1692                     "Est %lum %02lus to end Pass 1.\n"),
1693                     aa_argv0,
1694                     record_count,
1695                     (float) bytes_in / (float) fstat_input.st_size * 100.0,
1696                     seconds_left / 60UL,
1697                     seconds_left % 60UL);
1698             }
1699             fflush (stdout);
1700         }
1701
1702         /*----- READ LINE #4, date -----*/
1703         if (fgets (inbuf, inbufsz, instream) == NULL)
1704             goto INVALID_FZK_FORMAT;
1705         inbuf[inbufsz] = 0;     /* just to be sure */
1706
1707         /* PARSE LOOP FOR CURRENT TEXT BLOCK.
1708          * We must be in the middle of a record ('lines' #5 and beyond).
1709          * From here to ETX, which is either the record delimiter string
1710          * or the end of file, read the file a 'word' at a time
1711          * using the parse() function for the language specified
1712          * for the database.
1713          * Load_into_bintree() stores each token into
1714          * inverted index binary tree.
1715          * Note: dba here MUST still be rec#, base 1.
1716          * It's stored as is by load_into_bintree(),
1717          * and will be moved as is into d99 file in Pass 2.
1718          */
1719         if (debugging & DEBUG_P)
1720             printf ("\nRecord #%lu '%s'\n"
1721                     "Offset Word----               Stem----\n",
1722                 record_count, db_key);
1723         for (   cptr = dblk.parser (&parg);
1724                 cptr;
1725                 cptr = dblk.parser (NULL)) {
1726
1727             if (debugging & DEBUG_P) {
1728                 printf ("%6ld %s %n", word_offset, cptr, &i);
1729                 if (!(debugging & DEBUG_I))
1730                     while (i++ < 30)
1731                         putchar (' ');
1732             }
1733             load_into_bintree (cptr, FALSE, dba);
1734             cptr = dblk.stemmer (cptr, &dblk);
1735             if (debugging & DEBUG_P) {
1736                 printf ("%s\n", cptr);
1737                 fflush (stdout);
1738             }
1739             load_into_bintree (cptr, TRUE, dba);
1740         }
1741
1742     } /* end of PASS 1 Main read loop */
1743
1744     elapsed = time(NULL) - timestart;
1745     if (dotcount > 0) {
1746         putchar ('\n');
1747         dotcount = 0;
1748     }
1749     if (duplicate_recids > 0L) {
1750         normal_retncode = 1;    /* 'warning' */
1751         sprintf (buf, catgets (dtsearch_catd, MS_cborodin, 40,
1752             "Ignored %ld duplicate records"),
1753             duplicate_recids);
1754     }
1755     else
1756         strcpy (buf, catgets (dtsearch_catd, MS_cborodin, 41,
1757             "No duplicate records found"));
1758     printf (catgets (dtsearch_catd, MS_cborodin, 1225,
1759         "%s: Pass 1 completed in %lum %lus, read %lu records.\n"
1760         "  %s, parsed %lu words.\n"),
1761         aa_argv0, elapsed / 60L, elapsed % 60L, record_count,
1762         buf, num_of_diff_words);
1763     if (record_count > batch_size) {
1764         printf (catgets (dtsearch_catd, MS_cborodin, 33,
1765             "\n%s Number of incoming records exceeded %d.\n"
1766             "  This will usually result in 'Out of Paging Space' "
1767             "error in Pass 2\n"
1768             "  and corruption of database.  Either split the incoming file to\n"
1769             "  reduce record count or use the -b option, and rerun.\n"),
1770             PROGNAME"33", (int)batch_size);
1771         DtSearchExit (33);
1772     }
1773
1774     /*----------------- PASS 2:  -----------------
1775      * Traverse completed binary tree and write it to d99 file.
1776      */
1777     printf (catgets (dtsearch_catd, MS_cborodin, 1233,
1778         "%s: Beginning Pass 2: batch index traversal and database update.\n"
1779         "  Each dot = %d words.\n"),
1780         aa_argv0, words_per_dot);
1781     dotcount = 0;
1782     time (&timestart);
1783     traverse_tree ();   /* actual Pass 2 */
1784     if (dotcount) {
1785         putchar ('\n');
1786         dotcount = 0;
1787     }
1788
1789     /* Write header information to the d99 file */
1790     if (!fwrite_d99_header (&fl_hdr, dtbs_addr_fp)) {
1791         printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
1792             PROGNAME"1723", strerror(errno));
1793         DtSearchExit (13);
1794     }
1795     d_close ();
1796     fclose (dtbs_addr_fp);
1797
1798     elapsed = time (NULL) - timestart;
1799     printf (catgets (dtsearch_catd, MS_cborodin, 1246,
1800         "%s: Pass 2 completed in %lum %lus, updated %lu words.\n"),
1801         aa_argv0, elapsed / 60L, elapsed % 60L, count_word_ii);
1802     if (normal_retncode == 1)
1803         printf (catgets (dtsearch_catd, MS_cborodin, 2,
1804             "%s: Warnings were detected.\n"), aa_argv0);
1805     DtSearchExit (normal_retncode);
1806
1807 } /* main() */
1808
1809 /*************************** DTSRINDEX.C ****************************/
1810