2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these libraries and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
24 * COMPONENT_NAME: austext
26 * FUNCTIONS: UPDATE_MAXDBA
42 * (C) COPYRIGHT International Business Machines Corp. 1993,1995
44 * Licensed Materials - Property of IBM
45 * US Government Users Restricted Rights - Use, duplication or
46 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
48 /*********************** DTSRLOAD.C ***************************
49 * $XConsortium: dtsrload.c /main/8 1996/09/23 21:04:17 cde-ibm $
51 * Formerly dtsrload.c was cravel.c.
52 * Input: Standard AusText .fzk file.
53 * Function: Adds to or updates corresponding DtSearch-
54 * AusText database records.
57 * Revision 2.7 1996/03/25 18:54:44 miker
58 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
60 * Revision 2.6 1996/03/13 22:53:47 miker
61 * Changed char to UCHAR several places.
63 * Revision 2.5 1996/02/01 18:46:02 miker
64 * AusText 2.1.11, DtSearch 0.3. Changed document text reads from fgets
65 * to new single character reading functions to match dtsrindex.
66 * Added -t etx delimiter string command line arg.
68 * Revision 2.4 1995/12/01 16:18:22 miker
69 * Added fflush for stdout and stderr for clean printing to AusBuild log.
71 * Revision 2.3 1995/10/26 17:48:45 miker
72 * Fixed duplicate msgs catopen().
74 * Revision 2.2 1995/10/25 18:39:52 miker
77 * Revision 2.1 1995/09/22 19:31:48 miker
78 * Freeze DtSearch 0.1, AusText 2.1.8
80 * Revision 1.3 1995/09/20 22:52:47 miker
81 * Fixed bug: DtSrFlNOTAVAIL was being set in wrong obj field.
83 * Revision 1.2 1995/09/19 21:59:53 miker
84 * Set DtSrFlNOTAVAIL when appropriate for doc.
85 * If DtSearch, use DtSrVERSION instead of AUSAPI_VERSION in banner.
87 * Revision 1.1 1995/08/31 20:52:34 miker
90 * Revision 1.12 1995/06/08 19:42:44 miker
91 * 2.1.5f: Removed -w option. It no longer had an effect.
104 #include <sys/types.h>
105 #include <netinet/in.h>
// Program-wide constants.
//   PROGNAME       - text pasted in front of the numeric ids used in diagnostic
//                    messages and in the location strings passed to database calls.
//   RECS_PER_DOT   - default number of input records per progress dot; the -p
//                    option falls back to it when its argument is not positive.
//   TERMINATE_LINE - prints a newline when dotcount is greater than zero.
//                    NOTE(review): expands to a bare if statement, so it would
//                    capture a following else; its call sites are not visible
//                    in this listing - confirm before restructuring.
//   EXIT_NORMAL, EXIT_WARNING, EXIT_VANISH - values stored in normal_exitcode
//                    and handed to DtSearchExit at the end of main.
107 #define PROGNAME "DTSRLOAD"
108 #define RECS_PER_DOT 20
109 #define TERMINATE_LINE if (dotcount>0) { putchar('\n'); }
110 #define EXIT_NORMAL 0 /* perfect return code */
111 #define EXIT_WARNING 1 /* functioned ok, but with warnings */
112 #define EXIT_VANISH 3 /* input file effectively empty */
116 /*--------------- EXTERNS ------------------*/
// Huffman encode-table interface defined outside this file.
//   gen_vec       - called from main with fname_huffcode_tab the first time
//                   document text is about to be compressed.
//   gen_vec_hufid - encode table id. main compares it with dbrec_hufid right
//                   after the gen_vec call, and write_dbrec stores it in the
//                   OR_HUFID field when blobs are used and dbrec_hufid is -1.
//                   NOTE(review): presumably assigned by gen_vec - confirm.
119 extern void gen_vec (char *fname_huffcode_tab);
120 extern long gen_vec_hufid;
122 /*--------------- GLOBALS ------------------*/
// File-scope state shared by the functions below.
//   abstrbuf            - abstract text of the current input record; allocated
//                         in main as or_abstrsz + 16 bytes when abstracts are used.
//   blobs_are_used      - set by read_dbrec from dbrec.or_dbaccess.
//   created_reccount, updated_reccount, duplicate_recids, input_reccount
//                       - statistics reported by print_progress.
//   dbrec_hufid         - copy of dbrec.or_hufid taken by read_dbrec; the value
//                         -1 is treated as no table id saved yet.
//   default_hashsize    - copy of duprec_hashsize taken in main before argument
//                         parsing; shown in the usage text.
//   dicname, dicpath    - database name and path prefix from segregate_dicname.
//   dotcount            - tested by TERMINATE_LINE and by the progress-dot code.
//   fname_huffcode_tab  - encode table file name; preset to FNAME_HUFFCODE_TAB
//                         in main and replaceable through append_ext.
//   fname_input, infile, fstat_input
//                       - input file name, stream, and fstat result (its
//                         st_size drives the progress percentage).
//   need_final_progress_msg - cleared by print_progress, set again in main.
//   normal_exitcode     - exit status finally passed to DtSearchExit.
//   objdba              - database address of the current object record.
//   recs_per_dot        - records per progress dot, adjustable with -p.
//   starttime           - run start time used for elapsed-time reporting.
//   sumblobs, sumlines  - totals printed in the compression trace messages.
//   dbrec, objrec, miscrec, blobrec - record buffers for database reads/writes.
// NOTE(review): maxdba, system_reccount, parg, dblk, objsize and starttimeobjd
// are used below but have no declaration visible in this listing; no use of
// sprintbuf is visible either.
123 static char *abstrbuf = NULL;
124 static int blobs_are_used; /* boolean */
125 static long created_reccount = 0L;
126 static long dbrec_hufid = 1L;
127 unsigned long default_hashsize;
128 int debug_mode = FALSE;
129 int debug_encode = FALSE;
130 static char dicname[10]; /* 1 - 8 char database name */
131 char dicpath[_POSIX_PATH_MAX];
132 static int dotcount = 0;
133 static long duplicate_recids = 0L;
134 char fname_huffcode_tab[_POSIX_PATH_MAX];
135 char fname_input[_POSIX_PATH_MAX];
136 struct stat fstat_input;
137 static FILE *infile = NULL;
138 static long input_reccount = 0L;
141 static int need_final_progress_msg = TRUE;
142 static int normal_exitcode = EXIT_NORMAL;
147 static DB_ADDR objdba = NULL_DBA;
149 static int recs_per_dot = RECS_PER_DOT;
150 static time_t starttime = 0L;
153 char sprintbuf[1024 + _POSIX_PATH_MAX];
154 static int sumblobs = 0;
155 static int sumlines = 0;
158 static long updated_reccount = 0L;
160 struct or_dbrec dbrec;
161 struct or_objrec objrec;
162 struct or_miscrec miscrec;
163 struct or_blobrec blobrec;
165 /********************************************************/
169 /********************************************************/
/* UPDATE_MAXDBA(dba): ensures the global variable 'maxdba' always holds the
 * highest slot number seen so far.  The slot number is the low-order
 * 24 bits of the database address 'dba'.
 *
 * - 'dba' is fully parenthesized, so compound arguments (for example
 *   'a | b') are masked as a whole instead of binding to the '&' first.
 * - 'dba' is evaluated exactly once, so arguments with side effects are safe.
 * - The do { } while (0) wrapper makes the macro behave as one statement,
 *   e.g. in front of an 'else'.  Invoke it with a trailing semicolon.
 */
#define UPDATE_MAXDBA(dba) \
    do { \
        long update_maxdba_slot_ = (long) ((dba) & 0xffffff); \
        if (update_maxdba_slot_ > maxdba) \
            maxdba = update_maxdba_slot_; \
    } while (0)
174 /********************************************************/
176 /* segregate_dicname */
178 /********************************************************/
179 /* Separates dictionary name from pathname and loads
180 * them into the globals 'dicname' and 'dicpath'.
181 * Returns TRUE if dicname is valid, else returns FALSE.
// segregate_dicname: split a -d argument into path prefix and database name.
//   string - combined path and database name. It is copied into the global
//            dicpath with a bounded copy that is then explicitly terminated.
// Visible steps: scan backward from the end of dicpath looking for the first
// non-alphanumeric character, copy the trailing name into the global dicname,
// then cut dicpath at the start of that name so only the prefix remains.
// The name validity test and the return statements are not visible in this
// listing.
// NOTE(review): dicname holds 10 bytes and is filled with an unbounded copy;
// confirm the hidden validity test rejects names longer than 8 characters
// before the copy is reached.
// NOTE(review): isalnum receives a plain char; presumably it should be cast
// to unsigned char for bytes above 127 - confirm.
183 static int segregate_dicname (char *string)
188 strncpy (dicpath, string, sizeof (dicpath));
189 dicpath[sizeof (dicpath) - 1] = 0;
191 /* Set 'ptr' to just the 8 char dictionary name by moving
192 * it backwards until first non-alphanumeric character
193 * (such as a ":" in the dos drive id or a slash between directories),
194 * or to the beginning of string.
196 for (ptr = dicpath + strlen (dicpath) - 1; ptr >= dicpath; ptr--)
197 if (!isalnum (*ptr)) {
204 /* test for valid dictionary name */
209 strcpy (dicname, ptr);
210 *ptr = 0; /* truncate dicname off of full path/dicname */
212 } /* segregate_dicname() */
215 /********************************************************/
217 /* user_args_processor */
219 /********************************************************/
220 /* handles command line arguments for 'main' */
// user_args_processor: parse the command line and set the matching globals.
//   argc, argv - as received by main; the program name is skipped first.
// Arguments are consumed while they begin with a minus or plus sign. The
// option letter is lower-cased before dispatch. Handling visible here:
//   -d  database name with optional path, split by segregate_dicname; an
//       invalid name produces message 246.
//   -t  end-of-text delimiter; a copy is allocated into parg.etxdelim and each
//       backslash-n pair in the argument is replaced by a real linefeed.
//   -p  records per progress dot; a value that is not positive is replaced by
//       RECS_PER_DOT and reported with message 582.
//   -e  encode table file name, built with append_ext into fname_huffcode_tab
//       (the case label itself is not visible; the pairing follows the usage text).
//   -h  duplicate-id hash table size stored in duprec_hashsize; zero disables
//       duplicate checking and prints message 13.
//   -c  sets system_reccount to -1, which makes read_dbrec count the records.
//   anything else produces message 14.
// An argument beginning with -russell is tested before the normal dispatch;
// its effect is not visible in this listing.
// After the options, an input file name is required (message 15 otherwise)
// and is expanded into fname_input with the EXT_FZKEY extension; a database
// name is required as well (message 16 otherwise).
// NOTE(review): the conditions that print the usage text and the actions
// taken after each error message are not visible here.
// NOTE(review): no check of the malloc result for -t is visible, and atoi and
// atol give no way to detect malformed numbers - confirm this is acceptable.
221 static void user_args_processor (int argc, char **argv)
228 printf (catgets (dtsearch_catd, MS_cravel, 1,
229 "\nUSAGE: %s -d<dbname> [options] infile\n"
230 " Listed default file name extensions can be overridden.\n"
231 " -d<dbname> 1 - 8 char database name, incl optional path prefix.\n"
232 " File name extensions automatically appended.\n"
233 " -t<etxstr> End of text doc delimiter string. Default '\\f\\n'.\n"
234 " -c Initialize database record count by counting records.\n"
235 " -p<N> Print a progress dot every <N> records (default %d).\n"
236 " -h<N> Change duplicate rec id hash table size from %ld to <N>.\n"
237 " -h0 means there are no duplicates, don't check for them.\n"
238 " -e<path> Path-filename of huffman encode table (default %s).\n"
239 " <infile> Input [path]file name. Default extension %s.\n"
242 RECS_PER_DOT, default_hashsize,
243 FNAME_HUFFCODE_TAB, EXT_FZKEY);
247 /* Each pass grabs new parm of "-xxx" format */
248 for (argc--, argv++; argc > 0 && ((*argv)[0] == '-' || (*argv)[0] == '+');
252 if (strncmp (argptr, "-russell", 8) == 0) {
254 if (argptr[8] == '2')
259 argptr[1] = tolower (argptr[1]);
261 case 'd': /* (D)ictionary */
262 /* May include both dicname and dicpath */
263 if (!segregate_dicname (argptr + 2)) {
264 printf (catgets (dtsearch_catd, MS_cravel, 246,
265 "\n%s '%s' is invalid path/dictionary name.\n"),
271 case 't': /* ETX delimiter string */
272 /* Replace any "\n" string with real linefeed */
273 targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
276 if (src[0] == '\\' && src[1] == 'n') {
287 if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
288 recs_per_dot = RECS_PER_DOT;
289 printf (catgets (dtsearch_catd, MS_cravel, 582,
290 "%sIgnored invalid progress dot argument '%s'.\n"),
291 PROGNAME "582 ", argptr);
296 append_ext (fname_huffcode_tab, sizeof (fname_huffcode_tab),
297 argptr + 2, EXT_HUFFCODE);
301 duprec_hashsize = atol (argptr + 2);
302 if (duprec_hashsize == 0UL)
303 printf (catgets (dtsearch_catd, MS_cravel, 13,
304 "%s Duplicate record id checking disabled.\n"),
308 case 'c': /* force correct initial reccount by counting
310 system_reccount = -1;
315 printf (catgets (dtsearch_catd, MS_cravel, 14,
316 "\n%s Unknown command line argument '%s'.\n"),
319 } /* endwhile for cmd line '-'processing */
321 /* validate input file name */
323 puts (catgets (dtsearch_catd, MS_cravel, 15,
324 "\nMissing required input file name.\a"));
328 append_ext (fname_input, sizeof (fname_input), argv[0], EXT_FZKEY);
330 /* check for missing database name */
331 if (dicname[0] == 0) {
332 puts (catgets (dtsearch_catd, MS_cravel, 16,
333 "\nNo database name specified (-d argument).\a"));
337 } /* user_args_processor() */
340 /****************************************/
342 /* count_all_records */
344 /****************************************/
345 /* Initializes system_reccount and maxdba by
346 * actually counting all records in database.
347 * Must be called after dbrec has been read to ensure
348 * maxdba accounts for last miscrec slot number.
// count_all_records: rebuild the record count and maxdba by walking every key.
// Announces the scan with message 17, then iterates the OR_OBJKEY index from
// the first key to the last. Each key is read into keybuf; a failed key read
// aborts through vista_abort. Keys whose first byte is below 32 are skipped.
// For every other key the current record address is fetched into objdba and
// folded into maxdba with UPDATE_MAXDBA. After the scan, dbrec.or_recslots is
// added to maxdba to cover the misc record slots of the last record, which is
// why dbrec must already have been read.
// NOTE(review): the statement that increments system_reccount is not visible
// in this listing; presumably it sits in the same branch as the CRGET call.
// NOTE(review): keybuf is plain char, so on platforms where char is signed a
// key starting with a byte above 127 compares below 32 and is skipped -
// confirm whether that is intended.
350 static void count_all_records (void)
352 char keybuf[DtSrMAX_DB_KEYSIZE + 4];
354 printf (catgets (dtsearch_catd, MS_cravel, 17,
355 "%s Initializing total record count "
356 "in database by actually counting...\n"),
360 KEYFRST (PROGNAME "286", OR_OBJKEY, 0);
361 while (db_status == S_OKAY) {
362 KEYREAD (PROGNAME "288", keybuf);
363 if (db_status != S_OKAY)
364 vista_abort (PROGNAME "288");
365 /* don't count records beginning with ctrl char */
366 if (keybuf[0] >= 32) {
368 CRGET (PROGNAME "251", &objdba, 0);
369 UPDATE_MAXDBA (objdba);
371 KEYNEXT (PROGNAME "291", OR_OBJKEY, 0);
373 /* account for last record's misc record slots */
374 maxdba += dbrec.or_recslots;
376 } /* count_all_records() */
379 /****************************************/
383 /****************************************/
384 /* Read the database's dbrec and load global variables
385 * system_reccount and maxdba with current values from db.
// read_dbrec: load the database configuration record into the global dbrec
// and derive the run-time settings that depend on it.
// Visible steps:
//   1. Position on the first OR_DBREC record; if none exists, print message 13
//      (database probably not initialized).
//   2. Read the record into dbrec (vista_abort on failure) and convert it to
//      host byte order with swab_dbrec.
//   3. Print a field dump of the record (the condition guarding the dump is
//      not visible in this listing).
//   4. Save dbrec.or_hufid in dbrec_hufid.
//   5. Check dbrec.or_version against SCHEMA_VERSION; print message 14 when
//      they are incompatible.
//   6. Set blobs_are_used from dbrec.or_dbaccess. For the three blob access
//      types the ORC_COMPBLOB flag must be set, otherwise message 717 is printed.
//   7. If system_reccount is -1 (set by the -c option) call count_all_records;
//      the assignments from dbrec.or_reccount and dbrec.or_maxdba that follow
//      are presumably the else branch - the else line is not visible.
//   8. Print the schema version, record count and last slot.
// NOTE(review): the exit or return statements after the error messages are
// not visible here.
387 static void read_dbrec (void)
389 RECFRST (PROGNAME "285", OR_DBREC, 0); /* seqtl retrieval */
390 if (db_status != S_OKAY) {
391 printf (catgets (dtsearch_catd, MS_misc, 13,
392 "%sNo DB record in database '%s'.\n"
393 " The usual cause is failure to initialize "
394 "the database (run initausd).\n"),
395 PROGNAME"296 ", dicname);
398 RECREAD (PROGNAME "302", &dbrec, 0);
399 if (db_status != S_OKAY)
400 vista_abort (PROGNAME "303");
401 swab_dbrec (&dbrec, NTOH);
405 " DBREC: reccount=%ld maxdba=%ld vers='%s' dbacc=%d\n"
406 " fzkeysz=%d abstrsz=%d maxwordsz=%d otype=%d lang=%d\n"
407 " hufid=%ld flags=x%x compflags=x%x uflags=x%lx sec=x%lx\n"
408 ,(long)dbrec.or_reccount
409 ,(long)dbrec.or_maxdba
411 ,(int)dbrec.or_dbaccess
412 ,(int)dbrec.or_fzkeysz
413 ,(int)dbrec.or_abstrsz
414 ,(int)dbrec.or_maxwordsz
415 ,(int)dbrec.or_dbotype
416 ,(int)dbrec.or_language
417 ,(long)dbrec.or_hufid
418 ,(int)dbrec.or_dbflags
419 ,(int)dbrec.or_compflags
420 ,(long)dbrec.or_dbuflags
421 ,(long)dbrec.or_dbsecmask
425 dbrec_hufid = dbrec.or_hufid;
427 /* Confirm compatible program-database version numbers */
428 if (!is_compatible_version (dbrec.or_version, SCHEMA_VERSION)) {
429 printf (catgets(dtsearch_catd, MS_misc, 14,
430 "%s Program schema version '%s' incompatible with "
431 "database '%s' version '%s'.\n") ,
432 PROGNAME"245", SCHEMA_VERSION, dicname, dbrec.or_version);
436 /* If blobs are specified for the database,
437 * they must be compressed blobs.
439 switch (dbrec.or_dbaccess) {
440 case ORA_VARIES: /* use of blobs determined obj by obj */
441 case ORA_BLOB: /* objects stored directly in blobs */
442 case ORA_REFBLOB: /* refs to objects stored in blobs */
443 blobs_are_used = TRUE;
444 if (!(dbrec.or_compflags & ORC_COMPBLOB)) {
445 /* = don't compress blobs */
446 printf (catgets (dtsearch_catd, MS_cravel, 717,
447 "%s Aborting: Uncompressed blobs not yet supported.\n"),
453 blobs_are_used = FALSE;
457 /* Initialize global variable maxdba, which records largest slot number.
458 * If requested, init tot reccount by actually counting records.
460 if (system_reccount == -1)
461 count_all_records ();
463 system_reccount = dbrec.or_reccount;
464 maxdba = dbrec.or_maxdba;
467 printf (catgets (dtsearch_catd, MS_cravel, 18,
468 "%s: '%s' schema ver = %s, rec count = %ld, last slot = %ld.\n"),
469 aa_argv0, dicname, dbrec.or_version,
470 (long)system_reccount, (long)maxdba);
475 /****************************************/
479 /****************************************/
480 /* Write the database's updated reccount and maxdba fields */
// write_dbrec: store the updated record count and highest slot number back
// into the database configuration record.
// Positions on the first OR_DBREC record (vista_abort on failure), then writes
// system_reccount to OR_RECCOUNT and maxdba to OR_MAXDBA, each converted to
// network byte order with htonl first. When blobs are used and dbrec_hufid is
// -1 (first load of a new database) the encode table id gen_vec_hufid is
// written to OR_HUFID as well. db_status is checked once after the writes,
// and the final record count and last slot are printed with message 19.
// Also called from main when an abort signal is received, so that the counts
// are saved before exiting.
481 static void write_dbrec (void)
486 RECFRST (PROGNAME "355", OR_DBREC, 0); /* seqtl retrieval */
487 if (db_status != S_OKAY)
488 vista_abort (PROGNAME "356");
489 int32 = htonl (system_reccount);
490 CRWRITE (PROGNAME "341", OR_RECCOUNT, &int32, 0);
491 int32 = htonl (maxdba);
492 CRWRITE (PROGNAME "342", OR_MAXDBA, &int32, 0);
494 /* If this was the first load of a new database,
495 * save the huffman encode table id.
497 if (blobs_are_used && dbrec_hufid == -1) {
498 int32 = htonl ((DtSrINT32)gen_vec_hufid);
499 CRWRITE (PROGNAME "343", OR_HUFID, &int32, 0);
501 if (db_status != S_OKAY)
502 vista_abort (PROGNAME "344");
503 printf (catgets (dtsearch_catd, MS_cravel, 19,
504 "%s: Final database record count = %ld, last slot = %ld.\n"),
505 aa_argv0, (long)system_reccount, (long)maxdba);
507 } /* write_dbrec() */
510 /************************************************/
514 /************************************************/
515 /* prints complete progress message and statistics to stdout */
// print_progress: print the running statistics line (message 20).
// Reports the number of input records, the elapsed time since starttime in
// minutes and seconds, the share of the input file consumed so far, and the
// duplicate, new and updated record counts. The share is the current ftell
// position of infile times 100 divided by fstat_input.st_size. One visible
// statement replaces the position with the full file size so the final
// message reads 100 percent; the condition guarding it is not visible in this
// listing. Clears need_final_progress_msg before returning.
// NOTE(review): the division relies on st_size being non-zero; main reports an
// empty file before the loop starts, but what it does afterwards is not
// visible here - confirm it never reaches this function.
// NOTE(review): ftell returns -1 on failure, which would print a negative
// percentage - confirm whether that matters.
516 static void print_progress (void)
518 time_t seconds = time (NULL) - starttime;
519 long bytes_in = ftell (infile);
522 bytes_in = fstat_input.st_size; /* make final msg "100%" */
524 printf (catgets (dtsearch_catd, MS_cravel, 20,
525 "%s: %ld input records processed in %ldm %lds, (%ld%%).\n"
526 " %ld duplicates, %ld new, %ld updates.\n"),
528 input_reccount, seconds / 60L, seconds % 60L,
529 (bytes_in * 100L) / fstat_input.st_size,
530 duplicate_recids, created_reccount, updated_reccount);
531 need_final_progress_msg = FALSE;
533 } /* print_progress() */
536 /************************************************/
538 /* print_exit_code */
540 /************************************************/
541 /* Called from inside DtSearchExit() at austext_exit_last */
// print_exit_code: print the program name (aa_argv0) and the exit code.
//   exit_code - the value being reported.
// main installs this function in austext_exit_last, so it runs as the last
// step of the exit processing.
// NOTE(review): statements between the signature and the printf are not
// visible in this listing.
542 static void print_exit_code (int exit_code)
548 printf ( catgets(dtsearch_catd, MS_cravel, 2,
549 "%s: Exit code = %d\n") ,
550 aa_argv0, exit_code);
554 } /* print_exit_code() */
557 /************************************************/
559 /* load_next_miscrec */
561 /************************************************/
562 /* Repeatedly called from create_object() or update_object()
563 * to fill miscrec buffer with next FZKABSTR type miscrec
564 * from input file data saved in fzkbuf and abstrbuf.
565 * First call for a given object is signaled by passed arg.
566 * Thereafter static pointers keep track of where we are
567 * in the source bufs to correctly load the next miscrec.
568 * Initial state = fill-with-fzkey, if there is a fzkey.
569 * Second state = fill-with-abstract, if there is an abstract.
570 * Last state = zero-fill balance of remaining misc records.
571 * Returns TRUE until last state completed (no more miscrecs can be written).
// load_next_miscrec: fill the global miscrec buffer with the next
// fzkey-abstract misc record for the current object.
//   first_call - nonzero on the first call for an object; later calls pass
//                zero and continue from the saved position.
// The position is kept in static locals (fill state, source pointer, bytes
// left in the source, total bytes left), so only one object can be in
// progress at a time.
// On the first call the total is set to or_fzkeysz + or_abstrsz; when that is
// not positive the function returns at once. A positive or_fzkeysz prints
// message 522 (semantic databases not supported). Otherwise the state starts
// at abstract fill with or_abstrsz source bytes.
// On later calls a total that is not positive means nothing is left.
// The fill loop then writes every byte of miscrec.or_misc, decrementing the
// total each time: abstract bytes until a zero byte or the end of the
// abstract, then zero bytes. An unexpected state prints message 25. Finally
// or_misctype is set to ORM_FZKABS.
// NOTE(review): the return statements and the byte copy statements are not
// visible in this listing; the comment before this function says TRUE is
// returned until the last state has completed.
573 static int load_next_miscrec (int first_call)
576 FILL_FZKEY, FILL_ABSTR, FILL_ZEROS
578 fill_state = FILL_ZEROS;
579 static char *src = NULL;
580 static int srclen = 0;
581 static int totbytes = 0;
586 /* Initialize static variables at first call. */
588 /* If fzkey-abstract misc recs not used, return immediately. */
589 if ((totbytes = dbrec.or_fzkeysz + dbrec.or_abstrsz) <= 0)
591 if (dbrec.or_fzkeysz > 0) {
592 fprintf (aa_stderr, catgets(dtsearch_catd, MS_cravel, 522,
593 "%s This version of %s does not support semantic databases.\n"),
594 PROGNAME"522", aa_argv0);
598 fill_state = FILL_ABSTR;
600 srclen = dbrec.or_abstrsz;
604 /* If NOT first call, but we've finished writing everything out,
605 * then tell the caller there's nothing left to do.
607 else if (totbytes <= 0)
610 /* Main loop is on each byte of the or_misc field of miscrec.
611 * Depending on the fill state, the byte will be a fzkey byte,
612 * an abstract byte, or a binary zero byte.
614 targ = (char *) miscrec.or_misc;
615 for (i = 0; i < sizeof(miscrec.or_misc); i++, totbytes--) {
616 switch (fill_state) {
619 if (--srclen <= 0) { /* end of fzkey? */
620 if (dbrec.or_abstrsz > 0) {
621 fill_state = FILL_ABSTR;
623 srclen = dbrec.or_abstrsz;
626 fill_state = FILL_ZEROS;
631 if (*src == 0 || --srclen <= 0) /* end of abstract? */
632 fill_state = FILL_ZEROS;
641 fprintf (aa_stderr, catgets (dtsearch_catd, MS_misc, 25,
642 "%sAbort due to program error.\n"),
648 miscrec.or_misctype = ORM_FZKABS;
650 } /* load_next_miscrec() */
653 /************************************************/
657 /************************************************/
658 /* Creates new object rec and misc recs from current vista rec.
659 * Sets global objdba to new rec's dba and updates maxdba if necessary.
660 * 1 create fields in objrec buffer, and write it.
661 * (or_objsize will be rewritten after text size has been determined.)
662 * 2 create fzkey-abstract rec as necessary.
// create_object: add a new object record, plus its misc records, for key.
//   key - record key; main validates its length before calling.
// Visible steps:
//   1. Zero the global objrec and copy the key into or_objkey, at most
//      DtSrMAX_DB_KEYSIZE bytes, leaving the rest zero filled.
//   2. Preset or_objdate to the run start date (starttimeobjd); main
//      overwrites it later when the input supplies a valid date.
//   3. Copy dbrec.or_dbotype into or_objtype when it is nonzero.
//   4. Set the DtSrFlNOTAVAIL flag (the guarding condition is not visible in
//      this listing; the original comment says it applies when blobs are
//      never used).
//   5. Convert objrec with swab_objrec, create the record with FILLNEW
//      (vista_abort on failure), save its address in objdba and fold it into
//      maxdba.
//   6. Print a trace line (its guard is not visible here).
//   7. Make the new record the owner of the OR_OBJ_BLOBS and OR_OBJ_MISCS sets.
//   8. While load_next_miscrec reports more data: convert or_misctype to
//      network order, create the misc record, fold its address into maxdba,
//      and connect it to OR_OBJ_MISCS.
//   9. Increment system_reccount.
664 static void create_object (char *key)
670 memset (&objrec, 0, sizeof (objrec));
672 /* Copy the key into the buffer. The previous initialization
673 * ensures that the key will be padded on the right with zero fill.
674 * At this point, key length should never be too long because
675 * it has been previously tested (when the line was first read in).
678 targ = objrec.or_objkey;
679 for (i = 0; i < DtSrMAX_DB_KEYSIZE; i++) {
685 /* Objdate will be updated later if line #4 has
686 * valid DtSrObjdate format. Otherwise current
687 * date/time stamp will be the default.
689 objrec.or_objdate = starttimeobjd;
691 /* If all objects in database are same type, mark approp obj flag */
692 if (dbrec.or_dbotype != 0)
693 objrec.or_objtype = dbrec.or_dbotype;
695 /* If blobs are never used, mark each obj as 'unretrievable' */
697 objrec.or_objflags |= DtSrFlNOTAVAIL;
698 swab_objrec (&objrec, HTON);
699 FILLNEW (PROGNAME "487", OR_OBJREC, &objrec, 0);
700 if (db_status != S_OKAY)
701 vista_abort (PROGNAME "495");
702 CRGET (PROGNAME "375", &objdba, 0); /* save object's dba */
703 UPDATE_MAXDBA (objdba);
705 printf ("---> new rec: inrecno %6ld, slot %6ld, key '%s'\n",
706 (long int) input_reccount, (long int) objdba & 0xffffff, objrec.or_objkey);
708 /* Make current object record the owner of all its sets */
709 SETOR (PROGNAME "376", OR_OBJ_BLOBS, 0);
710 SETOR (PROGNAME "377", OR_OBJ_MISCS, 0);
712 /* If fzkeys and/or abstracts are used,
713 * write out the misc record(s) now.
715 if (load_next_miscrec (TRUE))
717 HTONS (miscrec.or_misctype);
718 FILLNEW (PROGNAME "501", OR_MISCREC, &miscrec, 0);
719 CRGET (PROGNAME "503", &tempdba, 0);
720 UPDATE_MAXDBA (tempdba);
721 CONNECT (PROGNAME "505", OR_OBJ_MISCS, 0);
722 } while (load_next_miscrec (FALSE));
724 system_reccount++; /* new obj rec, so incr tot num database recs */
727 } /* create_object() */
730 /************************************************/
734 /************************************************/
735 /* Reinitializes portions of preexisting object rec.
736 * (Presumes vista 'current record' is desired object rec.)
737 * Sets objdba to rec's dba and updates maxdba if necessary.
738 * System_reccount is not altered because this is not a new record.
739 * 1 reinit certain fields in objrec, and rewrite it.
740 * (or_objsize will be rewritten after text size has been determined.)
741 * 2 delete all blobs (there should be no hyper recs,
742 * and existing user notes should not be changed).
743 * 3 update fzkey-abstract rec(s) as necessary.
744 * Important: misc rec updates should always be IN-PLACE.
745 * If miscrecs are deleted first then readded,
746 * there is no guarantee that their slots will be adjacent.
747 * This will screw up bit vector calculations in the inverted
748 * index word searches. In-place updates are faster anyway,
749 * and we know that the number of misc rec slots is constant.
// update_object: reinitialize an existing object record for key.
//   key - record key; used here only in the trace line.
// Expects the database current record to be the object being updated.
// Visible steps:
//   1. Save the current record address in objdba and fold it into maxdba.
//   2. Print a trace line (its guard is not visible in this listing).
//   3. Write zero to OR_OBJSIZE and the run start date, in network byte order,
//      to OR_OBJDATE; main rewrites both once the real values are known.
//   4. Make the record the owner of the OR_OBJ_BLOBS and OR_OBJ_MISCS sets.
//   5. Delete every member of OR_OBJ_BLOBS.
//   6. Walk OR_OBJ_MISCS. One record type is left untouched; fzkey-abstract
//      records (ORM_FZKABS) are rewritten in place from load_next_miscrec
//      while it reports more data, and their addresses are folded into maxdba.
//      Two delete calls are also visible; the branches they belong to are not
//      shown. The case labels other than ORM_FZKABS are not visible here.
// system_reccount is not changed because no record is added.
751 static void update_object (char *key)
754 int first_fzkabstr = TRUE;
758 DtSrINT32 zero_objsize = 0;
760 /* Slot number is dba with high order byte (filenum) parsed out */
761 CRGET (PROGNAME "467", &objdba, 0); /* save object's dba */
762 UPDATE_MAXDBA (objdba);
764 printf ("----> update: inrecno %6ld, slot %6ld, key '%s'\n",
765 (long int) input_reccount, (long int) objdba & 0xffffff, key);
767 /* Reinit certain fields.
768 * Objsize will be rewritten after new text size determined.
769 * Objdate will be rewritten if .fzk file has valid
770 * DtSrObjdate format in line #4.
772 CRWRITE (PROGNAME "472", OR_OBJSIZE, &zero_objsize, 0);
773 int32 = htonl (starttimeobjd);
774 CRWRITE (PROGNAME "681", OR_OBJDATE, &int32, 0);
776 /* Make current object record the owner of all its sets */
777 SETOR (PROGNAME "475", OR_OBJ_BLOBS, 0);
778 SETOR (PROGNAME "476", OR_OBJ_MISCS, 0);
780 /* Delete all blobs in a loop */
781 FINDFM (PROGNAME "480", OR_OBJ_BLOBS, 0);
782 while (db_status == S_OKAY) {
783 DISDEL (PROGNAME "482", 0);
784 FINDFM (PROGNAME "483", OR_OBJ_BLOBS, 0);
787 /* Update all miscrecs in a loop.
788 * User notes are left alone,
789 * and fzkey-abstracts are updated.
790 * Currently other types are not allowed.
792 first_fzkabstr = TRUE;
793 FINDFM (PROGNAME "480", OR_OBJ_MISCS, 0);
794 while (db_status == S_OKAY) {
795 CRREAD (PROGNAME "496", OR_MISCTYPE, &misctype, 0);
800 break; /* do nothing */
802 case ORM_FZKABS: /* combined fzkey-abstract rec */
803 if (load_next_miscrec (first_fzkabstr)) {
804 HTONS (miscrec.or_misctype);
805 RECWRITE (PROGNAME "601", &miscrec, 0);
806 CRGET (PROGNAME "605", &tempdba, 0);
807 UPDATE_MAXDBA (tempdba);
808 first_fzkabstr = FALSE;
811 DISDEL (PROGNAME "709", 0);
816 DISDEL (PROGNAME "529", 0);
819 FINDNM (PROGNAME "506", OR_OBJ_MISCS, 0);
820 } /* end update loop for all members of OBJ_MISCS set */
824 } /* update_object() */
827 /************************************************/
831 /************************************************/
832 /* Called from main while reading document text.
833 * Calls huffman compression encoder at convenient
834 * intervals and at ETX.
// call_encoder: pass one chunk of document text to the compression encoder
// and write out a blob record whenever the encoder hands one back.
//   ucharbuf - the text bytes to compress.
//   buflen   - number of bytes in ucharbuf.
// Calls hc_encode with the global blobrec and a FALSE final flag. When it
// returns nonzero, or_bloblen is converted to network order, the blob record
// is created with FILLNEW and connected to the OR_OBJ_BLOBS set of the
// current owner. The two printf calls are trace output; their guards, and any
// updates of objsize or sumlines, are not visible in this listing.
836 static void call_encoder (UCHAR *ucharbuf, int buflen)
841 printf ("buflen = %d, sumlines = %d, cum objsize = %ld\n",
842 (int)buflen, (int)sumlines, (long)objsize);
844 if (hc_encode (&blobrec, ucharbuf, buflen, FALSE)) {
846 sumblobs += blobrec.or_bloblen;
847 printf ("---> WRITE sumlines = %d, bloblen = %d, "
848 "sumblobs = %d, objsize = %ld\n",
849 sumlines, (int)blobrec.or_bloblen,
850 (int)sumblobs, (long)objsize);
853 HTONS (blobrec.or_bloblen);
854 FILLNEW (PROGNAME "572", OR_BLOBREC, &blobrec, 0);
855 CONNECT (PROGNAME "578", OR_OBJ_BLOBS, 0);
858 } /* call_encoder() */
861 /************************************************/
865 /************************************************/
// main: load the records of a .fzk input file into the database.
//
// Start-up, as far as visible in this listing:
//   - set the locale and open the message catalog;
//   - convert the start time to the object date format (starttimeobjd) and
//     print the version banner;
//   - install print_exit_code as the last exit handler, install the signal
//     handlers, remember the default duplicate hash size, preset the encode
//     table file name, zero dblk and parg, preset the end-of-text delimiter;
//   - parse the command line and copy dicname into dblk.name;
//   - open the database with austext_dopen, printing the collected messages
//     on failure;
//   - print the working directory (getcwd, with the PWD variable as a
//     fallback) and the input file name, then open the input file for reading;
//   - refuse databases with a positive or_fzkeysz, allocate abstrbuf when
//     abstracts are used, fstat the input file and report an empty file.
//     The call that reads the database record is not visible in this listing.
//
// Record loop, once per input record:
//   line 1  fzkey line. Progress dots are printed every recs_per_dot records.
//           A pending shutdown signal prints message 373, saves the counts
//           with write_dbrec, and exits with 100 + shutdown_now. A line equal
//           to the delimiter is a null record and is skipped.
//   line 2  must begin with the ABSTRACT: prefix and a space, otherwise the
//           record is discarded with message 579 and the exit code becomes
//           EXIT_WARNING. When abstracts are used, the text after the first
//           10 characters is copied into abstrbuf, bounded by or_abstrsz.
//   line 3  the first whitespace-delimited token is the record key. It is
//           rejected when it is longer than DtSrMAX_DB_KEYSIZE - 1, when its
//           first character is not alphanumeric, or when is_duprec reports a
//           duplicate. The key is then looked up: update_object is called on
//           a hit; create_object is presumably the else branch (the else line
//           is not visible).
//   line 4  object date. An invalid date prints message 1086 and the preset
//           run date stays in the record; a valid date is written to
//           OR_OBJDATE in network byte order.
//   text    without blobs the text is discarded up to the delimiter. With
//           blobs, the encode table is loaded on first use and its id is
//           checked against dbrec_hufid (message 1153 on mismatch); the text
//           is then read one character at a time, handed to call_encoder in
//           chunks of 80, the remainder is flushed at the delimiter, the
//           object size is written to OR_OBJSIZE, and a final hc_encode call
//           with the TRUE flag flushes the last blob record.
//
// Shutdown: prints a last progress message when one is still owed, changes
// the exit code to EXIT_VANISH when no record was created or updated, and
// leaves through DtSearchExit with normal_exitcode.
//
// NOTE(review): the loop is controlled by feof, which only turns true after a
// read has failed; the statements taken when fgets returns NULL are not
// visible here - confirm they leave the loop.
// NOTE(review): the abstract line has its last character removed on the
// assumption that it is the newline; a final line without a newline or an
// over-long line would lose a real character - confirm.
// NOTE(review): the size of dblk.name is not visible, so the unbounded copy
// of dicname into it cannot be checked from here.
// NOTE(review): the EXIT_VANISH message goes to stderr while the other
// diagnostics here use aa_stderr - confirm whether that is deliberate.
866 int main (int argc, char *argv[])
868 static int hufftab_never_loaded = TRUE;
872 char *cptr, *targ, *src;
874 char uniqkey [DtSrMAX_DB_KEYSIZE + 4];
879 setlocale (LC_ALL, "");
880 dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
884 tmptr = localtime (&starttime);
885 starttimeobjd = tm2objdate (tmptr);
886 strftime (linebuf, sizeof (linebuf),
887 catgets (dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
889 printf (catgets (dtsearch_catd, MS_misc, 23,
890 "%s: Version %s. Run %s.\n"),
894 austext_exit_last = print_exit_code;
895 init_user_interrupt (); /* specify signal handlers */
896 default_hashsize = duprec_hashsize; /* deflt val in isduprec.c */
897 strcpy (fname_huffcode_tab, FNAME_HUFFCODE_TAB);
901 memset (&dblk, 0, sizeof(DBLK));
902 memset (&parg, 0, sizeof(PARG));
904 parg.etxdelim = ETXDELIM;
906 /* Parse user's command line args and maybe change global variables */
907 user_args_processor (argc, argv);
908 strcpy (dblk.name, dicname);
910 /* Open the database */
912 printf (PROGNAME "211 database OPEN string = '%s%s'\n",
914 if (!austext_dopen (dicname, dicpath, NULL, 0, NULL)) {
915 fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
919 src = getcwd (linebuf, sizeof (linebuf));
921 src = getenv ("PWD");
922 printf (catgets (dtsearch_catd, MS_misc, 24,
923 "%s: cwd = '%s', fzkfile = '%s'\n"),
925 (src) ? src : catgets (dtsearch_catd, MS_misc, 6, "<unknown>"),
927 if ((infile = fopen (fname_input, "r")) == NULL) {
928 fprintf (aa_stderr, catgets (dtsearch_catd, MS_misc, 12,
929 "%sUnable to open %s:\n %s\n"),
930 PROGNAME "326 ", fname_input, strerror (errno));
933 parg.ftext = infile; /* for discard_to_ETX() */
935 /* Read in starting database record count and other db config/status data */
938 /* If fzkeys and/or abstracts are used,
939 * create correctly sized buffers for them.
941 if (dbrec.or_fzkeysz > 0) {
942 fprintf (aa_stderr, catgets(dtsearch_catd, MS_cravel, 522,
943 "%s This version of %s does not support semantic databases.\n"),
944 PROGNAME"523", aa_argv0);
948 if (dbrec.or_abstrsz > 0)
949 abstrbuf = austext_malloc (dbrec.or_abstrsz + 16, PROGNAME "744", NULL);
951 /* Get input file size for progress msgs */
952 if (fstat (fileno (infile), &fstat_input) == -1) {
953 fprintf (aa_stderr, catgets (dtsearch_catd, MS_cravel, 29,
954 "%s Unable to get status for %s: %s\n"),
955 PROGNAME"337", fname_input, strerror (errno));
958 if (fstat_input.st_size <= 0L) {
959 fprintf (aa_stderr, catgets (dtsearch_catd, MS_cravel, 30,
960 "%s File %s is empty.\n"),
961 PROGNAME"343", fname_input);
965 printf (catgets (dtsearch_catd, MS_cravel, 31,
966 "%s: Each dot = %d records processed.\n"),
967 aa_argv0, recs_per_dot);
969 /*-------------------- MAIN LOOP --------------------
970 * Executed once for each new input record.
971 * 1. Read and process the FZKEY line.
972 * 2. Read and process the ABSTRACT line.
973 * 3. Read the UNIQUE KEY line.
974 * Write out an object record at this point.
975 * 4. Read and process the DATE line, update object rec.
976 * 5. Use readchar_ftext to read document text until ETX.
977 * Either blob it or discard it as appropriate.
979 while (!feof(infile)) {
981 /*----- READ LINE #1, fzkey -------------------------
982 * First line of new record.
983 * Abort now if a shutdown signal was sent.
984 * Skip null records (ETX str followed immediately by ETX str).
985 * If this database uses fzkeys, "pack" current fzkey
986 * and save it in the correct miscrec buffer.
987 * If fzkeys are combined with abstracts they share the same
988 * miscrec, otherwise they they reside in their own miscrec.
989 * WARNING! Presumes or_fzkeysz <= the space allocated
990 * for it in the correct miscrec.
991 *-----------------------------------------------------*/
992 if (fgets (linebuf, sizeof(linebuf) - 1, infile) == NULL)
995 /* Got at least one line of a new record. Print progress dots */
996 if (!(input_reccount % recs_per_dot)) {
997 if (input_reccount) {
1000 if (!(dotcount % 10))
1002 if (dotcount >= 50) {
1011 need_final_progress_msg = TRUE;
1013 linebuf [sizeof(linebuf)-1] = 0;
1014 linelen = strlen (linebuf);
1019 printf (catgets (dtsearch_catd, MS_misc, 15,
1020 "%sReceived abort signal %d.\n"),
1021 PROGNAME"373 ", shutdown_now);
1022 write_dbrec (); /* at least update reccount and maxdba */
1023 DtSearchExit (100 + shutdown_now);
1025 /* Skip null record */
1026 if (strcmp (linebuf, parg.etxdelim) == 0)
1029 /*----- READ LINE #2, abstract ------------------------
1030 * Second line is abstract line. Save it in record buffer,
1031 * hopping over the first 10 chars ("ABSTRACT: ....").
1032 *-----------------------------------------------------*/
1033 if (fgets (linebuf, sizeof (linebuf) - 1, infile) == NULL)
1035 linebuf [sizeof(linebuf)-1] = 0;
1036 linelen = strlen (linebuf);
1038 if (strncmp (linebuf, "ABSTRACT: ", 10) != 0) {
1039 cptr = PROGNAME"580";
1041 normal_exitcode = EXIT_WARNING;
1043 printf (catgets (dtsearch_catd, MS_cravel, 579,
1044 "%s Discarded rec #%ld: Invalid .fzk file format.\n"),
1045 cptr, input_reccount);
1046 if (strcmp (linebuf, parg.etxdelim) != 0)
1047 discard_to_ETX (&parg);
1051 /* If abstracts are used, save this one in the abstract buffer */
1052 if (dbrec.or_abstrsz > 0) {
1053 linebuf[--linelen] = 0; /* delete terminating \n */
1054 strncpy (abstrbuf, linebuf + 10, dbrec.or_abstrsz);
1055 abstrbuf[dbrec.or_abstrsz - 1] = 0;
1058 /*--- READ LINE #3, unique database key ------------------
1059 * Third line is 'unique record id'.
1060 * If key is valid update old objrec
1061 * or create new one as necessary.
1062 * (There may be one more write required
1063 * after we determine total blob size).
1064 *-----------------------------------------------------*/
1065 if (fgets (linebuf, sizeof (linebuf) - 1, infile) == NULL)
1067 linebuf [sizeof(linebuf)-1] = 0;
1068 linelen = strlen (linebuf);
1069 if (strcmp (linebuf, parg.etxdelim) == 0) {
1070 cptr = PROGNAME"1068";
1071 goto INVALID_FORMAT;
1074 * Isolate first token surrounded by whitespace
1075 * (and parse out \n)
1077 if ((db_key = strtok (linebuf, " \t\n")) == NULL) {
1078 cptr = PROGNAME"1076";
1079 goto INVALID_FORMAT;
1081 if (strlen (db_key) > DtSrMAX_DB_KEYSIZE - 1) {
1082 normal_exitcode = EXIT_WARNING;
1084 printf (catgets (dtsearch_catd, MS_cravel, 33,
1085 "%s Discarded rec #%ld: Key too long:\n '%s'.\n"),
1086 PROGNAME"606", input_reccount, db_key);
1087 discard_to_ETX (&parg);
1090 if (!isalnum (db_key[0])) {
1091 normal_exitcode = EXIT_WARNING;
1093 printf (catgets (dtsearch_catd, MS_cravel, 927,
1094 "%s Discarded rec #%ld: First char (keytype) of key\n"
1095 " '%s' is not alphanumeric.\n"),
1096 PROGNAME"927", input_reccount, db_key);
1097 discard_to_ETX (&parg);
1101 /* If duplicate record in fzk file, discard it. */
1102 i = is_duprec (db_key);
1105 printf (catgets (dtsearch_catd, MS_cravel, 34,
1106 "%s Out of Memory! "
1107 "Set -h arg to a smaller number,\n"
1108 " or reduce the number of input records.\n"),
1112 else if (i == 1) { /* skip duplicate record id */
1113 normal_exitcode = EXIT_WARNING;
1115 printf (catgets (dtsearch_catd, MS_cravel, 35,
1116 "%s: Discarded duplicate rec #%ld '%s'.\n"),
1117 aa_argv0, input_reccount, db_key);
1119 discard_to_ETX (&parg);
1123 * Try to read the object record from the database. If it
1124 * already exists (UPDATE): delete all its blobs (there
1125 * should be no hyper recs). create or update
1126 * fzkey-abstract recs as necessary. dont change any
1127 * existing user notes. update fields in objrec buffer,
1128 * but don't write it yet-- objrec will be rewritten
1129 * after text size has been determined. If it doesn't
1130 * exist (CREATE): create fields in objrec buffer, and
1131 * write it. create fzkey-abstract recs as necessary.
1132 * objrec will be rewritten after text size has been
1133 * determined. After update or create, objdba contains
1134 * dba of curr obj record.
1136 strcpy (uniqkey, db_key);
1137 KEYFIND (PROGNAME "489", OR_OBJKEY, uniqkey, 0);
1138 if (db_status == S_OKAY)
1139 update_object (uniqkey);
1141 create_object (uniqkey);
1144 /*----- READ LINE #4, date -----------------------------
1145 * Line #4 is object date/time string (OBJDATESTR format).
1146 * It is no longer optional. If invalid, the current
1147 * run date that was preloaded into the record is used.
1148 *-----------------------------------------------------*/
1149 if (fgets (linebuf, sizeof (linebuf) - 1, infile) == NULL)
1151 linebuf [sizeof(linebuf)-1] = 0;
1152 linelen = strlen (linebuf);
1153 if (!is_objdatestr (linebuf, &objdate)) {
1154 normal_exitcode = EXIT_WARNING;
1155 if (strcmp (linebuf, parg.etxdelim) == 0) {
1156 cptr = PROGNAME"1155";
1157 goto INVALID_FORMAT;
1160 printf (catgets (dtsearch_catd, MS_cravel, 1086,
1161 "%s Record '%s' has invalid date format--"
1162 "using run date.\n"),
1163 PROGNAME"1086", uniqkey);
1165 else { /* objdate is valid */
1166 KEYFIND (PROGNAME "1098", OR_OBJKEY, uniqkey, 0);
1167 if (db_status != S_OKAY)
1168 vista_abort (PROGNAME "1101");
1169 HTONL (objdate); /* ready for record writes */
1170 CRWRITE (PROGNAME "1102", OR_OBJDATE, &objdate, 0);
1173 /*----- READ TO ETX, record text ---------------------
1174 * Balance of record (after line 4 to end of record marker)
1175 * is text. It may or may not be formatted in neat ascii
1176 * lines, ie it may not have periodic linefeeds (\n).
1177 * If this database does not store compressed records (blobs)
1178 * we just discard all chars to end of text delimiter (ETX).
1179 * Otherwise we read it char by char using readchar_ftext()
1180 * and fill linebuf to some convenient size.
1182 * Repeated calls to hc_encode() build
1183 * a compressed record in its own internal blobrec buffer.
1184 * When the buffer is full, hc_encode copies it to
1185 * the passed blobrec buffer and returns TRUE.
1186 * The caller should then write out the blobrec.
1187 * If hc_encode returns FALSE, its internal blobrec is not
1188 * yet full so the caller should not yet write out his record.
1189 *-----------------------------------------------------*/
1191 if (!blobs_are_used) {
1192 discard_to_ETX (&parg);
1196 * Initialize blob compression by reading in huffman
1197 * encode table (first execution only). Ensure table id
1198 * is same as one used for previous compressions, if any.
1200 if (hufftab_never_loaded) {
1201 hufftab_never_loaded = FALSE;
1202 gen_vec (fname_huffcode_tab);
1203 if (dbrec_hufid != gen_vec_hufid && dbrec_hufid != -1L) {
1205 printf (catgets (dtsearch_catd, MS_cravel, 1153,
1206 "%s Current data compression table id"
1207 " in '%s' is %ld.\n"
1208 " Database '%s' previously compressed"
1209 " with table %ld.\n"),
1210 PROGNAME"1153 ", fname_huffcode_tab,
1211 gen_vec_hufid, dicname, dbrec_hufid);
1216 * Compress document text. Repeatedly load linebuf
1217 * with fixed number of chars and compress it.
1223 if ((linebuf[0] = readchar_ftext (&parg)) == 0) {
1224 normal_exitcode = EXIT_WARNING;
1226 printf ( catgets(dtsearch_catd, MS_cravel, 1215,
1227 "%s Warning. Record '%s' has no text.\n"),
1228 PROGNAME"1215" , uniqkey);
1232 while (linebuf [linelen] = readchar_ftext (NULL)) {
1233 if (++linelen >= 80) {
1234 call_encoder ((UCHAR *)linebuf, linelen);
1240 * At ETX: If a partial line remains, process it just like
1241 * the full lines above. Then write out total size to
1242 * object record, and make the final call to hc_encode with
1243 * empty line and TRUE flag to indicate 'no more text,
1244 * flush your last partial buffer'.
1247 call_encoder ((UCHAR *)linebuf, linelen);
1248 CRSET (PROGNAME "685", &objdba, 0);
1249 int32 = htonl (objsize);
1250 CRWRITE (PROGNAME "686", OR_OBJSIZE, &int32, 0);
1251 if (hc_encode (&blobrec, (UCHAR *)"", 0, TRUE)) {
1253 sumblobs += blobrec.or_bloblen;
1254 printf ("---> FINAL sumlines =%d, bloblen = %d, "
1255 "sumblobs = %ld, objsize = %ld\n",
1256 (int)sumlines, (int)blobrec.or_bloblen,
1257 (long)sumblobs, (long)objsize);
1259 HTONS (blobrec.or_bloblen);
1260 FILLNEW (PROGNAME "624", OR_BLOBREC, &blobrec, 0);
1261 CONNECT (PROGNAME "625", OR_OBJ_BLOBS, 0);
1264 } /* end main record loop */
1266 if (need_final_progress_msg)
1271 /* If all input records were discarded, complete processing
1272 * but upgrade warning exit code to hard error code.
1274 if (created_reccount <= 0L && updated_reccount <= 0L) {
1275 normal_exitcode = EXIT_VANISH;
1276 fprintf (stderr, catgets (dtsearch_catd, MS_cravel, 1048,
1277 "%sDatabase objects not changed because input "
1278 "file effectively empty.\n"),
1282 /* Close database and print return code via exits.
1283 * Return code is either 0 (perfect), 1 (warnings),
1284 * or 3 (input file effectively empty).
1286 DtSearchExit (normal_exitcode);
1289 /*********************** DTSRLOAD.C ***************************/