2 * COMPONENT_NAME: austext
4 * FUNCTIONS: compare_dba
14 * (C) COPYRIGHT International Business Machines Corp. 1993,1995
16 * Licensed Materials - Property of IBM
17 * US Government Users Restricted Rights - Use, duplication or
18 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
20 /*************************** VEDELETE.C ****************************
21 * $XConsortium: vedelete.c /main/6 1996/08/12 13:21:16 cde-ibm $
22 * Implements the opera OE_DELETE_RECID and OE_DELETE_BATCH functions.
23 * For each dba in usrblk.dbatab (count is in usrblk.dbcount):
24 * deletes header rec, text blobs, and user notes;
25 * database addresses on inverted index file;
26 * and associated words and stems btree references.
29 * Revision 2.2 1995/10/26 14:22:06 miker
32 * Revision 2.1 1995/09/22 22:29:07 miker
33 * Freeze DtSearch 0.1, AusText 2.1.8
35 * Revision 1.10 1995/09/05 19:22:46 miker
36 * Made usrblk, ausapi_msglist global. Deleted obsolete socblk refs.
37 * Minor name changes for DtSearch.
39 * Revision 1.9 1995/07/18 22:30:58 miker
40 * Delete msglist arg from vista_abort() function calls.
46 #define PROGNAME "VEDELETE"
50 #define WORDS_PER_DOT 1000
51 #define MAX_REC_READ 128 /* Max number of addresses to be read from
52 * database addresses file. 128 *
53 * sizeof(DB_ADDR) = 512 bytes. This is the
54 * size of one block read from hard disk. */
56 /************ GLOBAL VARIABLES ***************/
57 static struct or_hwordrec got_hword;
58 static struct or_lwordrec got_lword;
59 static struct or_swordrec got_sword;
62 static FILE *dtbs_addr_fp;
63 static int debugging = FALSE;
64 static int verbose_debugging = FALSE;
65 static char *keyptr = NULL;
66 static long updated_words, deleted_words, skipped_words;
67 static long freed_bytes;
70 static long word_count, dot_count;
73 /********************************/
77 /********************************/
78 /* Called for every single database word.
79 * Prints a dot for every WORDS_PER_DOT word,
80 * a space every 10 dots, and a new line every 50 dots.
/* print_dots(): progress indicator written to aa_stderr.
 * Visible logic: when word_count is a multiple of WORDS_PER_DOT a '.' is
 * emitted; a newline follows when dot_count is a multiple of 50, otherwise
 * a space when dot_count is a multiple of 10.
 * NOTE(review): the statements that advance word_count and dot_count are
 * not visible in this excerpt -- confirm they precede the tests below.
 */
82 static void print_dots (void)
85 if (word_count % WORDS_PER_DOT == 0) {
87 fputc ('.', aa_stderr);
/* end the row of dots every 50 dots */
89 if (dot_count % 50 == 0) {
90 fputc ('\n', aa_stderr);
/* otherwise separate the dots into groups of 10 */
93 else if (dot_count % 10 == 0)
94 fputc (' ', aa_stderr);
101 /********************************/
105 /********************************/
106 /* Convert d99 dba format (the record NUMBER in the high
107 * 3 bytes, and statistical info in lower byte)
108 * to vista dba format (d00 file number in high byte,
109 * slot number in lower 3 bytes). The conversion
110 * accounts for the number of slots each record requires,
111 * as well as an offset for the dbrec in slot #1.
112 * (There is no slot #0 in vista; that's a NULL_DBA).
113 * Formula: subtract 1, multiply by num recslots per rec, add 2.
/* d99_to_dba(): builds a vista-format address from a d99-format value.
 * d99 is passed by value, so the pre-decrement in the return expression
 * only alters the local copy.
 * Returns OR_D00 placed in the top byte (D00_HIBYTE) OR'ed with
 * (d99 - 1) * recslots + 2.
 * NOTE(review): the comment below describes moving the record number into
 * the low 3 bytes, but no shift statement is visible in this excerpt --
 * confirm it precedes the return.
 */
115 static DB_ADDR d99_to_dba (DB_ADDR d99)
117 #define D00_HIBYTE ((DB_ADDR) OR_D00 << 24)
118 /* Move hi 3 bytes to lo 3 bytes and zero out hi byte */
/* recslots is assigned in ve_delete() from usrblk.dblk->dbrec.or_recslots */
120 return D00_HIBYTE | ((--d99) * (DB_ADDR) recslots + 2);
124 /********************************/
128 /********************************/
/* compare_dba(): comparison callback passed to qsort() and bsearch() for
 * arrays of DB_ADDR. Both arguments are treated as pointers to DB_ADDR.
 * NOTE(review): the comparison and return statements are not visible in
 * this excerpt; presumably it returns <0, 0 or >0 in the usual
 * qsort/bsearch convention -- confirm.
 */
129 static int compare_dba (const void *dba1, const void *dba2)
131 DB_ADDR *i = (DB_ADDR *) dba1;
132 DB_ADDR *j = (DB_ADDR *) dba2;
139 } /* compare_dba() */
142 /********************************/
146 /********************************/
147 /* This function is called for each word in the database.
148 * It reads and rewrites all the dbas in the d99 file for the word,
149 * deleting those that are listed in the usrblk dba table.
150 * Return - UPDATE_WORD if there are still some dba's in the
151 * d99 file for this word.
152 * Return - DELETE_WORD if all the dba's in the d99 were in the usrblk table.
154 * Return - SKIP_UPDATE if none of the dba's in the d99 file
155 * were in the usrblk table.
/* delete_addrs(): rewrites the d99 address list of the word currently held
 * in got_hword, dropping every address that appears in usrblk.dbatab.
 * Reads got_hword.or_hwoffset and got_hword.or_hwaddrs, processes the list
 * on dtbs_addr_fp in blocks of at most MAX_REC_READ addresses, and finally
 * subtracts the number of removed addresses from got_hword.or_hwaddrs and
 * adds it to got_hword.or_hwfree.
 * usrblk.dbatab is searched with bsearch() and compare_dba(), so it must
 * already be sorted with that same comparison (ve_delete() does this).
 * The three possible return codes are listed in the comment that precedes
 * this function.
 */
157 static int delete_addrs (void)
159 DB_ADDR copy_addrs[MAX_REC_READ];
160 DB_ADDR word_addrs[MAX_REC_READ];
165 size_t num_reads, num_writes, ncopy;
166 DtSrINT32 num_addrs, num_found;
167 long read_offset, write_offset;
/* position the file at the first address belonging to this word */
169 if (fseek (dtbs_addr_fp, got_hword.or_hwoffset, SEEK_SET)) {
170 printf (PROGNAME "80 "
171 "fseek failed on '%s%s.d99'. Offset=%ld. Word='%s'.\n",
172 usrblk.dblk->path, usrblk.dblk->name,
/* cast so the argument matches %ld, as the ftell message below already does */
173 (long)got_hword.or_hwoffset, keyptr);
/* the write position starts at the same place as the read position */
176 write_offset = ftell (dtbs_addr_fp);
177 if (got_hword.or_hwoffset != write_offset) {
178 printf (PROGNAME "88 ftell failed on '%s%s.d99'."
179 " hwoffset=%ld write_offset=%ld.\n",
180 usrblk.dblk->path, usrblk.dblk->name,
181 (long)got_hword.or_hwoffset, write_offset);
184 num_addrs = got_hword.or_hwaddrs;
188 /* OUTER LOOP: for each block of dbas */
192 * read only a single block of max recs, or fewer dba's if this is
195 if (num_addrs > MAX_REC_READ) {
196 num_reads = MAX_REC_READ;
197 num_addrs -= MAX_REC_READ;
201 num_reads = num_addrs;
203 if (fread (word_addrs, sizeof (DB_ADDR), num_reads, dtbs_addr_fp)
205 puts ("\n\r" PROGNAME "98 "
206 "AusText is shutting down due to corrupted k2x/d99 links.");
210 /* INNER LOOP: each dba in this block */
211 for (i = 0; i < num_reads; i++) {
/* convert a temporary copy to host order for the lookup;
 * word_addrs[i] itself stays exactly as it was read from the file */
212 vista_dba = d99_to_dba (ntohl (word_addrs[i]));
213 dba_eq = (char *) bsearch (
214 &vista_dba, /* key to search for */
215 usrblk.dbatab, /* start of the table */
216 usrblk.dbacount, /* # elems in the table */
217 sizeof (DB_ADDR), /* size of each elem */
218 compare_dba); /* my compare function */
220 if (dba_eq != NULL) {
222 * HIT! the passed word links to this dba. Don't copy the dba
229 /* MISS! Keep the dba by copying it to the copy array */
/* word_addrs[i] is still in file byte order, so it is copied back
 * unchanged; applying htonl() to it would byte-swap the stored value
 * on little-endian hosts and corrupt the d99 file. */
230 copy_addrs[ncopy++] = word_addrs[i];
232 } /* end INNER LOOP for each dba in curr block */
236 * Once we start deleting dba's: save the curr 'read' loc, write
237 * the shorter 'copy' array to the output file at the curr
238 * 'write' loc, update the 'write' loc ptr, restore the 'read'
241 read_offset = ftell (dtbs_addr_fp);
242 if (read_offset <= 0L) {
243 printf (PROGNAME "169 ftell failed on '%s%s.d99'.\n",
244 usrblk.dblk->path, usrblk.dblk->name);
247 if (fseek (dtbs_addr_fp, write_offset, SEEK_SET)) {
248 printf (PROGNAME "175 fseek to %ld failed on '%s%s.d99'.\n",
249 write_offset, usrblk.dblk->path, usrblk.dblk->name);
252 num_writes = fwrite (copy_addrs, sizeof (DB_ADDR),
253 ncopy, dtbs_addr_fp);
254 if (num_writes != ncopy) {
255 printf (PROGNAME "283 fwrite at pos %ld failed on '%s%s.d99'.\n"
256 " Wrote %ld dba's instead of %ld dba's.\n",
257 write_offset, usrblk.dblk->path, usrblk.dblk->name,
261 write_offset = ftell (dtbs_addr_fp);
262 if (write_offset <= 0L) {
263 printf (PROGNAME "191 ftell failed on '%s%s.d99'.\n",
264 usrblk.dblk->path, usrblk.dblk->name);
267 if (fseek (dtbs_addr_fp, read_offset, SEEK_SET)) {
268 printf (PROGNAME "198 fseek to %ld failed on '%s%s.d99'.\n",
269 read_offset, usrblk.dblk->path, usrblk.dblk->name);
272 } /* end if (addrs_found) */
274 /* [do nothing, leave this block as is, just incr write ptr] */
275 write_offset = ftell (dtbs_addr_fp);
276 if (write_offset <= 0L) {
277 printf (PROGNAME "208 ftell failed on '%s%s.d99'.\n",
278 usrblk.dblk->path, usrblk.dblk->name);
282 } /* end OUTER LOOP for this word */
/* move the removed addresses from the 'in use' count to the 'free' count */
285 got_hword.or_hwaddrs -= num_found;
286 got_hword.or_hwfree += num_found;
287 if (got_hword.or_hwaddrs < 0) {
288 printf (PROGNAME "220 duplicate dbas for '%s' in '%s%s.d99'.\n",
289 keyptr, usrblk.dblk->path, usrblk.dblk->name);
292 if (got_hword.or_hwaddrs == 0) {
294 freed_bytes += sizeof (DB_ADDR) * got_hword.or_hwfree;
299 * @@@ fill out newly freed dba positions with FF's as debugging
305 } /* end if (addrs_found) */
308 } /* delete_addrs() */
311 /****************************************/
315 /****************************************/
316 /* Initializes accumulators, or generates msg
317 * of accumulator status and empties the accumulators.
/* print_counters(): maintains per-group and batch-wide word statistics.
 * wordtype selects one of three actions:
 *   NULL        - zero every per-group counter and every batch total;
 *   (char *) -1 - format the batch totals into msgbuf and hand the text
 *                 to DtSearchAddMessage();
 *   otherwise   - format a progress message for one word group, hand it
 *                 to DtSearchAddMessage(), add the per-group counters to
 *                 the batch totals, then reset the per-group counters.
 * NOTE(review): msgbuf's declaration and a few trailing sprintf arguments
 * are not visible in this excerpt; wordtype presumably supplies the %s
 * group label in the progress message -- confirm.
 */
319 static void print_counters (char *wordtype)
/* static: the batch totals must survive between calls */
322 static long totupdwords, totdelwords, totskipwords;
323 static long totfreedbytes;
325 if (wordtype == NULL) {
326 /* initialize buckets */
327 totupdwords = totdelwords = totskipwords = 0L;
328 updated_words = deleted_words = skipped_words = 0L;
329 freed_bytes = totfreedbytes = 0L;
/* the pointer value -1 is used as a sentinel requesting the final totals */
332 else if ((long) wordtype == -1L) {
333 /* final totals msg */
334 sprintf (msgbuf, PROGNAME " Batch Totals: "
335 "updated %ld, deleted %ld, unaffected %ld.\n"
336 " Old word count = %ld, New word count = %ld.\n"
337 " Freed %ld total bytes in d99 file.\n"
338 ,totupdwords, totdelwords, totskipwords
/* old count = every word seen; new count excludes the deleted words */
339 ,totupdwords + totdelwords + totskipwords
340 ,totupdwords + totskipwords
342 DtSearchAddMessage (msgbuf);
346 /* regular progress msg */
347 sprintf (msgbuf, PROGNAME " %ld %s Words: "
348 "%ld updated, %ld deleted, %ld unaffected.\n"
349 " Freed %ld bytes in d99 file."
350 ,updated_words + deleted_words + skipped_words
352 ,updated_words, deleted_words, skipped_words
355 DtSearchAddMessage (msgbuf);
/* fold this group's counts into the batch totals, then start afresh */
356 totupdwords += updated_words;
357 totdelwords += deleted_words;
358 totskipwords += skipped_words;
359 totfreedbytes += freed_bytes;
360 updated_words = deleted_words = skipped_words = freed_bytes = 0L;
363 } /* print_counters() */
366 /****************************************/
370 /****************************************/
371 /* First do Gendler's code: words, stems, and inverted indexes.
372 * If it goes down, at least the database itself won't be corrupted.
373 * Then do Russell's code: delete text blobs, misc recs,
374 * and the object records. If it goes down here, only one
375 * or two records will be corrupted and none of Gendler's
376 * word searches will retrieve records yet to be deleted.
377 * The total database record count is adjusted between the
378 * loops because it's only used for Gendler's code.
379 * Addresses in usrblk.dbatab are PRESUMED VALID object records.
380 @@@@@@ still one bug left. Semantic inverted index files (d97, d98)
381 @@@@@@ not yet updated. But cborodin reindexes everything every time
382 @@@@@@ it runs so only failures will occur for semantic searches (rare),
383 @@@@@@ after tomita deletions (rare), before cborodin runs (common).
384 @@@@@@ Since this window is very small and no database corruption
385 @@@@@@ can occur, by order of management I'm to fix it later.
386 @@@@@@ The fix is: include the same module used in cborodin to reindex
387 @@@@@@ completely. On a 3 gig database that takes about 2 minutes.
/* ve_delete(): deletes every object listed in usrblk.dbatab from the
 * database selected by usrblk.dblk.
 * Visible steps, in order:
 *   1. report OE_NOOP when the address table is empty;
 *   2. cache recslots, vistano, the d99 stream and the debug flags;
 *   3. sort usrblk.dbatab with compare_dba() and squeeze out duplicates;
 *   4. walk the short, long and huge word keys, calling delete_addrs()
 *      for each word and rewriting or deleting the word record according
 *      to its return code;
 *   5. delete each object's blob and misc members and then the object
 *      record itself;
 *   6. subtract dbacount from the database record count and write it back.
 * Sets usrblk.retncode to OE_OK on completion.
 * Word records are stored in network byte order: all three word loops
 * convert offset/addrs/free with the 32-bit ntohl()/htonl() because these
 * values are d99 file offsets and DtSrINT32 counts inside delete_addrs().
 * NOTE(review): the widths of the or_lw* record fields are declared outside
 * this excerpt -- confirm they are 32 bits like the or_sw* fields.
 */
389 void ve_delete (void)
392 DB_ADDR *dbap1, *dbap2;
393 char charbuf[200 + DtSrMAX_DB_KEYSIZE];
396 DtSrINT32 *reccount = &usrblk.dblk->dbrec.or_reccount;
397 DtSrINT16 maxwordsz = usrblk.dblk->dbrec.or_maxwordsz;
398 int dbacount = usrblk.dbacount;
402 DtSearchAddMessage (PROGNAME"336 "
403 "Database address table is empty; nothing to delete!");
404 usrblk.retncode = OE_NOOP;
408 recslots = usrblk.dblk->dbrec.or_recslots;
409 vistano = usrblk.dblk->vistano;
410 dtbs_addr_fp = usrblk.dblk->iifile;
411 debugging = ((usrblk.debug & USRDBG_DELETE) != 0L);
412 verbose_debugging = ((usrblk.debug & USRDBG_VERBOSE) != 0L);
414 print_counters (NULL); /* initialize all counters */
416 fprintf (aa_stderr, PROGNAME "185 "
417 "db='%s' vistano=%d maxwordsz=%d dbacount=%d.\n",
418 usrblk.dblk->name, vistano, (int)maxwordsz, dbacount);
419 if (verbose_debugging)
420 for (i = 0; i < dbacount; i++) {
421 dba = usrblk.dbatab[i];
422 CRSET (PROGNAME "178", &dba, vistano);
423 CRREAD (PROGNAME "179", OR_OBJKEY, charbuf, vistano);
424 fprintf (aa_stderr, " #%d\tdba=%ld:%ld key='%s'\n",
425 i, (long)dba>>24, (long)(dba & 0x00ffffff), charbuf);
428 /* Sort the array of database addresses.
429 * After sorting, eliminate any duplicate dba's
430 * and adjust dbacount if necessary.
432 qsort (usrblk.dbatab, (size_t)dbacount, sizeof(DB_ADDR), compare_dba);
433 for (i = 1; i < dbacount; i++) {
434 if (usrblk.dbatab[i-1] != usrblk.dbatab[i])
436 dba = usrblk.dbatab[i];
437 sprintf (charbuf, PROGNAME"370 Duplicate dba = %ld:%ld ignored.",
438 (long)dba>>24, (long)(dba & 0x00ffffff));
439 DtSearchAddMessage (charbuf);
441 fprintf (aa_stderr, "%s\n", charbuf);
442 /* subloop moves rest of table up by one */
443 for (j = i + 1; j < dbacount; j++)
444 usrblk.dbatab[j-1] = usrblk.dbatab[j];
446 usrblk.dbacount = dbacount; /* in case caller uses this */
452 fprintf (aa_stderr, PROGNAME"355 "
453 "After sorting dbatab, dbacount=%d.\n", dbacount);
454 if (verbose_debugging)
455 for (i = 0; i < dbacount; i++) {
456 dba = usrblk.dbatab[i];
457 CRSET (PROGNAME "358", &dba, vistano);
458 CRREAD (PROGNAME "359", OR_OBJKEY, charbuf, vistano);
459 fprintf (aa_stderr, " #%d\tdba=%ld:%ld key='%s'\n",
460 i, (long)dba>>24, (long)(dba & 0x00ffffff), charbuf);
465 * There are 3 identical Gendler Loops, one for each possible group of
466 * word sizes: swords, lwords, and hwords. To speed up this lengthy
467 * process, only those loops will be executed as determined by database's
468 * maxwordsz. However delete_addrs() function always uses the
469 * buffer for huge words, so the necessary fields will be copied into
470 * that buffer for the smaller word sizes.
473 /*--------- Gendler's SWORD Loop ---------*/
474 /* every database has short words */
475 fprintf (aa_stderr, PROGNAME "368 "
476 "Entering SHORT word loop. Each dot = %ld words.\n",
482 updated_words = skipped_words = deleted_words = 0L;
484 KEYFRST (PROGNAME "203", OR_SWORDKEY, vistano);
485 while (db_status == S_OKAY) {
487 RECREAD (PROGNAME "182", &got_sword, vistano);
488 got_hword.or_hwoffset = ntohl (got_sword.or_swoffset);
489 got_hword.or_hwaddrs = ntohl (got_sword.or_swaddrs);
490 got_hword.or_hwfree = ntohl (got_sword.or_swfree);
491 keyptr = got_sword.or_swordkey;
492 ret_code = delete_addrs();
493 if (ret_code == UPDATE_WORD) {
494 got_sword.or_swoffset = htonl (got_hword.or_hwoffset);
495 got_sword.or_swaddrs = htonl (got_hword.or_hwaddrs);
496 got_sword.or_swfree = htonl (got_hword.or_hwfree);
497 RECWRITE (PROGNAME "183", &got_sword, vistano);
499 else if (ret_code == DELETE_WORD) {
500 DELETE (PROGNAME "184", vistano);
502 KEYNEXT (PROGNAME "196", OR_SWORDKEY, vistano);
503 } /* end loop on each sword */
505 print_counters ("Short");
508 /*--------- Gendler's LWORD Loop ---------*/
509 fprintf (aa_stderr, PROGNAME "398 "
510 "Entering LONG word loop. Each dot = %ld words.\n",
516 updated_words = skipped_words = deleted_words = 0L;
518 KEYFRST (PROGNAME "243", OR_LWORDKEY, vistano);
519 while (db_status == S_OKAY) {
521 RECREAD (PROGNAME "246", &got_lword, vistano);
/* 32-bit conversions, matching the SWORD loop: a 16-bit ntohs()
 * would truncate the d99 file offset and the address counts */
522 got_hword.or_hwoffset = ntohl (got_lword.or_lwoffset);
523 got_hword.or_hwaddrs = ntohl (got_lword.or_lwaddrs);
524 got_hword.or_hwfree = ntohl (got_lword.or_lwfree);
525 keyptr = got_lword.or_lwordkey;
526 ret_code = delete_addrs ();
527 if (ret_code == UPDATE_WORD) {
528 got_lword.or_lwoffset = htonl (got_hword.or_hwoffset);
529 got_lword.or_lwaddrs = htonl (got_hword.or_hwaddrs);
530 got_lword.or_lwfree = htonl (got_hword.or_hwfree);
531 RECWRITE (PROGNAME "252", &got_lword, vistano);
533 else if (ret_code == DELETE_WORD) {
534 DELETE (PROGNAME "256", vistano);
536 KEYNEXT (PROGNAME "258", OR_LWORDKEY, vistano);
538 print_counters ("Long");
540 /*--------- Gendler's HWORD Loop --------- */
541 fprintf (aa_stderr, PROGNAME "429 "
542 "Entering HUGE word loop. Each dot = %ld words.\n",
548 updated_words = skipped_words = deleted_words = 0L;
550 KEYFRST (PROGNAME "280", OR_HWORDKEY, vistano);
551 while (db_status == S_OKAY) {
553 RECREAD (PROGNAME "284", &got_hword, vistano);
/* convert in place with the 32-bit routine; these same fields carry
 * ntohl() results in the other two loops */
554 got_hword.or_hwoffset = ntohl (got_hword.or_hwoffset);
555 got_hword.or_hwaddrs = ntohl (got_hword.or_hwaddrs);
556 got_hword.or_hwfree = ntohl (got_hword.or_hwfree);
557 keyptr = got_hword.or_hwordkey;
558 ret_code = delete_addrs ();
559 if (ret_code == UPDATE_WORD) {
560 got_hword.or_hwoffset = htonl (got_hword.or_hwoffset);
561 got_hword.or_hwaddrs = htonl (got_hword.or_hwaddrs);
562 got_hword.or_hwfree = htonl (got_hword.or_hwfree);
563 RECWRITE (PROGNAME "289", &got_hword, vistano);
565 else if (ret_code == DELETE_WORD) {
566 DELETE (PROGNAME "293", vistano);
568 KEYNEXT (PROGNAME "295", OR_HWORDKEY, vistano);
570 print_counters ("Huge");
572 /* print final batch totals */
573 print_counters ((char *) -1);
575 /*--------- Russell's Loop ---------*/
577 PROGNAME"470 All words processed. Now deleting %d objects...\n",
579 for (i = 0; i < dbacount; i++) {
580 dba = usrblk.dbatab[i];
582 if (verbose_debugging) {
583 fprintf (aa_stderr, PROGNAME "471 "
584 "Deleting object #%2d,\tdba %ld:%ld.\n",
585 i, (long)dba>>24, (long)(dba & 0x00ffffff));
588 /* Delete blobs. Use real d_csoset at first call
589 * so we can print out fancy error msg.
591 d_csoset (OR_OBJ_BLOBS, &dba, vistano);
592 if (db_status != 0) {
593 fputs (vista_msg(PROGNAME"152"), aa_stderr);
594 fprintf (aa_stderr, PROGNAME"153 Abort: "
595 "vistano=%d, dbatab #%d (out of %d), dba=%ld:%ld.\n",
596 vistano, i, dbacount, (long)dba>>24, (long)(dba & 0x00ffffff));
600 FINDFM (PROGNAME "155", OR_OBJ_BLOBS, vistano);
601 while (db_status != S_EOS) {
602 DISDEL (PROGNAME "158", vistano);
603 FINDFM (PROGNAME "159", OR_OBJ_BLOBS, vistano);
606 /* Delete misc records (old user notes, abstracts, fzkeys) */
607 CSOSET (PROGNAME "142", OR_OBJ_MISCS, &dba, vistano);
608 FINDFM (PROGNAME "145", OR_OBJ_MISCS, vistano);
609 while (db_status != S_EOS) {
610 DISDEL (PROGNAME "148", vistano);
611 FINDFM (PROGNAME "149", OR_OBJ_MISCS, vistano);
614 /* Pull the final plug by deleting the object record itself */
615 CRSET (PROGNAME "200", &dba, vistano);
616 DISDEL (PROGNAME "201", vistano);
617 } /* end of Russell's loop */
619 /* Adjust total record count */
620 *reccount -= dbacount;
623 RECFRST (PROGNAME "355", OR_DBREC, vistano); /* seqtl retrieval */
624 if (db_status != S_OKAY)
625 vista_abort (PROGNAME"356");
627 CRWRITE (PROGNAME "341", OR_RECCOUNT, reccount, vistano);
628 if (db_status != S_OKAY)
629 vista_abort (PROGNAME"342");
631 usrblk.retncode = OE_OK;
635 /*************************** VEDELETE.C ****************************/