2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these librararies and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
23 /* $XConsortium: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
25 * (c) Copyright 1996 Digital Equipment Corporation.
26 * (c) Copyright 1996 Hewlett-Packard Company.
27 * (c) Copyright 1996 International Business Machines Corp.
28 * (c) Copyright 1996 Sun Microsystems, Inc.
29 * (c) Copyright 1996 Novell, Inc.
30 * (c) Copyright 1996 FUJITSU LIMITED.
31 * (c) Copyright 1996 Hitachi.
34 * COMPONENT_NAME: austext
36 * FUNCTIONS: boolean_search
37 * calc_result_bitvec_WK
53 * (C) COPYRIGHT International Business Machines Corp. 1996
55 * Licensed Materials - Property of IBM
56 * US Government Users Restricted Rights - Use, duplication or
57 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
59 /********************* BOOLSRCH.C **********************
60 * $Id: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
62 * The vista code from the original vewords.c.
63 * Given a final truth table and stems array from the user's boolean
64 * query (output of boolean_search()), find all database records
65 * containing the truth table's set operations and return
66 * their database addresses in a resultlist.
67 * See boolpars.h for format and limitations of TRUTHTAB.
69 *-------------- D99DBA TO DBA CONVERSION ----------------
70 * 'd99dbas' are not real vista dbas! They were modified
71 * as follows to permit shorter bit vectors,
72 * and to minimize bit shifts at search time.
73 * vista_dba <- (OR_D00 << 24) | vista_slot
74 * vista_slot <- ((d99recno - 1) * or_recslots) + 2
75 * d99dba <- (d99recno << 8) | weight_byte
76 * d99recno <- ((vista_slot - 2) / or_recslots) + 1
77 * The d99 and bitvec recno of the first rec is 1.
78 * The slotno (vista dba) of the first rec is 2
79 * (dbrec occupies first slot and vista slots begin at 1).
82 * Revision 1.5 1996/03/20 19:21:49 miker
83 * Completed collocations code. Restored get_colloc_bitvec() from colloc.c.
85 * Revision 1.4 1996/03/18 22:06:24 miker
86 * Bug fix. Zero permute NOT queries always returned no hits.
88 * Revision 1.3 1996/03/13 23:05:24 miker
89 * Change long double constant to regular float for better portability.
91 * Revision 1.2 1996/03/13 22:36:37 miker
92 * Changed char to UCHAR several places; similar typecasts.
93 * Moved collocations processing to colloc.c.
95 * Revision 1.1 1996/03/05 15:52:06 miker
98 /***#define _ALL_SOURCE****/ /* to pickup typedefs for shm vnodes */
105 #include "boolpars.h"
107 #define PROGNAME "BOOLSRCH"
108 #define INIT_ITERATIONS 50
109 #define MS_boolsrch 16
111 * DBAS_PER_BLOCK is the max number of dbas to be read
112 * from d99 file. Note DBAS_PER_BLOCK * sizeof(DB_ADDR) = 512 bytes,
113 * the standard blksize of one hard disk block.
115 #define DBAS_PER_BLOCK 128
117 #define RESET_BIT(bv, by, bm) bv[by] &= (UCHAR) ~bm
119 #if (DtSrMAX_STEMCOUNT != 8)
120 #error DtSrMAX_STEMCOUNT does not equal 8.
123 /****************************************/
127 /****************************************/
135 /****************************************/
139 /****************************************/
140 int debugging_boolsrch = FALSE;
142 static int all_key_types = TRUE;
143 static UCHAR *bitvec_allocp = NULL;
144 static size_t bitvec_allocsz = 0;
145 static long bitveclen; /* 1/8 of tot_addr_count */
146 static UCHAR *bitvecs [DtSrMAX_STEMCOUNT];
147 static int check_dates = FALSE;
148 static int do_stat_sort = FALSE;
149 static double idf [DtSrMAX_STEMCOUNT];
150 static char *msgbuf = NULL;
151 static int need_zero_permute = FALSE;
152 static struct or_objrec objrec;
153 static DB_ADDR objrecdba;
154 static int or_abstrsz = 0;
155 static int or_fzkeysz = 0;
156 static short or_language = DtSrLaENG;
157 static long or_maxdba; /* largest dba in database */
158 static long or_reccount; /* tot num db obj (real_num_rec) */
159 static long or_recslots; /* D00 slots per obj (slot_d00) */
160 static struct or_hwordrec
162 static PROXWT *proxwts = NULL;
164 static UCHAR *result_bitvec;
165 static long result_count = 0;
166 static DtSrResult *resultlist = NULL;
167 static int save_stemno = 0;
168 static long tot_addr_count; /* may be > reccount bcs deletes */
170 static float *wtvec = NULL;
173 /************************************************/
175 /* got_USR_STOPSRCH */
177 /************************************************/
178 /* Called at beginning of every workproc.
179 * Returns TRUE if user pushed STOP SEARCH button,
182 static int got_USR_STOPSRCH (void)
184 if ((usrblk.flags & USR_STOPSRCH) == 0)
186 if (OE_flags & OE_AUDIT)
187 oe_write_audit_rec (-1L);
188 usrblk.retncode = OE_USER_STOP;
192 /****************************************/
196 /****************************************/
198 * Reads a database record given a d99 record number.
199 * Returns TRUE and loads globals objrec and objrecdba
200 * on success, else returns FALSE.
202 static int read_recno (long recno)
204 /* Convert recno to a real dba */
205 objrecdba = (recno - 1) * or_recslots + 2;
206 if (objrecdba >= or_maxdba)
208 objrecdba |= (OR_D00 << 24);
210 /* Read the object record.
211 * Skip records with database read errors.
212 * Use d_crset instead of CRSET and d_recread
213 * instead of RECREAD to trap vista errors
216 d_crset (&objrecdba, vistano);
217 if (db_status != S_OKAY) {
219 if (debugging_boolsrch) {
221 PROGNAME"434 Invalid dba %ld. "
222 "recno=%ld bitvec[%d]=%02x db_status=%d.\n",
223 objrecdba, recno, recno>>3, 1<<(recno%8), db_status);
228 d_recread (&objrec, vistano);
229 if (db_status != S_OKAY)
231 swab_objrec (&objrec, NTOH);
236 /************************************************/
240 /************************************************/
241 /* Subroutine of boolean_search() initialization.
242 * Loads idf[] (inverse doc frequency) for each stem.
243 * IDF = 1.0 for a word that occurs in every record.
244 * For a word that occurs only once in entire database:
245 * NUM OF DB RECS IDF OF SINGULAR WORD
254 static void calculate_idfs (void)
260 for (i = 0; i < saveusr.stemcount; i++) {
261 if ( or_wordrecs[i].or_hwaddrs == 0 ||
262 or_wordrecs[i].or_hwordkey[0] == '@')
265 /* ln(2) = 0.693147181 */
266 dbl = (double) or_reccount / (double) or_wordrecs[i].or_hwaddrs;
267 idf[i] = log(dbl) / 0.693147181 + 1.0;
268 if (debugging_boolsrch)
270 PROGNAME"733 IDF[%d] numdocs=%5ld idf=%lf\n",
271 i, or_wordrecs[i].or_hwaddrs, idf[i]);
275 } /* calculate_idfs() */
278 /************************************************/
280 /* load_or_wordrecs */
282 /************************************************/
283 /* Subroutine of boolean_search() initialization.
284 * Loads or_wordrecs[] array with vista key file
285 * records for each term in saveusr.stems.
286 * Returns TRUE on success. Else returns FALSE with
287 * appropriate usrblk.retncode and user msgs on msglist.
289 static int load_or_wordrecs (void)
295 int colloc_count = 0;
296 int not_found_count = 0;
300 or_wordrecs = austext_malloc (
301 saveusr.stemcount * sizeof (struct or_hwordrec) + 16,
302 PROGNAME "782", NULL);
304 for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
306 wordrec = &or_wordrecs [stemno];
308 /* If this is a collocation term,
309 * save the two indexes and the collocation
310 * value in the wordrec buffer instead of usual
311 * offsets and dba counts.
313 if (saveusr.stems[stemno][0] == '@') {
314 strcpy (wordrec->or_hwordkey, saveusr.stems[stemno]);
315 sscanf (saveusr.stems[stemno], COLLOC_STEM_FORMAT, &i, &j, &k);
316 wordrec->or_hwoffset = i;
317 wordrec->or_hwfree = j;
318 wordrec->or_hwaddrs = k;
323 if (debugging_boolsrch)
324 fprintf (aa_stderr, PROGNAME"823 KEYFIND[%d] ", stemno);
325 find_keyword (saveusr.stems[stemno], vistano);
327 * If term is found, add it to the or_wordrecs[] array.
328 * But it is an error to include a word in more records
329 * than the max specified in site config file. This is
330 * meaningful for databases where certain common high
331 * frequency words slip by which should be on the stoplist.
332 * It's possible in huge databases to run out of memory
333 * assembling very long resultlists.
335 if (db_status == S_OKAY) {
336 strncpy (wordrec->or_hwordkey, saveusr.stems[stemno],
338 wordrec->or_hwordkey [DtSrMAXWIDTH_HWORD - 1] = 0;
339 read_wordstr (wordrec, vistano);
340 if (db_status != S_OKAY) {
341 /* Probable corrupted database. The btree
342 * read succeeded but the record read failed.
344 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 6,
345 "%s Database Error. Word '%s' is\n"
346 "listed in database '%s' but has no index record.") ,
347 PROGNAME"295", usrblk.stems[stemno], usrblk.dblk->label);
348 DtSearchAddMessage (msgbuf);
349 usrblk.retncode = OE_SYSTEM_STOP;
350 if (debugging_boolsrch)
352 "db error, db_status = %d.\n", db_status);
355 if (debugging_boolsrch)
356 fprintf (aa_stderr, "ofs=%ld addrs=%ld free=%ld\n",
357 wordrec->or_hwoffset,
360 if (wordrec->or_hwaddrs > OE_words_hitlimit) {
361 sprintf (msgbuf, catgets (dtsearch_catd, MS_boolsrch, 14,
362 "%s '%s' has more than %ld hits.\n"
363 "Please remove it from the query or raise the WHITLIM\n"
364 "value in the search engine configuration file."),
365 PROGNAME"1444", wordrec->or_hwordkey, OE_words_hitlimit);
366 DtSearchAddMessage (msgbuf);
367 /* Also log WHITLIM msg for administrator... */
368 fprintf (aa_stderr, "%s\n", msgbuf);
369 usrblk.retncode = OE_BAD_QUERY;
374 /* Only other possible nonfatal vista return is S_NOTFOUND.
375 * If qry_is_all_ANDs we can quit right now.
376 * Otherwise switch off all bits in the word's bit vector.
378 else if (qry_is_all_ANDs) {
379 if (debugging_boolsrch)
380 fputs ("not found, qry_all_ANDs, quit.\n", aa_stderr);
381 usrblk.retncode = OE_NOTAVAIL;
386 memset (wordrec, 0, sizeof(struct or_hwordrec));
387 if (debugging_boolsrch)
388 fputs ("not found, addrs-->0.\n", aa_stderr);
392 } /* end loop for each term in saveusr.stems[] */
394 /* It's a failure if all the user's words
395 * don't exist in database.
397 if (not_found_count + colloc_count >= saveusr.stemcount) {
398 usrblk.retncode = OE_NOTAVAIL;
403 } /* load_or_wordrecs() */
406 /****************************************/
410 /****************************************/
411 /* Subroutine of stuff_DtSrResult().
412 * Given d99recno, finds proxwt[] for record,
413 * calculates and returns integer proximity.
415 static int get_proximity (long recno)
417 long byteno = recno >> 3;
418 int bitmask = 1 << (recno % 8);
420 for (i = 0; i < proxwtct; i++)
421 if (proxwts[i].byteno == byteno && proxwts[i].bitmask == bitmask)
425 return proxwts[i].proximity;
426 } /* get_proximity() */
429 /****************************************/
431 /* stuff_DtSrResult */
433 /****************************************/
434 /* Subroutine of load_DtSrResults_WK().
435 * Loads passed DtSrResult structure with data from global objrec.
436 * Performs additional vista reads as necessary to get misc recs.
438 static void stuff_DtSrResult (
444 char *src, *targ, *targend;
445 static struct or_miscrec
448 new->objflags = objrec.or_objflags;
449 new->objuflags = objrec.or_objuflags;
450 new->objsize = objrec.or_objsize;
451 new->objdate = objrec.or_objdate;
452 new->objtype = objrec.or_objtype;
453 new->objcost = objrec.or_objcost;
455 new->dba = objrecdba;
456 new->language = or_language;
457 strncpy (new->reckey, objrec.or_objkey, DtSrMAX_DB_KEYSIZE);
459 new->proximity = get_proximity (recno);
461 /* The abstract immediately follows the fuzzy key
462 * in the FZKABS misc recs. It may span several recs.
464 new->abstractp = (char *) (new + 1);
465 if (or_abstrsz > 0) {
466 targ = new->abstractp;
467 targend = targ + or_abstrsz - 1;
468 fzkey_remaining = or_fzkeysz;
469 CRSET (PROGNAME"226", &objrecdba, vistano);
470 SETOR (PROGNAME"227", OR_OBJ_MISCS, saveusr.vistano);
471 FINDFM (PROGNAME"228", OR_OBJ_MISCS, saveusr.vistano);
473 while (db_status == S_OKAY) {
474 RECREAD (PROGNAME"2209", &miscrecbuf, saveusr.vistano);
475 NTOHS (miscrecbuf.or_misctype);
476 if (miscrecbuf.or_misctype == ORM_FZKABS) {
477 src = (char *) miscrecbuf.or_misc;
479 for (m = 0; m < sizeof(miscrecbuf.or_misc); m++) {
481 /* skip over the fzkey */
482 if (fzkey_remaining > 0) {
488 /* copy the abstract */
490 if (*src++ == 0 || targ++ >= targend) {
492 targ = targend; /* force outer loop end */
495 } /* end for-loop m */
496 } /* end (misctype == FZKABS) */
500 FINDNM (PROGNAME"545", OR_OBJ_MISCS, saveusr.vistano);
501 } /* end while-loop */
503 } /* endif: (or_abstrsz > 0) */
506 } /* stuff_DtSrResult() */
509 /****************************************/
511 /* load_DtSrResults_WK */
513 /****************************************/
514 /* Builds DtSrResult list for every record
515 * in result_bitvec, but not more than aa_maxhits.
517 static void load_DtSrResults_WK (void)
525 size_t resultsz = sizeof(DtSrResult) + or_abstrsz + 4;
527 if (got_USR_STOPSRCH())
530 DtSearchFreeResults (&resultlist);
534 /* Make a single pass through the final result_bitvec.
535 * For each nonzero bit, ie each database record
536 * that satisfies the query requirements,
537 * retrieve the record and push it onto the
538 * DtSrResult list. If not sorting records,
539 * stop when we reach the user's specified aa_maxhits count.
542 for (recno = 1; recno < tot_addr_count; recno++) {
543 byteno = recno >> 3; /* divide by 8 */
547 if ((result_bitvec[byteno] & (1 << bitno)) == 0)
550 if (!read_recno (recno))
553 /* Create new DtSrResult node, push it onto resultlist. */
554 resultp = austext_malloc (resultsz + 4, PROGNAME"466", NULL);
555 memset (resultp, 0, resultsz);
556 resultp->link = resultlist;
557 resultlist = resultp;
559 /* Load the new DtSrResult node from the object record */
560 stuff_DtSrResult (resultp, recno);
562 /* Check if any more reads are necessary.
563 * If not sorting, stop after aa_maxhits.
564 * If sorting, there won't be more than
565 * aa_maxhits recs in the bitvec anyway.
568 if (dittocount >= aa_maxhits)
571 } /* end bitvec loop */
574 /*--------- All Done. Clean up and return to caller. ---------*/
575 /*@@@@@@ make separate workproc call if aa_maxhits > 100.
576 @@@@@ sort may take a long time */
586 if (dittocount <= 0) {
587 usrblk.workproc = dummy_workproc;
588 usrblk.retncode = OE_NOTAVAIL;
592 usrblk.retncode = OE_OK;
593 usrblk.workproc = dummy_workproc;
595 usrblk.stemcount = saveusr.stemcount;
596 if (usrblk.search_type == 'W')
597 memcpy (usrblk.stems, saveusr.stems,
598 saveusr.stemcount * DtSrMAXWIDTH_HWORD);
600 /* Don't copy first char (ctrl-o) stem */
601 for (i = 0; i < saveusr.stemcount; i++)
602 strcpy (usrblk.stems[i], &saveusr.stems[i][1]);
605 DtSearchSortResults (&resultlist, DtSrSORT_PROX);
606 usrblk.dittocount = dittocount;
607 if (usrblk.dittolist)
608 DtSearchFreeResults (&usrblk.dittolist);
609 usrblk.dittolist = resultlist;
612 } /* load_DtSrResults_WK() */
615 /****************************************/
617 /* weights_filter_WK */
619 /****************************************/
620 /* This workproc is called only if we're doing statistical sorting.
621 * (1) It reduces the result_bitvec to it's final size,
622 * containing only the highest aa_maxhits statistical weights
624 * (2) It replaces (possibly large) wtvec with (probably much smaller)
625 * array of PROXWT structures containing the selected records'
626 * weights and calculated proximities, for final ranking sort.
629 static void weights_filter_WK (void)
634 int smallest, biggest;
636 long byteno, smallest_byteno;
637 int bitmask, smallest_bitmask;
639 if (got_USR_STOPSRCH())
642 /* Init weight filtering */
645 proxwtct = (result_count < aa_maxhits)? result_count : aa_maxhits;
646 proxwts = austext_malloc (proxwtct * sizeof(PROXWT) + 4,
647 PROGNAME"429", NULL);
648 memset (proxwts, 0, proxwtct * sizeof(PROXWT));
651 biggestwt = 0.0; /* biggest single wt of all docs */
653 /* One pass thru entire result_bitvec */
654 for (recno = 1; recno < tot_addr_count; recno++) {
656 bitmask = 1 << (recno % 8);
659 if ((result_bitvec[byteno] & bitmask) == 0)
662 /* Make scalefac = sum of squares of all wts in bitvec.
663 * It's possible that all or some of the weights are
664 * zero (eg queries like "~aaa" or "~aaa | bbb").
665 * In this case give them a very small positive number
666 * so we don't divide by zero later on.
668 if (wtvec[recno] == 0.0)
670 scalefac += (double) wtvec[recno] * (double) wtvec[recno];
673 * The following logic first fills up the proxwts table.
674 * After that if a bitvec's weight is larger than the smallest
675 * proxwt, replace the smallest proxwt with the new weight
676 * and switch off the previous smallest in the original bitvec.
680 * Just discard rec on bitvec if it's weight
681 * is smaller than the current smallest.
683 if (wtvec [recno] <= proxwts[smallest].wt) {
684 RESET_BIT (result_bitvec, byteno, bitmask);
689 * Else discard current smallest if
690 * table full, ie it really points to something.
692 if (proxwts[smallest].wt > 0.0) {
693 smallest_byteno = proxwts[smallest].byteno;
694 smallest_bitmask = proxwts[smallest].bitmask;
695 RESET_BIT (result_bitvec, smallest_byteno, smallest_bitmask);
699 /* Add this weight to the proxwts table. */
700 proxwts [smallest] .wt = wtvec [recno];
701 proxwts [smallest] .byteno = byteno;
702 proxwts [smallest] .bitmask = bitmask;
704 /* Keep track of the table entry that has
705 * the highest weight. This will eventually
706 * be the first sorted hit on the hitlist.
707 * It's weight/proximity will be used
708 * to scale the proximities of the
711 if (biggestwt < wtvec[recno]) {
712 biggestwt = wtvec[recno];
716 /* Find the next smallest */
718 for (i = 1; i < proxwtct; i++) {
719 if (proxwts[i].wt < proxwts[smallest].wt)
723 } /* end loop on every recno */
728 /* PROXIMITY CALCULATIONS.
729 * In order to translate statistical weight into an AusText
730 * proximity, basically you have to invert it, then scale it.
731 * The statistical weight is a similarity measure: the
732 * larger it is the more similar the document to the query.
733 * But AusText 'proximity' is like a 'distance' measure,
734 * the smaller the number the closer the document is to the query.
736 * First 'normalize' each document's statistical
737 * weight to be a fraction between 0 and 1. Done
738 * by calculating a normalization factor,
739 * the sqrt of the sum of squares of weights of all
740 * docs that would have qualified for the hitlist
741 * if we weren't truncating. Note cosine-based normalization
742 * factor (Pythagorean) always >= largest wt so we can
743 * guarantee all normalized weights are > 0.0 and <= 1.0.
745 * The proximity itself is calculated as the 'percent value'
746 * that the doc is 'distant' from perfection (1.0 or 100%).
747 * For example, if the normalized weight of the first record
748 * is .931 then it's proximity will be 7 (100% - 93% = 7).
750 * The proximity of every other hit is scaled away
751 * from the first because the normalization algorithm
752 * tends to clump proximities when there are a lot of hits.
753 * Specifically the proximity of every hit is a constant
754 * scale factor (derived from the first proximity),
755 * divided by it's weight.
757 * A "bulls eye" (normalized weight = 1.0, proximity == 0)
758 * for the first hit is not allowed so scale factor will
759 * not also be zero. Otherwise *all* hits in that particular
760 * results list would be bulls eyes.
762 scalefac = (double) biggestwt / sqrt (scalefac);
763 /* normalized weight of first hit */
764 scalefac = (1.0 - scalefac) * 100.0;
765 /* proximity of first hit */
769 scalefac *= (double) biggestwt * 1.2;
770 /* scale factor for other hits */
771 for (i = 0; i < proxwtct; i++) {
772 proxwts[i].proximity = (int) (scalefac / (double) proxwts[i].wt);
773 if (proxwts[i].proximity > 9999)
774 proxwts[i].proximity = 9999;
777 if (debugging_boolsrch) {
779 PROGNAME"489 FINAL PROXWTS proxwtct=%d bigwt=%.2f scalefac=%.2lf\n",
780 proxwtct, biggestwt, scalefac);
781 for (i=0; i<10; i++) {
785 " byteno=%3ld bitmask=%02x wt=%.2f prox=%d\n",
786 proxwts[i].byteno, proxwts[i].bitmask,
787 proxwts[i].wt, proxwts[i].proximity);
789 fprintf (aa_stderr, PROGNAME"499 WEIGHT RESULTS resultct=%ld bv=\n",
791 for (i=0; i<22; i++) {
794 fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
796 fputc ('\n', aa_stderr);
800 usrblk.retncode = OE_SEARCHING;
801 usrblk.workproc = load_DtSrResults_WK;
803 } /* weights_filter_WK() */
806 /****************************************/
808 /* dbread_filter_WK */
810 /****************************************/
811 /* Called if we must remove documents from result_bitvec
812 * because of keytype or date,
814 static void dbread_filter_WK (void)
821 if (got_USR_STOPSRCH())
823 if (debugging_boolsrch) {
825 fputs (PROGNAME"865 DBREAD discards (k=keytype d=date):\n", aa_stderr);
829 /* One pass thru entire result_bitvec */
830 for (recno = 1; recno < tot_addr_count; recno++) {
832 bitmask = 1 << (recno % 8);
834 if ((result_bitvec[byteno] & bitmask) == 0)
837 if (!read_recno (recno))
840 /* Skip undesired record types */
841 if (!all_key_types) {
842 if (strchr (saveusr.ktchars, objrec.or_objkey[0]) == NULL) {
843 RESET_BIT (result_bitvec, byteno, bitmask);
845 if (debugging_boolsrch) {
847 fputc ('k', aa_stderr);
854 /* Skip record if out of date range */
856 if (!objdate_in_range (objrec.or_objdate,
857 usrblk.objdate1, usrblk.objdate2)) {
858 RESET_BIT (result_bitvec, byteno, bitmask);
860 if (debugging_boolsrch) {
862 fputc ('d', aa_stderr);
869 } /* end loop on every recno */
871 if (debugging_boolsrch) {
874 fputc ('\n', aa_stderr);
876 PROGNAME"857 DBREAD RESULTS discards=%ld resultct=%ld bv=\n",
877 discards, result_count);
878 for (i=0; i<22; i++) {
881 fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
883 fputc ('\n', aa_stderr);
887 /* Determine next workproc.
888 * (1) If no records survived the read db filter,
889 * we're done, return 'no hits'.
890 * (2) If we're sorting, the next workproc reduces the
891 * bitvec to the aa_maxhits recs with the highest
892 * statistical weights.
893 * (3) Otherwise the next workproc just loads the hitlist.
895 if (result_count <= 0) {
896 usrblk.retncode = OE_NOTAVAIL;
897 usrblk.workproc = dummy_workproc;
899 else if (do_stat_sort) {
900 usrblk.retncode = OE_SEARCHING;
901 usrblk.workproc = weights_filter_WK;
904 if (debugging_boolsrch)
906 PROGNAME"931 No sorting by statistical weights.\n");
907 usrblk.retncode = OE_SEARCHING;
908 usrblk.workproc = load_DtSrResults_WK;
911 } /* dbread_filter_WK() */
914 /****************************************/
916 /* calc_result_bitvec_WK */
918 /****************************************/
919 /* Second workproc after read_stem_bitvec_WK().
920 * If possible, minimizes size of truth table permutes,
921 * then applies them to stem bitvecs to create result_bitvec.
923 static void calc_result_bitvec_WK (void)
930 UCHAR my_permutes [256];
934 if (got_USR_STOPSRCH())
937 /* If there are fewer than a full complement of stems,
938 * minimize size of truth table by discarding
939 * permutes that refer to unused stems.
941 if (saveusr.stemcount < DtSrMAX_STEMCOUNT) {
942 /* Set high order bits of mask to mark unused stem positions */
944 for (i = 0; i < saveusr.stemcount; i++)
948 /* 'cpm' is a candidate permute */
950 for (cpm = 0; cpm < 256; cpm++) {
952 * Discard candidate if it refers to an unused stem.
957 * Otherwise if candidate is in final_truthtab, keep it.
959 for (i = 0; i < final_truthtab.pmsz; i++) {
960 if (final_truthtab.permutes[i] == (UCHAR) cpm) {
961 my_permutes [my_pmsz] = (UCHAR) cpm;
966 if (debugging_boolsrch) {
968 PROGNAME"565 Minimize truth table, pmsz=%d-->%d\n permutes=",
969 final_truthtab.pmsz, my_pmsz);
970 for (i=0; i<16; i++) {
973 fprintf (aa_stderr, " %02x", (int) my_permutes [i]);
975 fputc ('\n', aa_stderr);
978 final_truthtab.permutes = my_permutes;
979 final_truthtab.pmsz = my_pmsz;
980 } /* end minimize of permutes */
982 /* Calculate result bit vector.
983 * Loop 1 is a single pass through the bit vectors
984 * (a bit loop inside a byte loop).
985 * For each nonzero bit, ie each database record
986 * that has at least one of the query terms in it,
987 * build a 'permute' equivalent to the boolean
988 * representation of the terms in that record (Loop 2).
989 * Then search the truth table permutes for a match (Loop 3).
990 * If found, set the record's bit in the result_bitvec.
993 /* LOOP 1. For each database addr... */
995 for (byteno = 0; byteno < bitveclen; byteno++) {
996 for (bitno = 0; bitno < 8; bitno++) {
999 /* LOOP 2. Build permute for each query term. */
1001 for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
1002 if (bitvecs [stemno] [byteno] & (UCHAR) mask)
1003 permute |= 1 << stemno;
1006 /* LOOP 3. Search truth table for matching permute. */
1007 for (i = 0; i < final_truthtab.pmsz; i++) {
1008 if (final_truthtab.permutes[i] == permute) {
1009 result_bitvec [byteno] |= (UCHAR) mask;
1016 if (debugging_boolsrch) {
1017 fprintf (aa_stderr, PROGNAME"621 PRELIM RESULTS resultct=%ld bv=\n",
1019 for (i=0; i<22; i++) {
1022 fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
1024 fputc ('\n', aa_stderr);
1028 /* The next workprocs are 'filters', reducing the size
1029 * of result_bitvec by removing various unwanted records.
1030 * They're done in the following order:
1031 * (1) If no records survived the truth table manipulations,
1032 * we're done, return 'no hits'.
1033 * (2) If we must remove documents because of keytype or date,
1034 * the next workproc is the filter that reads the database.
1035 * (3) If we're sorting, the next workproc reduces the
1036 * bitvec to the aa_maxhits recs with the highest
1037 * statistical weights.
1038 * (4) Otherwise the next workproc just loads the hitlist.
1040 if (result_count <= 0) {
1041 usrblk.retncode = OE_NOTAVAIL;
1042 usrblk.workproc = dummy_workproc;
1044 else if (!all_key_types || check_dates) {
1045 usrblk.retncode = OE_SEARCHING;
1046 usrblk.workproc = dbread_filter_WK;
1048 else if (do_stat_sort) {
1049 if (debugging_boolsrch)
1051 PROGNAME"948 No db reads necessary for date or keytype.\n");
1052 usrblk.retncode = OE_SEARCHING;
1053 usrblk.workproc = weights_filter_WK;
1056 if (debugging_boolsrch)
1058 PROGNAME"625 No filtering: no sort and no db reads.\n");
1059 usrblk.retncode = OE_SEARCHING;
1060 usrblk.workproc = load_DtSrResults_WK;
1063 } /* calc_result_bitvec_WK() */
1066 /****************************************/
1070 /****************************************/
1071 /* Subroutine of read_stem_bitvec_WK().
1072 * Repeatedly called to get each d99dba in the inverted index
1073 * file (d99) for a specific index term. The first call passes
1074 * the term's wordrec with d99 offset and size information.
1075 * Subsequent calls pass NULL.
1076 * Returns valid d99dba, or 0 at end of term's index, or -1 on error.
1077 * Actual reads are performed a disk block at a time,
1078 * with dbas stored in a static buffer for the next call.
1080 static DB_ADDR read_d99 (struct or_hwordrec *wordrec)
1082 static DB_ADDR readbuf [DBAS_PER_BLOCK];
1083 static DB_ADDR *bufptr, *endbuf;
1085 static long bal_read, request_read, actual_read;
1087 /* First call for new term */
1089 fptr = usrblk.dblk->iifile;
1090 fseek (fptr, wordrec->or_hwoffset, SEEK_SET);
1091 bal_read = wordrec->or_hwaddrs;
1092 bufptr = endbuf = 0; /* triggers block read */
1095 /* Time to read another block */
1096 if (bufptr >= endbuf) {
1099 if (bal_read > DBAS_PER_BLOCK) {
1100 request_read = DBAS_PER_BLOCK;
1101 bal_read -= DBAS_PER_BLOCK;
1102 endbuf = readbuf + DBAS_PER_BLOCK;
1105 /* last block is usually short */
1106 request_read = bal_read;
1108 endbuf = readbuf + request_read;
1110 if (fread (readbuf, sizeof(DB_ADDR), request_read, fptr)
1112 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 28,
1113 "%s Database Read Error in %s.d99.") ,
1114 PROGNAME"428", usrblk.dblk->name);
1115 DtSearchAddMessage (msgbuf);
1121 /******return *bufptr++;*******/
1122 return ntohl (*bufptr++);
1126 /****************************************/
1128 /* get_colloc_bitvec */
1130 /****************************************/
1131 /* Subroutine of read_stem_bitvec_WK().
1132 * Constructs a 'collocation bitvector' for current save_stemno.
1133 * A collocation expression requests the return of all records
1134 * containing both of two terms (a kind of boolean AND) such that
1135 * the occurrences are within n characters of each other.
1136 * For example "ICE @5 CREAM" requests the return of all records
1137 * containing both "ICE" and "CREAM" but only if they are separated
1138 * by no more than 5 characters.
1140 * Since offset information is not stored in the inverted index
1141 * this module initially returns the intersection of the two words'
1142 * bit vectors (boolean AND). Then it retrieves each record,
1143 * builds an offset (hilites) table for each of the two words,
1144 * then compares the offset differences in the tables.
1145 * If no occurrence pairs are within the specified separation
1146 * range, the record is deleted from the bitvector.
1147 * Returns 0 if successful, otherwise returns -1 and msgs.
1148 @@@@ rewrite as its own workproc--reading/hiliting can take a long time...
1150 static int get_colloc_bitvec (void)
1152 int stemno_A = or_wordrecs[save_stemno].or_hwoffset;
1153 int stemno_B = or_wordrecs[save_stemno].or_hwfree;
1154 long range = or_wordrecs[save_stemno].or_hwaddrs;
1155 UCHAR *bitvec_A = bitvecs [stemno_A];
1156 UCHAR *bitvec_B = bitvecs [stemno_B];
1157 UCHAR *bitvec_C = bitvecs [save_stemno];
1163 DtSrHitword *hitwords_A, *hitwords_B;
1164 long hitwcount_A, hitwcount_B;
1165 long threshold_range;
1168 long a, b, offset_A, offset_B;
1170 /* First construct the set intersection (AND) of
1171 * each of the collocated terms in the colloc bitvec.
1173 for (byteno = 0; byteno < bitveclen; byteno++)
1174 bitvec_C [byteno] = bitvec_A [byteno] & bitvec_B [byteno];
1175 if (debugging_boolsrch) {
1178 PROGNAME"312 INTERSECT[%d] (colloc %d & %d):\n",
1179 save_stemno, stemno_A, stemno_B);
1180 for (i=0; i<bitveclen; i++) {
1181 fprintf (aa_stderr, " %02x", bitvec_C[i]);
1185 fputc ('\n', aa_stderr);
1189 /* Read cleartext for each rec in intersection/colloc bitvec.
1190 * Get hitwords (hilite table) for each collocation term.
1191 * Switch off recs in bitvec where no term pairs are in
1192 * collocation range.
1194 for (recno = 1; recno < tot_addr_count; recno++) {
1195 byteno = recno >> 3; /* divide by 8 */
1196 bitmask = 1 << (recno % 8);
1198 /* Skip zero bits */
1199 if ((bitvec_C[byteno] & bitmask) == 0)
1202 /* Convert recno to vista database address.
1203 * Silently skip rec if dba doesn't exist.
1205 dba = (recno - 1) * or_recslots + 2;
1206 if (dba >= or_maxdba) {
1207 RESET_BIT (bitvec_C, byteno, bitmask);
1210 dba |= (OR_D00 << 24);
1212 /* Silently skip records that have no document text */
1213 if ((bloblist = ve_getblobs (dba, vistano)) == NULL) {
1214 if (debugging_boolsrch) {
1216 PROGNAME"126 No blobs for recno=%ld byteno=%ld mask%02x\n",
1217 recno, byteno, bitmask);
1220 RESET_BIT (bitvec_C, byteno, bitmask);
1224 /* Uncompress record text into usrblk.cleartext */
1225 if (oe_unblob (bloblist) != OE_OK)
1228 /* Build 'hilite' table for stem A. If stem
1229 * can't be found in the record, silently skip it.
1230 * Otherwise save the table.
1232 stemp = saveusr.stems [stemno_A];
1233 if (stemp[0] == STEM_CH) {
1239 if (!hilite_cleartext (parse_type, stemp, 1)) {
1240 RESET_BIT (bitvec_C, byteno, bitmask);
1243 hitwords_A = usrblk.hitwords;
1244 hitwcount_A = usrblk.hitwcount;
1245 usrblk.hitwords = NULL;
1246 usrblk.hitwcount = 0;
1248 /* In the same way build 'hilite' table for stem B */
1249 stemp = saveusr.stems [stemno_B];
1250 if (stemp[0] == STEM_CH) {
1256 if (!hilite_cleartext (parse_type, stemp, 1)) {
1257 RESET_BIT (bitvec_C, byteno, bitmask);
1261 hitwords_B = usrblk.hitwords;
1262 hitwcount_B = usrblk.hitwcount;
1263 usrblk.hitwords = NULL;
1264 usrblk.hitwcount = 0;
1266 /* Compare the two hilite tables for range matches */
1267 got_a_colloc = FALSE;
1269 for (a = 0; a < hitwcount_A; a++) {
1270 offset_A = hitwords_A[a].offset;
1271 threshold_range = offset_A + hitwords_A[a].length + range;
1272 for (; b < hitwcount_B; b++) {
1273 offset_B = hitwords_B[b].offset;
1275 /* Advance B to first entry past A's offset */
1276 if (offset_B <= offset_A )
1277 continue; /* ...the B loop */
1278 if (offset_B <= threshold_range)
1279 got_a_colloc = TRUE;
1280 break; /* ...the B loop */
1282 if (got_a_colloc || b >= hitwcount_B)
1283 break; /* ...the A loop */
1288 /* If no collocations found within range,
1289 * switch off rec in colloc bitvec.
1292 RESET_BIT (bitvec_C, byteno, bitmask);
1294 } /* end loop on each recno in intersection/colloc bitvec */
1297 } /* get_colloc_bitvec() */
1300 /****************************************/
1302 /* read_stem_bitvec_WK */
1304 /****************************************/
1305 /* First workproc after boolean_search().
1306 * Each iterative call loads one (save_stemno) real stem's bitvec.
1307 * After last stem bitvec loaded, sets up
1308 * call to next workproc in sequence.
1310 static void read_stem_bitvec_WK (void)
1316 if (got_USR_STOPSRCH())
1319 /* Process collocation 'stems' */
1320 if (saveusr.stems [save_stemno] [0] == '@') {
1321 d99recno = get_colloc_bitvec();
1325 for ( d99recno = read_d99 (&or_wordrecs [save_stemno]);
1327 d99recno = read_d99 (NULL)) {
1328 if (d99recno == -1) /* read error */
1331 /* Save low byte 'statistical weight' value.
1332 * It can only be 0 - 255.
1335 weight = (float) (d99recno & 0x000000ff) + 1.0;
1337 d99recno = (d99recno >> 8) & 0x00ffffff;
1339 /* Set correct bit in bitvec.
1340 * The byte number is the recno divided by 8.
1341 * The bit number is the remainder after division by 8.
1343 if ((byteno = d99recno >> 3) >= bitveclen) {
1344 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 32,
1345 "%s Database Error: %s '%s'\n"
1346 "in database '%s' has invalid d99 record number %ld.") ,
1348 (usrblk.search_type == 'W') ?
1349 catgets(dtsearch_catd, MS_boolsrch, 33, "Word") :
1350 catgets(dtsearch_catd, MS_boolsrch, 34, "Stem of"),
1351 usrblk.stems [save_stemno],
1354 DtSearchAddMessage (msgbuf);
1355 d99recno = -1; /* force error return */
1358 bitvecs [save_stemno] [byteno] |= 1 << (d99recno % 8);
1360 /* Add to correct weight in weight vector.
1361 * IDF ranges between 1.0 and 20.0, and weight
1362 * is 1 - 256, so we're adding 1 - ~5000 to wtvec.
1365 wtvec [d99recno] += weight * (float) idf [save_stemno];
1367 } /* end loop that retrieves every d99recno for curr stem */
1371 if (debugging_boolsrch) {
1373 if (debugging_boolsrch)
1374 fprintf (aa_stderr, PROGNAME"313 BITVEC[%ld]:\n", save_stemno);
1375 for (i=0; i<bitveclen; i++) {
1376 fprintf (aa_stderr, " %02x", bitvecs[save_stemno][i]);
1380 fputc ('\n', aa_stderr);
1384 if (d99recno == 0) {
1385 /* Normal conclusion. Increment to next stem.
1386 * If not all stems have been read,
1387 * this is still the next workproc.
1388 * Otherwise the next workproc is the one
1389 * merging all bitvectors into the final
1390 * result bitvec using the truth table.
1392 usrblk.retncode = OE_SEARCHING;
1393 if (++save_stemno < saveusr.stemcount)
1394 usrblk.workproc = read_stem_bitvec_WK;
1396 usrblk.workproc = calc_result_bitvec_WK;
1399 /* d99recno must be -1 */
1400 usrblk.retncode = OE_SYSTEM_STOP;
1402 } /* read_stem_bitvec_WK() */
1405 /****************************************/
1407 /* boolean_search */
1409 /****************************************/
1410 /* Called from Opera_Engine after successful boolean_parse().
1411 * Expects valid globals: saveusr.stems, saveusr.stemcount,
1412 * usrblk.stems (contains original unstemmed query terms for msgs),
1413 * usrblk.search_type, final_truthtab, qry_has_no_NOTs,
1414 * and qry_is_all_ANDs.
1415 * Based on parts of the function ve_word_search().
1416 * Upon return, usrblk.retncode, msglist, etc is appropriately loaded.
1417 * Upon successful return usrblk.stems, usrblk.stemcount,
1418 * and dittolist are also loaded.
1420 void boolean_search (void)
1423 size_t allocsz_needed;
1426 if ( saveusr.stemcount <= 0 ||
1427 final_truthtab.pmsz <= 0 ||
1428 final_truthtab.pmsz >= 256 ) {
1429 fprintf (aa_stderr, catgets(dtsearch_catd, MS_boolsrch, 35,
1430 "%s Program Error: stemct=%d pmsz=%d\n") ,
1431 PROGNAME"1404", saveusr.stemcount, final_truthtab.pmsz);
1435 /*---------- Init globals ----------*/
1437 msgbuf = austext_malloc (500, PROGNAME"393", NULL);
1438 debugging_boolsrch = (usrblk.debug & USRDBG_SRCHCMPL);
1439 need_zero_permute = (final_truthtab.permutes[0] == 0);
1440 do_stat_sort = ((usrblk.flags & USR_SORT_WHITL) != 0);
1441 check_dates = (usrblk.objdate1 || usrblk.objdate2);
1442 or_abstrsz = usrblk.dblk->dbrec.or_abstrsz;
1443 or_fzkeysz = usrblk.dblk->dbrec.or_fzkeysz;
1444 or_language = usrblk.dblk->dbrec.or_language;
1445 or_maxdba = usrblk.dblk->dbrec.or_maxdba;
1446 usrblk.flags &= ~USR_STOPSRCH; /* turn off stop button */
1448 saveusr.vistano = vistano = usrblk.dblk->vistano;
1449 saveusr.dittolist = NULL;
1450 saveusr.dittocount = 0L;
1451 saveusr.iterations = INIT_ITERATIONS;
1453 * saveusr.ktchars is a string holding
1454 * first char of desired record ids.
1456 all_key_types = TRUE;
1457 for (i = 0, j = 0; i < usrblk.dblk->ktcount; i++) {
1458 if (usrblk.dblk->keytypes[i].is_selected)
1459 saveusr.ktchars[j++] = usrblk.dblk->keytypes[i].ktchar;
1461 all_key_types = FALSE;
1463 saveusr.ktchars[j] = '\0';
1465 or_recslots = (long) (usrblk.dblk->dbrec.or_recslots);
1466 or_reccount = usrblk.dblk->dbrec.or_reccount;
1468 /* RECFRST is just to get the slot# (dba) of the
1469 * first real object record after the dbrec.
1470 * Currently the dbrec occupies only one slot,
1471 * the first (#1), so dba will usually be #2.
1474 RECFRST(PROGNAME"2545", OR_OBJREC, saveusr.vistano);
1475 CRGET(PROGNAME"2546", &dba, saveusr.vistano);
1478 tot_addr_count = ((usrblk.dblk->dbrec.or_maxdba + 1) / or_recslots) + 1;
1479 bitveclen = (tot_addr_count >> 3) + 1;
1481 if (debugging_boolsrch) {
1482 fprintf (aa_stderr, PROGNAME"360 "
1483 "boolean_search: typ='%c' needzpm?=%d sort?=%d maxhits=%d\n"
1484 " maxdba=%ld recct=%ld recslts=%ld\n"
1485 " totnmadr=%ld bvln=%ld allkts?=%d ktchars='%s'\n"
1490 ,usrblk.dblk->dbrec.or_maxdba
1502 /*---------- Read vista btree ----------
1503 * Load or_wordrecs[] array for each term in saveusr.stems.
1505 if (!load_or_wordrecs())
1508 /* If statistically sorting final resultlist, calculate
1509 * idf (inverse document frequency) for each term using
1510 * the frequency data in or_wordrecs[].
1515 /* Bitvector allocation. Number needed is one for each stem,
1516 * plus one extra to accumulate the result bitvector.
1518 allocsz_needed = bitveclen * (saveusr.stemcount + 1);
1519 if (debugging_boolsrch)
1520 fprintf (aa_stderr, PROGNAME"430 "
1521 "bitvecs[] alloc needed=%ld (bvln=%ld stems=%d+1), have=%ld.\n",
1522 allocsz_needed, bitveclen, saveusr.stemcount, bitvec_allocsz);
1523 if (bitvec_allocsz < allocsz_needed) {
1525 free (bitvec_allocp);
1526 bitvec_allocp = austext_malloc (allocsz_needed + 16,
1527 PROGNAME"508", NULL);
1528 if (debugging_boolsrch)
1529 fprintf (aa_stderr, PROGNAME"432 bitvecs[] realloc %ld-->%ld.\n",
1530 bitvec_allocsz, allocsz_needed);
1531 bitvec_allocsz = allocsz_needed;
1534 /* Clear all bitvecs to zero and assign them */
1535 memset (bitvec_allocp, 0, allocsz_needed);
1536 for (i = 0; i < saveusr.stemcount; i++)
1537 bitvecs[i] = bitvec_allocp + (i * bitveclen);
1538 result_bitvec = bitvec_allocp + (i * bitveclen);
1540 /* If sorting statistically, allocate weight vector.
1541 * One float for each db record.
1548 wtvec = austext_malloc ((tot_addr_count + 4) * sizeof(float) + 4,
1549 PROGNAME"040", NULL);
1550 memset (wtvec, 0, (tot_addr_count + 4) * sizeof(float));
1553 /* The 'zero permute' is every record that has
1554 * NONE of the query terms in it. It can only be
1555 * generated if a NOT operator was included in the query.
1557 if (need_zero_permute) {
1558 sprintf (msgbuf, catgets (dtsearch_catd, MS_boolsrch, 15,
1559 "%s Your query requires retrieving every\n"
1560 "document in the database that does not have any of\n"
1561 "your query words. This type of search may take an\n"
1562 "unusually long time."),
1564 DtSearchAddMessage (msgbuf);
1567 if (debugging_boolsrch)
1570 /* Searches may take a long time. To allow gui to put a
1571 * a 'working' dialog box and a 'cancel' button,
1572 * we pass execution to workprocs.
1573 * If user cannot cancel search no matter how
1574 * long it may take, we call each of the subsequent
1575 * workproc functions directly from here.
1576 * Otherwise they will themselves setup each
1577 * subsequent call to usrblk.workproc(), as long as
1578 * the previous call returns OE_SEARCHING and the user
1579 * hasn't pushed USR_STOPSRCH.
1581 usrblk.workproc = read_stem_bitvec_WK;
1582 save_stemno = 0; /* global arg for first workproc */
1583 usrblk.workproc(); /* direct call to first workproc */
1585 if ((usrblk.flags & USR_NO_ITERATE) != 0 &&
1586 (usrblk.debug & USRDBG_ITERATE) == 0) {
1587 while (usrblk.retncode == OE_SEARCHING)
1592 } /* boolean_search() */
1594 /************************** BOOLSRCH.C **********************/