cde/lib/DtSearch/boolsrch.c

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these librararies and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /* $XConsortium: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
  24  *
  25  * (c) Copyright 1996 Digital Equipment Corporation.
  26  * (c) Copyright 1996 Hewlett-Packard Company.
  27  * (c) Copyright 1996 International Business Machines Corp.
  28  * (c) Copyright 1996 Sun Microsystems, Inc.
  29  * (c) Copyright 1996 Novell, Inc.
  30  * (c) Copyright 1996 FUJITSU LIMITED.
  31  * (c) Copyright 1996 Hitachi.
  32  */
  33 /*
  34  *   COMPONENT_NAME: austext
  35  *
  36  *   FUNCTIONS: boolean_search
  37  *              calc_result_bitvec_WK
  38  *              calculate_idfs
  39  *              dbread_filter_WK
  40  *              get_proximity
  41  *              got_USR_STOPSRCH
  42  *              load_DtSrResults_WK
  43  *              load_or_wordrecs
  44  *              read_d99
  45  *              read_recno
  46  *              read_stem_bitvec_WK
  47  *              stuff_DtSrResult
  48  *              weights_filter_WK
  49  *
  50  *   ORIGINS: 27
  51  *
  52  *
  53  *   (C) COPYRIGHT International Business Machines Corp. 1996
  54  *   All Rights Reserved
  55  *   Licensed Materials - Property of IBM
  56  *   US Government Users Restricted Rights - Use, duplication or
  57  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  58  */
  59 /********************* BOOLSRCH.C **********************
  60  * $Id: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
  61  * February 1996.
  62  * The vista code from the original vewords.c.
  63  * Given a final truth table and stems array from the user's boolean
  64  * query (output of boolean_search()), find all database records
  65  * containing the truth table's set operations and return
  66  * their database addresses in a resultlist.
  67  * See boolpars.h for format and limitations of TRUTHTAB.
  68  *
  69  *-------------- D99DBA TO DBA CONVERSION ----------------
  70  * 'd99dbas' are not real vista dbas!  They were modified
  71  * as follows to permit shorter bit vectors,
  72  * and to minimize bit shifts at search time.
  73  *     vista_dba   <-  (OR_D00 << 24) | vista_slot
  74  *     vista_slot  <-  ((d99recno - 1) * or_recslots) + 2
  75  *     d99dba      <-  (d99recno << 8) | weight_byte
  76  *     d99recno    <-  ((vista_slot - 2) / or_recslots) + 1
  77  * The d99 and bitvec recno of the first rec is 1.
  78  * The slotno (vista dba) of the first rec is 2
  79  * (dbrec occupies first slot and vista slots begin at 1).
  80  *
  81  * $Log$
  82  * Revision 1.5  1996/03/20  19:21:49  miker
  83  * Completed collocations code.  Restored get_colloc_bitvec() from colloc.c.
  84  *
  85  * Revision 1.4  1996/03/18  22:06:24  miker
  86  * Bug fix.  Zero permute NOT queries always returned no hits.
  87  *
  88  * Revision 1.3  1996/03/13  23:05:24  miker
  89  * Change long double constant to regular float for better portability.
  90  *
  91  * Revision 1.2  1996/03/13  22:36:37  miker
  92  * Changed char to UCHAR several places; similar typecasts.
  93  * Moved collocations processing to colloc.c.
  94  *
  95  * Revision 1.1  1996/03/05  15:52:06  miker
  96  * Initial revision
  97  */
  98 /***#define _ALL_SOURCE****/     /* to pickup typedefs for shm vnodes */
  99 #include "SearchE.h"
 100 #include <string.h>
 101 #include <stdio.h>
 102 #include <stdlib.h>
 103 #include <math.h>
 104 #include "vista.h"
 105 #include "boolpars.h"
 106
 107 #define PROGNAME        "BOOLSRCH"
 108 #define INIT_ITERATIONS 50
 109 #define MS_boolsrch     16
 110 /*
 111  * DBAS_PER_BLOCK is the max number of dbas to be read
 112  * from d99 file.  Note DBAS_PER_BLOCK * sizeof(DB_ADDR) = 512 bytes,
 113  * the standard blksize of one hard disk block.
 114  */
 115 #define DBAS_PER_BLOCK  128
 116
 117 #define RESET_BIT(bv, by, bm)   bv[by] &= (UCHAR) ~bm
 118
 119 #if (DtSrMAX_STEMCOUNT != 8)
 120 #error DtSrMAX_STEMCOUNT does not equal 8.
 121 #endif
 122
 123 /****************************************/
 124 /*                                      */
 125 /*                PROXWT                */
 126 /*                                      */
 127 /****************************************/
 128 typedef struct {
 129     float       wt;
 130     long        byteno;
 131     int         bitmask;
 132     int         proximity;
 133     }   PROXWT;
 134
 135 /****************************************/
 136 /*                                      */
 137 /*                GLOBALS               */
 138 /*                                      */
 139 /****************************************/
 140 int                     debugging_boolsrch =    FALSE;
 141
 142 static int              all_key_types =         TRUE;
 143 static UCHAR            *bitvec_allocp =        NULL;
 144 static size_t           bitvec_allocsz =        0;
 145 static long             bitveclen;      /* 1/8 of tot_addr_count */
 146 static UCHAR            *bitvecs [DtSrMAX_STEMCOUNT];
 147 static int              check_dates =           FALSE;
 148 static int              do_stat_sort =          FALSE;
 149 static double           idf [DtSrMAX_STEMCOUNT];
 150 static char             *msgbuf =               NULL;
 151 static int              need_zero_permute =     FALSE;
 152 static struct or_objrec objrec;
 153 static DB_ADDR          objrecdba;
 154 static int              or_abstrsz =            0;
 155 static int              or_fzkeysz =            0;
 156 static short            or_language =           DtSrLaENG;
 157 static long             or_maxdba;      /* largest dba in database */
 158 static long             or_reccount;    /* tot num db obj (real_num_rec) */
 159 static long             or_recslots;    /* D00 slots per obj (slot_d00) */
 160 static struct or_hwordrec
 161                         *or_wordrecs =          NULL;
 162 static PROXWT           *proxwts =              NULL;
 163 static int              proxwtct;
 164 static UCHAR            *result_bitvec;
 165 static long             result_count =          0;
 166 static DtSrResult       *resultlist =           NULL;
 167 static int              save_stemno =           0;
 168 static long             tot_addr_count; /* may be > reccount bcs deletes */
 169 static int              vistano;
 170 static float            *wtvec =                NULL;
 171
 172
 173 /************************************************/
 174 /*                                              */
 175 /*              got_USR_STOPSRCH                */
 176 /*                                              */
 177 /************************************************/
 178 /* Called at beginning of every workproc.
 179  * Returns TRUE if user pushed STOP SEARCH button,
 180  * else FALSE.
 181  */
 182 static int      got_USR_STOPSRCH (void)
 183 {
 184     if ((usrblk.flags & USR_STOPSRCH) == 0)
 185         return FALSE;
 186     if (OE_flags & OE_AUDIT)
 187         oe_write_audit_rec (-1L);
 188     usrblk.retncode = OE_USER_STOP;
 189     return TRUE;
 190 }
 191
 192 /****************************************/
 193 /*                                      */
 194 /*             read_recno               */
 195 /*                                      */
 196 /****************************************/
 197 /* Utility function.
 198  * Reads a database record given a d99 record number.
 199  * Returns TRUE and loads globals objrec and objrecdba
 200  * on success, else returns FALSE.
 201  */
 202 static int      read_recno (long recno)
 203 {
 204     /* Convert recno to a real dba */
 205     objrecdba = (recno - 1) * or_recslots + 2;
 206     if (objrecdba >= or_maxdba)
 207             return FALSE;
 208     objrecdba |= (OR_D00 << 24);
 209
 210     /* Read the object record.
 211      * Skip records with database read errors.
 212      * Use d_crset instead of CRSET and d_recread
 213      * instead of RECREAD to trap vista errors
 214      * without aborting.
 215      */
 216     d_crset (&objrecdba, vistano);
 217     if (db_status != S_OKAY) {
 218 BAD_DBA:
 219         if (debugging_boolsrch) {
 220             fprintf (aa_stderr,
 221                 PROGNAME"434 Invalid dba %ld.  "
 222                 "recno=%ld bitvec[%d]=%02x  db_status=%d.\n",
 223                 objrecdba, recno, recno>>3, 1<<(recno%8), db_status);
 224             fflush (aa_stderr);
 225         }
 226         return FALSE;
 227     }
 228     d_recread (&objrec, vistano);
 229     if (db_status != S_OKAY)
 230         goto BAD_DBA;
 231     swab_objrec (&objrec, NTOH);
 232     return TRUE;
 233 } /* read_recno() */
 234
 235
 236 /************************************************/
 237 /*                                              */
 238 /*               calculate_idfs                 */
 239 /*                                              */
 240 /************************************************/
 241 /* Subroutine of boolean_search() initialization.
 242  * Loads idf[] (inverse doc frequency) for each stem.
 243  * IDF = 1.0 for a word that occurs in every record.
 244  * For a word that occurs only once in entire database:
 245  *  NUM OF DB RECS    IDF OF SINGULAR WORD
 246  *              10       4.32
 247  *             100       7.64
 248  *           1,000      10.97
 249  *          10,000      14.29
 250  *         100,000      17.61
 251  *       1,000,000      20.93
 252  *      10,000,000      24.25
 253  */
 254 static void     calculate_idfs (void)
 255 {
 256     int         i;
 257     char        *cptr;
 258     double      dbl;
 259
 260     for (i = 0;  i < saveusr.stemcount;  i++) {
 261         if (    or_wordrecs[i].or_hwaddrs == 0  ||
 262                 or_wordrecs[i].or_hwordkey[0] == '@')
 263             idf[i] = 0.0;
 264         else {
 265             /* ln(2) = 0.693147181 */
 266             dbl =  (double) or_reccount / (double) or_wordrecs[i].or_hwaddrs;
 267             idf[i] = log(dbl)  /  0.693147181  +  1.0;
 268             if (debugging_boolsrch)
 269                 fprintf (aa_stderr,
 270                     PROGNAME"733 IDF[%d]  numdocs=%5ld  idf=%lf\n",
 271                     i, or_wordrecs[i].or_hwaddrs, idf[i]);
 272         }
 273     }
 274     return;
 275 } /* calculate_idfs() */
 276
 277
 278 /************************************************/
 279 /*                                              */
 280 /*              load_or_wordrecs                */
 281 /*                                              */
 282 /************************************************/
 283 /* Subroutine of boolean_search() initialization.
 284  * Loads or_wordrecs[] array with vista key file
 285  * records for each term in saveusr.stems.
 286  * Returns TRUE on success.  Else returns FALSE with
 287  * appropriate usrblk.retncode and user msgs on msglist.
 288  */
 289 static int      load_or_wordrecs (void)
 290 {
 291     int         i, j, k;
 292     int         stemno;
 293     struct or_hwordrec
 294                 *wordrec;
 295     int         colloc_count =  0;
 296     int         not_found_count =       0;
 297
 298     if (or_wordrecs)
 299         free (or_wordrecs);
 300     or_wordrecs = austext_malloc (
 301         saveusr.stemcount * sizeof (struct or_hwordrec) + 16,
 302         PROGNAME "782", NULL);
 303
 304     for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
 305
 306         wordrec = &or_wordrecs [stemno];
 307
 308         /* If this is a collocation term,
 309          * save the two indexes and the collocation
 310          * value in the wordrec buffer instead of usual
 311          * offsets and dba counts.
 312          */
 313         if (saveusr.stems[stemno][0] == '@') {
 314             strcpy (wordrec->or_hwordkey, saveusr.stems[stemno]);
 315             sscanf (saveusr.stems[stemno], COLLOC_STEM_FORMAT, &i, &j, &k);
 316             wordrec->or_hwoffset = i;
 317             wordrec->or_hwfree = j;
 318             wordrec->or_hwaddrs = k;
 319             colloc_count++;
 320             continue;
 321         }
 322
 323         if (debugging_boolsrch)
 324             fprintf (aa_stderr, PROGNAME"823 KEYFIND[%d] ", stemno);
 325         find_keyword (saveusr.stems[stemno], vistano);
 326         /*
 327          * If term is found, add it to the or_wordrecs[] array.
 328          * But it is an error to include a word in more records
 329          * than the max specified in site config file.  This is
 330          * meaningful for databases where certain common high
 331          * frequency words slip by which should be on the stoplist.
 332          * It's possible in huge databases to run out of memory
 333          * assembling very long resultlists.
 334          */
 335         if (db_status == S_OKAY) {
 336             strncpy (wordrec->or_hwordkey, saveusr.stems[stemno],
 337                 DtSrMAXWIDTH_HWORD);
 338             wordrec->or_hwordkey [DtSrMAXWIDTH_HWORD - 1] = 0;
 339             read_wordstr (wordrec, vistano);
 340             if (db_status != S_OKAY) {
 341                 /* Probable corrupted database.  The btree
 342                  * read succeeded but the record read failed.
 343                  */
 344                 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 6,
 345                     "%s Database Error.  Word '%s' is\n"
 346                     "listed in database '%s' but has no index record.") ,
 347                     PROGNAME"295", usrblk.stems[stemno], usrblk.dblk->label);
 348                 DtSearchAddMessage (msgbuf);
 349                 usrblk.retncode = OE_SYSTEM_STOP;
 350                 if (debugging_boolsrch)
 351                     fprintf (aa_stderr,
 352                         "db error, db_status = %d.\n", db_status);
 353                 return FALSE;
 354             }
 355             if (debugging_boolsrch)
 356                 fprintf (aa_stderr, "ofs=%ld addrs=%ld free=%ld\n",
 357                     wordrec->or_hwoffset,
 358                     wordrec->or_hwaddrs,
 359                     wordrec->or_hwfree);
 360             if (wordrec->or_hwaddrs > OE_words_hitlimit) {
 361                 sprintf (msgbuf, catgets (dtsearch_catd, MS_boolsrch, 14,
 362                     "%s '%s' has more than %ld hits.\n"
 363                     "Please remove it from the query or raise the WHITLIM\n"
 364                     "value in the search engine configuration file."),
 365                     PROGNAME"1444", wordrec->or_hwordkey, OE_words_hitlimit);
 366                 DtSearchAddMessage (msgbuf);
 367                 /* Also log WHITLIM msg for administrator... */
 368                 fprintf (aa_stderr, "%s\n", msgbuf);
 369                 usrblk.retncode = OE_BAD_QUERY;
 370                 return FALSE;
 371             }
 372         }
 373
 374         /* Only other possible nonfatal vista return is S_NOTFOUND.
 375          * If qry_is_all_ANDs we can quit right now.
 376          * Otherwise switch off all bits in the word's bit vector.
 377          */
 378         else if (qry_is_all_ANDs) {
 379             if (debugging_boolsrch)
 380                 fputs ("not found, qry_all_ANDs, quit.\n", aa_stderr);
 381             usrblk.retncode = OE_NOTAVAIL;
 382             return FALSE;
 383         }
 384
 385         else {
 386             memset (wordrec, 0, sizeof(struct or_hwordrec));
 387             if (debugging_boolsrch)
 388                 fputs ("not found, addrs-->0.\n", aa_stderr);
 389             not_found_count++;
 390         }
 391
 392     } /* end loop for each term in saveusr.stems[] */
 393
 394     /* It's a failure if all the user's words
 395      * don't exist in database.
 396      */
 397     if (not_found_count + colloc_count >= saveusr.stemcount) {
 398         usrblk.retncode = OE_NOTAVAIL;
 399         return FALSE;
 400     }
 401
 402     return TRUE;
 403 } /* load_or_wordrecs() */
 404
 405
 406 /****************************************/
 407 /*                                      */
 408 /*             get_proximity            */
 409 /*                                      */
 410 /****************************************/
 411 /* Subroutine of stuff_DtSrResult().
 412  * Given d99recno, finds proxwt[] for record,
 413  * calculates and returns integer proximity.
 414  */
 415 static int      get_proximity (long recno)
 416 {
 417     long        byteno = recno >> 3;
 418     int         bitmask = 1 << (recno % 8);
 419     int         i;
 420     for (i = 0;  i < proxwtct;  i++)
 421         if (proxwts[i].byteno == byteno && proxwts[i].bitmask == bitmask)
 422             break;
 423     if (i >= proxwtct)
 424         return -1;
 425     return proxwts[i].proximity;
 426 } /* get_proximity() */
 427
 428
 429 /****************************************/
 430 /*                                      */
 431 /*           stuff_DtSrResult           */
 432 /*                                      */
 433 /****************************************/
 434 /* Subroutine of load_DtSrResults_WK().
 435  * Loads passed DtSrResult structure with data from global objrec.
 436  * Performs additional vista reads as necessary to get misc recs.
 437  */
 438 static void     stuff_DtSrResult (
 439                     DtSrResult          *new,
 440                     long                recno)
 441 {
 442     int         m;
 443     int         fzkey_remaining;
 444     char        *src, *targ, *targend;
 445     static struct or_miscrec
 446                 miscrecbuf;
 447
 448     new->objflags =     objrec.or_objflags;
 449     new->objuflags =    objrec.or_objuflags;
 450     new->objsize =      objrec.or_objsize;
 451     new->objdate =      objrec.or_objdate;
 452     new->objtype =      objrec.or_objtype;
 453     new->objcost =      objrec.or_objcost;
 454     new->dbn =          OE_dbn;
 455     new->dba =          objrecdba;
 456     new->language =     or_language;
 457     strncpy (new->reckey, objrec.or_objkey, DtSrMAX_DB_KEYSIZE);
 458     if (do_stat_sort)
 459         new->proximity = get_proximity (recno);
 460
 461     /* The abstract immediately follows the fuzzy key
 462      * in the FZKABS misc recs.  It may span several recs.
 463      */
 464     new->abstractp =    (char *) (new + 1);
 465     if (or_abstrsz > 0) {
 466         targ = new->abstractp;
 467         targend = targ + or_abstrsz - 1;
 468         fzkey_remaining = or_fzkeysz;
 469         CRSET (PROGNAME"226", &objrecdba, vistano);
 470         SETOR (PROGNAME"227", OR_OBJ_MISCS, saveusr.vistano);
 471         FINDFM (PROGNAME"228", OR_OBJ_MISCS, saveusr.vistano);
 472
 473         while (db_status == S_OKAY) {
 474             RECREAD (PROGNAME"2209", &miscrecbuf, saveusr.vistano);
 475             NTOHS (miscrecbuf.or_misctype);
 476             if (miscrecbuf.or_misctype == ORM_FZKABS) {
 477                 src = (char *) miscrecbuf.or_misc;
 478
 479                 for (m = 0;   m < sizeof(miscrecbuf.or_misc);   m++) {
 480
 481                     /* skip over the fzkey */
 482                     if (fzkey_remaining > 0) {
 483                         src++;
 484                         fzkey_remaining--;
 485                         continue;
 486                     }
 487
 488                     /* copy the abstract */
 489                     *targ = *src;
 490                     if (*src++ == 0 || targ++ >= targend) {
 491                         *targ = 0;
 492                         targ = targend;  /* force outer loop end */
 493                         break;
 494                     }
 495                 } /* end for-loop m */
 496             } /* end (misctype == FZKABS) */
 497
 498             if (targ >= targend)
 499                 break;
 500             FINDNM (PROGNAME"545", OR_OBJ_MISCS, saveusr.vistano);
 501         } /* end while-loop */
 502
 503     } /* endif: (or_abstrsz > 0) */
 504
 505     return;
 506 } /* stuff_DtSrResult() */
 507
 508
 509 /****************************************/
 510 /*                                      */
 511 /*          load_DtSrResults_WK         */
 512 /*                                      */
 513 /****************************************/
 514 /* Builds DtSrResult list for every record
 515  * in result_bitvec, but not more than aa_maxhits.
 516  */
 517 static void     load_DtSrResults_WK (void)
 518 {
 519     long                recno;
 520     int                 bitno;
 521     long                byteno;
 522     int                 i;
 523     long                dittocount;
 524     DtSrResult          *resultp;
 525     size_t              resultsz = sizeof(DtSrResult) + or_abstrsz + 4;
 526
 527     if (got_USR_STOPSRCH())
 528         return;
 529     if (resultlist) {
 530         DtSearchFreeResults (&resultlist);
 531         resultlist = NULL;
 532     }
 533
 534     /* Make a single pass through the final result_bitvec.
 535      * For each nonzero bit, ie each database record
 536      * that satisfies the query requirements,
 537      * retrieve the record and push it onto the
 538      * DtSrResult list.  If not sorting records,
 539      * stop when we reach the user's specified aa_maxhits count.
 540      */
 541     dittocount = 0;
 542     for (recno = 1;  recno < tot_addr_count;  recno++) {
 543         byteno = recno >> 3;    /* divide by 8 */
 544         bitno = recno % 8;
 545
 546         /* Skip zero bits */
 547         if ((result_bitvec[byteno] & (1 << bitno)) == 0)
 548             continue;
 549
 550         if (!read_recno (recno))
 551             continue;
 552
 553         /* Create new DtSrResult node, push it onto resultlist. */
 554         resultp = austext_malloc (resultsz + 4, PROGNAME"466", NULL);
 555         memset (resultp, 0, resultsz);
 556         resultp->link = resultlist;
 557         resultlist = resultp;
 558
 559         /* Load the new DtSrResult node from the object record */
 560         stuff_DtSrResult (resultp, recno);
 561
 562         /* Check if any more reads are necessary.
 563          * If not sorting, stop after aa_maxhits.
 564          * If sorting, there won't be more than
 565          * aa_maxhits recs in the bitvec anyway.
 566          */
 567         dittocount++;
 568         if (dittocount >= aa_maxhits)
 569             break;
 570
 571     }  /* end bitvec loop */
 572
 573
 574     /*--------- All Done.  Clean up and return to caller. ---------*/
 575 /*@@@@@@  make separate workproc call if aa_maxhits > 100.
 576   @@@@@ sort may take a long time */
 577     if (wtvec) {
 578         free (wtvec);
 579         wtvec = NULL;
 580     }
 581     if (proxwts) {
 582         free (proxwts);
 583         proxwts = NULL;
 584     }
 585
 586     if (dittocount <= 0) {
 587         usrblk.workproc = dummy_workproc;
 588         usrblk.retncode = OE_NOTAVAIL;
 589         return;
 590     }
 591
 592     usrblk.retncode =   OE_OK;
 593     usrblk.workproc =   dummy_workproc;
 594
 595     usrblk.stemcount =  saveusr.stemcount;
 596     if (usrblk.search_type == 'W')
 597         memcpy (usrblk.stems, saveusr.stems,
 598             saveusr.stemcount * DtSrMAXWIDTH_HWORD);
 599     else
 600         /* Don't copy first char (ctrl-o) stem */
 601         for (i = 0;  i < saveusr.stemcount;  i++)
 602             strcpy (usrblk.stems[i], &saveusr.stems[i][1]);
 603
 604     if (do_stat_sort)
 605         DtSearchSortResults (&resultlist, DtSrSORT_PROX);
 606     usrblk.dittocount = dittocount;
 607     if (usrblk.dittolist)
 608         DtSearchFreeResults (&usrblk.dittolist);
 609     usrblk.dittolist =  resultlist;
 610     resultlist = NULL;
 611     return;
 612 } /* load_DtSrResults_WK() */
 613
 614
 615 /****************************************/
 616 /*                                      */
 617 /*          weights_filter_WK           */
 618 /*                                      */
 619 /****************************************/
 620 /* This workproc is called only if we're doing statistical sorting.
 621  * (1) It reduces the result_bitvec to it's final size,
 622  * containing only the highest aa_maxhits statistical weights
 623  * in wtvec.
 624  * (2) It replaces (possibly large) wtvec with (probably much smaller)
 625  * array of PROXWT structures containing the selected records'
 626  * weights and calculated proximities, for final ranking sort.
 627  *
 628  */
 629 static void     weights_filter_WK (void)
 630 {
 631     int         i;
 632     double      scalefac;
 633     long        recno;
 634     int         smallest, biggest;
 635     float       biggestwt;
 636     long        byteno, smallest_byteno;
 637     int         bitmask, smallest_bitmask;
 638
 639     if (got_USR_STOPSRCH())
 640         return;
 641
 642     /* Init weight filtering */
 643     if (proxwts)
 644         free (proxwts);
 645     proxwtct = (result_count < aa_maxhits)? result_count : aa_maxhits;
 646     proxwts = austext_malloc (proxwtct * sizeof(PROXWT) + 4,
 647         PROGNAME"429", NULL);
 648     memset (proxwts, 0, proxwtct * sizeof(PROXWT));
 649     smallest = 0;
 650     scalefac = 0.0;
 651     biggestwt = 0.0;    /* biggest single wt of all docs */
 652
 653     /* One pass thru entire result_bitvec */
 654     for (recno = 1;  recno < tot_addr_count;  recno++) {
 655         byteno = recno >> 3;
 656         bitmask = 1 << (recno % 8);
 657
 658         /* Skip zero bits */
 659         if ((result_bitvec[byteno] & bitmask) == 0)
 660             continue;
 661
 662         /* Make scalefac = sum of squares of all wts in bitvec.
 663          * It's possible that all or some of the weights are
 664          * zero (eg queries like "~aaa" or "~aaa | bbb").
 665          * In this case give them a very small positive number
 666          * so we don't divide by zero later on.
 667          */
 668         if (wtvec[recno] == 0.0)
 669             wtvec[recno] = 0.1;
 670         scalefac += (double) wtvec[recno] * (double) wtvec[recno];
 671
 672         /*
 673          * The following logic first fills up the proxwts table.
 674          * After that if a bitvec's weight is larger than the smallest
 675          * proxwt, replace the smallest proxwt with the new weight
 676          * and switch off the previous smallest in the original bitvec.
 677          */
 678
 679         /*
 680          * Just discard rec on bitvec if it's weight
 681          * is smaller than the current smallest.
 682          */
 683         if (wtvec [recno] <= proxwts[smallest].wt) {
 684             RESET_BIT (result_bitvec, byteno, bitmask);
 685             result_count--;
 686             continue;
 687         }
 688         /*
 689          * Else discard current smallest if
 690          * table full, ie it really points to something.
 691          */
 692         if (proxwts[smallest].wt > 0.0) {
 693             smallest_byteno = proxwts[smallest].byteno;
 694             smallest_bitmask = proxwts[smallest].bitmask;
 695             RESET_BIT (result_bitvec, smallest_byteno, smallest_bitmask);
 696             result_count--;
 697         }
 698
 699         /* Add this weight to the proxwts table. */
 700         proxwts [smallest] .wt =        wtvec [recno];
 701         proxwts [smallest] .byteno =    byteno;
 702         proxwts [smallest] .bitmask =   bitmask;
 703
 704         /* Keep track of the table entry that has
 705          * the highest weight.  This will eventually
 706          * be the first sorted hit on the hitlist.
 707          * It's weight/proximity will be used
 708          * to scale the proximities of the
 709          * other hits.
 710          */
 711         if (biggestwt < wtvec[recno]) {
 712             biggestwt = wtvec[recno];
 713             biggest = smallest;
 714         }
 715
 716         /* Find the next smallest */
 717         smallest = 0;
 718         for (i = 1;  i < proxwtct;  i++) {
 719             if (proxwts[i].wt < proxwts[smallest].wt)
 720                 smallest = i;
 721         }
 722
 723     } /* end loop on every recno */
 724
 725     free (wtvec);
 726     wtvec = NULL;
 727
 728     /* PROXIMITY CALCULATIONS.
 729      * In order to translate statistical weight into an AusText
 730      * proximity, basically you have to invert it, then scale it.
 731      * The statistical weight is a similarity measure: the
 732      * larger it is the more similar the document to the query.
 733      * But AusText 'proximity' is like a 'distance' measure,
 734      * the smaller the number the closer the document is to the query.
 735      *
 736      * First 'normalize' each document's statistical
 737      * weight to be a fraction between 0 and 1.  Done
 738      * by calculating a normalization factor,
 739      * the sqrt of the sum of squares of weights of all
 740      * docs that would have qualified for the hitlist
 741      * if we weren't truncating.   Note cosine-based normalization
 742      * factor (Pythagorean) always >= largest wt so we can
 743      * guarantee all normalized weights are > 0.0 and <= 1.0.
 744      *
 745      * The proximity itself is calculated as the 'percent value'
 746      * that the doc is 'distant' from perfection (1.0 or 100%).
 747      * For example, if the normalized weight of the first record
 748      * is .931 then it's proximity will be 7 (100% - 93% = 7).
 749      *
 750      * The proximity of every other hit is scaled away
 751      * from the first because the normalization algorithm
 752      * tends to clump proximities when there are a lot of hits.
 753      * Specifically the proximity of every hit is a constant
 754      * scale factor (derived from the first proximity),
 755      * divided by it's weight.
 756      *
 757      * A "bulls eye" (normalized weight = 1.0, proximity == 0)
 758      * for the first hit is not allowed so scale factor will
 759      * not also be zero.  Otherwise *all* hits in that particular
 760      * results list would be bulls eyes.
 761      */
 762     scalefac = (double) biggestwt / sqrt (scalefac);
 763                         /* normalized weight of first hit */
 764     scalefac = (1.0 - scalefac) * 100.0;
 765                         /* proximity of first hit */
 766     if (scalefac < 1.0)
 767         scalefac = 1.0;
 768                         /* No bulls eyes */
 769     scalefac *= (double) biggestwt * 1.2;
 770                         /* scale factor for other hits */
 771     for (i = 0;  i < proxwtct;  i++) {
 772         proxwts[i].proximity = (int) (scalefac / (double) proxwts[i].wt);
 773         if (proxwts[i].proximity > 9999)
 774             proxwts[i].proximity = 9999;
 775     }
 776
 777     if (debugging_boolsrch) {
 778         fprintf (aa_stderr,
 779             PROGNAME"489 FINAL PROXWTS proxwtct=%d bigwt=%.2f scalefac=%.2lf\n",
 780             proxwtct, biggestwt, scalefac);
 781         for (i=0;  i<10;  i++) {
 782             if (i >= proxwtct)
 783                 break;
 784             fprintf (aa_stderr,
 785                 "  byteno=%3ld bitmask=%02x wt=%.2f prox=%d\n",
 786                 proxwts[i].byteno, proxwts[i].bitmask,
 787                 proxwts[i].wt, proxwts[i].proximity);
 788         }
 789         fprintf (aa_stderr, PROGNAME"499 WEIGHT RESULTS resultct=%ld  bv=\n",
 790             result_count);
 791         for (i=0;  i<22;  i++) {
 792             if (i >= bitveclen)
 793                 break;
 794             fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
 795         }
 796         fputc ('\n', aa_stderr);
 797         fflush (aa_stderr);
 798     }
 799
 800     usrblk.retncode = OE_SEARCHING;
 801     usrblk.workproc = load_DtSrResults_WK;
 802     return;
 803 } /* weights_filter_WK() */
 804
 805
 806 /****************************************/
 807 /*                                      */
 808 /*          dbread_filter_WK            */
 809 /*                                      */
 810 /****************************************/
 811 /* Called if we must remove documents from result_bitvec
 812  * because of keytype or date,
 813  */
 814 static void     dbread_filter_WK (void)
 815 {
 816     long        recno;
 817     long        byteno;
 818     int         bitmask;
 819     long        discards;
 820
 821     if (got_USR_STOPSRCH())
 822         return;
 823     if (debugging_boolsrch) {
 824         discards = 0;
 825         fputs (PROGNAME"865 DBREAD discards (k=keytype d=date):\n", aa_stderr);
 826         fflush (aa_stderr);
 827     }
 828
 829     /* One pass thru entire result_bitvec */
 830     for (recno = 1;  recno < tot_addr_count;  recno++) {
 831         byteno = recno >> 3;
 832         bitmask = 1 << (recno % 8);
 833
 834         if ((result_bitvec[byteno] & bitmask) == 0)
 835             continue;
 836
 837         if (!read_recno (recno))
 838             continue;
 839
 840         /* Skip undesired record types */
 841         if (!all_key_types) {
 842             if (strchr (saveusr.ktchars, objrec.or_objkey[0]) == NULL) {
 843                 RESET_BIT (result_bitvec, byteno, bitmask);
 844                 result_count--;
 845                 if (debugging_boolsrch) {
 846                     discards++;
 847                     fputc ('k', aa_stderr);
 848                     fflush (aa_stderr);
 849                 }
 850                 continue;
 851             }
 852         }
 853
 854         /* Skip record if out of date range */
 855         if (check_dates) {
 856             if (!objdate_in_range (objrec.or_objdate,
 857                         usrblk.objdate1, usrblk.objdate2)) {
 858                 RESET_BIT (result_bitvec, byteno, bitmask);
 859                 result_count--;
 860                 if (debugging_boolsrch) {
 861                     discards++;
 862                     fputc ('d', aa_stderr);
 863                     fflush (aa_stderr);
 864                 }
 865                 continue;
 866             }
 867         }
 868
 869     } /* end loop on every recno */
 870
 871     if (debugging_boolsrch) {
 872         int     i;
 873         if (discards)
 874             fputc ('\n', aa_stderr);
 875         fprintf (aa_stderr,
 876             PROGNAME"857 DBREAD RESULTS discards=%ld resultct=%ld  bv=\n",
 877             discards, result_count);
 878         for (i=0;  i<22;  i++) {
 879             if (i >= bitveclen)
 880                 break;
 881             fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
 882         }
 883         fputc ('\n', aa_stderr);
 884         fflush (aa_stderr);
 885     }
 886
 887     /* Determine next workproc.
 888      * (1) If no records survived the read db filter,
 889      *     we're done, return 'no hits'.
 890      * (2) If we're sorting, the next workproc reduces the
 891      *     bitvec to the aa_maxhits recs with the highest
 892      *     statistical weights.
 893      * (3) Otherwise the next workproc just loads the hitlist.
 894      */
 895     if (result_count <= 0) {
 896         usrblk.retncode = OE_NOTAVAIL;
 897         usrblk.workproc = dummy_workproc;
 898     }
 899     else if (do_stat_sort) {
 900         usrblk.retncode = OE_SEARCHING;
 901         usrblk.workproc = weights_filter_WK;
 902     }
 903     else {
 904         if (debugging_boolsrch)
 905             fprintf (aa_stderr,
 906                 PROGNAME"931 No sorting by statistical weights.\n");
 907         usrblk.retncode = OE_SEARCHING;
 908         usrblk.workproc = load_DtSrResults_WK;
 909     }
 910     return;
 911 } /* dbread_filter_WK() */
 912
 913
 914 /****************************************/
 915 /*                                      */
 916 /*        calc_result_bitvec_WK         */
 917 /*                                      */
 918 /****************************************/
 919 /* Second workproc after read_stem_bitvec_WK().
 920  * If possible, minimizes size of truth table permutes,
 921  * then applies them to stem bitvecs to create result_bitvec.
 922  */
 923 static void     calc_result_bitvec_WK (void)
 924 {
 925     int         mask;
 926     int         cpm;
 927     long        byteno;
 928     int         bitno, stemno;
 929     UCHAR       permute;
 930     UCHAR       my_permutes [256];
 931     int         my_pmsz;
 932     int         i;
 933
 934     if (got_USR_STOPSRCH())
 935         return;
 936
 937     /* If there are fewer than a full complement of stems,
 938      * minimize size of truth table by discarding
 939      * permutes that refer to unused stems.
 940      */
 941     if (saveusr.stemcount < DtSrMAX_STEMCOUNT) {
 942         /* Set high order bits of mask to mark unused stem positions */
 943         mask = 0;
 944         for (i = 0;  i < saveusr.stemcount;  i++)
 945             mask |= 1 << i;
 946         mask = ~mask;
 947
 948         /* 'cpm' is a candidate permute */
 949         my_pmsz = 0;
 950         for (cpm = 0;  cpm < 256;  cpm++) {
 951             /*
 952              * Discard candidate if it refers to an unused stem.
 953              */
 954             if (cpm & mask)
 955                 continue;
 956             /*
 957              * Otherwise if candidate is in final_truthtab, keep it.
 958              */
 959             for (i = 0;  i < final_truthtab.pmsz;  i++) {
 960                 if (final_truthtab.permutes[i] == (UCHAR) cpm) {
 961                     my_permutes [my_pmsz] = (UCHAR) cpm;
 962                     my_pmsz++;
 963                 }
 964             }
 965         }
 966         if (debugging_boolsrch) {
 967             fprintf (aa_stderr,
 968                 PROGNAME"565 Minimize truth table, pmsz=%d-->%d\n  permutes=",
 969                 final_truthtab.pmsz, my_pmsz);
 970             for (i=0;  i<16;  i++) {
 971                 if (i >= my_pmsz)
 972                     break;
 973                 fprintf (aa_stderr, " %02x", (int) my_permutes [i]);
 974             }
 975             fputc ('\n', aa_stderr);
 976             fflush (aa_stderr);
 977         }
 978         final_truthtab.permutes = my_permutes;
 979         final_truthtab.pmsz = my_pmsz;
 980     } /* end minimize of permutes */
 981
 982     /* Calculate result bit vector.
 983      * Loop 1 is a single pass through the bit vectors
 984      * (a bit loop inside a byte loop).
 985      * For each nonzero bit, ie each database record
 986      * that has at least one of the query terms in it,
 987      * build a 'permute' equivalent to the boolean
 988      * representation of the terms in that record (Loop 2).
 989      * Then search the truth table permutes for a match (Loop 3).
 990      * If found, set the record's bit in the result_bitvec.
 991      */
 992
 993     /* LOOP 1.  For each database addr... */
 994     result_count = 0;
 995     for (byteno = 0;  byteno < bitveclen;  byteno++) {
 996         for (bitno = 0;  bitno < 8;  bitno++) {
 997             mask = 1 << bitno;
 998
 999             /* LOOP 2.  Build permute for each query term. */
1000             permute = 0;
1001             for (stemno = 0;  stemno < saveusr.stemcount;  stemno++) {
1002                 if (bitvecs [stemno] [byteno]  &  (UCHAR) mask)
1003                     permute |= 1 << stemno;
1004             }
1005
1006             /* LOOP 3.  Search truth table for matching permute. */
1007             for (i = 0;  i < final_truthtab.pmsz;  i++) {
1008                 if (final_truthtab.permutes[i] == permute) {
1009                     result_bitvec [byteno] |= (UCHAR) mask;
1010                     result_count++;
1011                 }
1012             }
1013         }
1014     }
1015
1016     if (debugging_boolsrch) {
1017         fprintf (aa_stderr, PROGNAME"621 PRELIM RESULTS resultct=%ld  bv=\n",
1018             result_count);
1019         for (i=0;  i<22;  i++) {
1020             if (i >= bitveclen)
1021                 break;
1022             fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
1023         }
1024         fputc ('\n', aa_stderr);
1025         fflush (aa_stderr);
1026     }
1027
1028     /* The next workprocs are 'filters', reducing the size
1029      * of result_bitvec by removing various unwanted records.
1030      * They're done in the following order:
1031      * (1) If no records survived the truth table manipulations,
1032      *     we're done, return 'no hits'.
1033      * (2) If we must remove documents because of keytype or date,
1034      *     the next workproc is the filter that reads the database.
1035      * (3) If we're sorting, the next workproc reduces the
1036      *     bitvec to the aa_maxhits recs with the highest
1037      *     statistical weights.
1038      * (4) Otherwise the next workproc just loads the hitlist.
1039      */
1040     if (result_count <= 0) {
1041         usrblk.retncode = OE_NOTAVAIL;
1042         usrblk.workproc = dummy_workproc;
1043     }
1044     else if (!all_key_types || check_dates) {
1045         usrblk.retncode = OE_SEARCHING;
1046         usrblk.workproc = dbread_filter_WK;
1047     }
1048     else if (do_stat_sort) {
1049         if (debugging_boolsrch)
1050             fprintf (aa_stderr,
1051                 PROGNAME"948 No db reads necessary for date or keytype.\n");
1052         usrblk.retncode = OE_SEARCHING;
1053         usrblk.workproc = weights_filter_WK;
1054     }
1055     else {
1056         if (debugging_boolsrch)
1057             fprintf (aa_stderr,
1058                 PROGNAME"625 No filtering: no sort and no db reads.\n");
1059         usrblk.retncode = OE_SEARCHING;
1060         usrblk.workproc = load_DtSrResults_WK;
1061     }
1062     return;
1063 } /* calc_result_bitvec_WK() */
1064
1065
1066 /****************************************/
1067 /*                                      */
1068 /*              read_d99                */
1069 /*                                      */
1070 /****************************************/
1071 /* Subroutine of read_stem_bitvec_WK().
1072  * Repeatedly called to get each d99dba in the inverted index
1073  * file (d99) for a specific index term.  The first call passes
1074  * the term's wordrec with d99 offset and size information.
1075  * Subsequent calls pass NULL.
1076  * Returns valid d99dba, or 0 at end of term's index, or -1 on error.
1077  * Actual reads are performed a disk block at a time,
1078  * with dbas stored in a static buffer for the next call.
1079  */
1080 static DB_ADDR  read_d99 (struct or_hwordrec *wordrec)
1081 {
1082     static DB_ADDR      readbuf [DBAS_PER_BLOCK];
1083     static DB_ADDR      *bufptr, *endbuf;
1084     static FILE         *fptr;
1085     static long         bal_read, request_read, actual_read;
1086
1087     /* First call for new term */
1088     if (wordrec) {
1089         fptr = usrblk.dblk->iifile;
1090         fseek (fptr, wordrec->or_hwoffset, SEEK_SET);
1091         bal_read = wordrec->or_hwaddrs;
1092         bufptr = endbuf = 0;    /* triggers block read */
1093     }
1094
1095     /* Time to read another block */
1096     if (bufptr >= endbuf) {
1097         if (bal_read <= 0)
1098             return 0;
1099         if (bal_read > DBAS_PER_BLOCK) {
1100             request_read = DBAS_PER_BLOCK;
1101             bal_read -= DBAS_PER_BLOCK;
1102             endbuf = readbuf + DBAS_PER_BLOCK;
1103         }
1104         else {
1105             /* last block is usually short */
1106             request_read = bal_read;
1107             bal_read = 0;
1108             endbuf = readbuf + request_read;
1109         }
1110         if (fread (readbuf, sizeof(DB_ADDR), request_read, fptr)
1111                 != request_read) {
1112             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 28,
1113                 "%s Database Read Error in %s.d99.") ,
1114                 PROGNAME"428", usrblk.dblk->name);
1115             DtSearchAddMessage (msgbuf);
1116             return -1;
1117         }
1118         bufptr = readbuf;
1119     }
1120
1121     /******return *bufptr++;*******/
1122     return ntohl (*bufptr++);
1123 } /* read_d99() */
1124
1125
1126 /****************************************/
1127 /*                                      */
1128 /*           get_colloc_bitvec          */
1129 /*                                      */
1130 /****************************************/
1131 /* Subroutine of read_stem_bitvec_WK().
1132  * Constructs a 'collocation bitvector' for current save_stemno.
1133  * A collocation expression requests the return of all records
1134  * containing both of two terms (a kind of boolean AND) such that
1135  * the occurrences are within n characters of each other.
1136  * For example "ICE @5 CREAM" requests the return of all records
1137  * containing both "ICE" and "CREAM" but only if they are separated
1138  * by no more than 5 characters.
1139  *
1140  * Since offset information is not stored in the inverted index
1141  * this module initially returns the intersection of the two words'
1142  * bit vectors (boolean AND).  Then it retrieves each record,
1143  * builds an offset (hilites) table for each of the two words,
1144  * then compares the offset differences in the tables.
1145  * If no occurrence pairs are within the specified separation
1146  * range, the record is deleted from the bitvector.
1147  * Returns 0 if successful, otherwise returns -1 and msgs.
1148 @@@@ rewrite as its own workproc--reading/hiliting can take a long time...
1149  */
1150 static int      get_colloc_bitvec (void)
1151 {
1152     int         stemno_A =      or_wordrecs[save_stemno].or_hwoffset;
1153     int         stemno_B =      or_wordrecs[save_stemno].or_hwfree;
1154     long        range =         or_wordrecs[save_stemno].or_hwaddrs;
1155     UCHAR       *bitvec_A =     bitvecs [stemno_A];
1156     UCHAR       *bitvec_B =     bitvecs [stemno_B];
1157     UCHAR       *bitvec_C =     bitvecs [save_stemno];
1158     long        byteno, recno;
1159     UCHAR       bitmask;
1160     int         parse_type;
1161     int         got_a_colloc;
1162     char        *stemp;
1163     DtSrHitword *hitwords_A, *hitwords_B;
1164     long        hitwcount_A, hitwcount_B;
1165     long        threshold_range;
1166     DB_ADDR     dba;
1167     LLIST       *bloblist;
1168     long        a, b, offset_A, offset_B;
1169
1170     /* First construct the set intersection (AND) of
1171      * each of the collocated terms in the colloc bitvec.
1172      */
1173     for (byteno = 0;  byteno < bitveclen;  byteno++)
1174         bitvec_C [byteno] = bitvec_A [byteno] & bitvec_B [byteno];
1175     if (debugging_boolsrch) {
1176         int     i;
1177         fprintf (aa_stderr,
1178             PROGNAME"312 INTERSECT[%d] (colloc %d & %d):\n",
1179                 save_stemno, stemno_A, stemno_B);
1180         for  (i=0; i<bitveclen; i++) {
1181             fprintf (aa_stderr, " %02x", bitvec_C[i]);
1182             if (i > 22)
1183                 break;
1184         }
1185         fputc ('\n', aa_stderr);
1186         fflush (aa_stderr);
1187     }
1188
1189     /* Read cleartext for each rec in intersection/colloc bitvec.
1190      * Get hitwords (hilite table) for each collocation term.
1191      * Switch off recs in bitvec where no term pairs are in
1192      * collocation range.
1193      */
1194     for (recno = 1;  recno < tot_addr_count;  recno++) {
1195         byteno = recno >> 3;    /* divide by 8 */
1196         bitmask = 1 << (recno % 8);
1197
1198         /* Skip zero bits */
1199         if ((bitvec_C[byteno] & bitmask) == 0)
1200             continue;
1201
1202         /* Convert recno to vista database address.
1203          * Silently skip rec if dba doesn't exist.
1204          */
1205         dba = (recno - 1) * or_recslots + 2;
1206         if (dba >= or_maxdba) {
1207             RESET_BIT (bitvec_C, byteno, bitmask);
1208             continue;
1209         }
1210         dba |= (OR_D00 << 24);
1211
1212         /* Silently skip records that have no document text */
1213         if ((bloblist = ve_getblobs (dba, vistano)) == NULL) {
1214             if (debugging_boolsrch) {
1215                 fprintf (aa_stderr,
1216                     PROGNAME"126 No blobs for recno=%ld byteno=%ld mask%02x\n",
1217                     recno, byteno, bitmask);
1218                 fflush (aa_stderr);
1219             }
1220             RESET_BIT (bitvec_C, byteno, bitmask);
1221             continue;
1222         }
1223
1224         /* Uncompress record text into usrblk.cleartext */
1225         if (oe_unblob (bloblist) != OE_OK)
1226             return -1;
1227
1228         /* Build 'hilite' table for stem A.  If stem
1229          * can't be found in the record, silently skip it.
1230          * Otherwise save the table.
1231          */
1232         stemp = saveusr.stems [stemno_A];
1233         if (stemp[0] == STEM_CH) {
1234             parse_type = 'S';
1235             stemp++;
1236         }
1237         else
1238             parse_type = 'W';
1239         if (!hilite_cleartext (parse_type, stemp, 1)) {
1240             RESET_BIT (bitvec_C, byteno, bitmask);
1241             continue;
1242         }
1243         hitwords_A = usrblk.hitwords;
1244         hitwcount_A = usrblk.hitwcount;
1245         usrblk.hitwords = NULL;
1246         usrblk.hitwcount = 0;
1247
1248         /* In the same way build 'hilite' table for stem B */
1249         stemp = saveusr.stems [stemno_B];
1250         if (stemp[0] == STEM_CH) {
1251             parse_type = 'S';
1252             stemp++;
1253         }
1254         else
1255             parse_type = 'W';
1256         if (!hilite_cleartext (parse_type, stemp, 1)) {
1257             RESET_BIT (bitvec_C, byteno, bitmask);
1258             free (hitwords_A);
1259             continue;
1260         }
1261         hitwords_B = usrblk.hitwords;
1262         hitwcount_B = usrblk.hitwcount;
1263         usrblk.hitwords = NULL;
1264         usrblk.hitwcount = 0;
1265
1266         /* Compare the two hilite tables for range matches */
1267         got_a_colloc = FALSE;
1268         b = 0;
1269         for (a = 0;  a < hitwcount_A;  a++) {
1270             offset_A = hitwords_A[a].offset;
1271             threshold_range = offset_A + hitwords_A[a].length + range;
1272             for (;  b < hitwcount_B;  b++) {
1273                 offset_B = hitwords_B[b].offset;
1274
1275                 /* Advance B to first entry past A's offset */
1276                 if (offset_B <= offset_A )
1277                     continue;   /* ...the B loop */
1278                 if (offset_B <= threshold_range)
1279                     got_a_colloc = TRUE;
1280                 break;          /* ...the B loop */
1281             }  /* end B loop */
1282             if (got_a_colloc  ||  b >= hitwcount_B)
1283                 break;          /* ...the A loop */
1284         } /* end A loop */
1285         free (hitwords_A);
1286         free (hitwords_B);
1287
1288         /* If no collocations found within range,
1289          * switch off rec in colloc bitvec.
1290          */
1291         if (!got_a_colloc)
1292             RESET_BIT (bitvec_C, byteno, bitmask);
1293
1294     } /* end loop on each recno in intersection/colloc bitvec */
1295
1296     return 0;
1297 } /* get_colloc_bitvec() */
1298
1299
1300 /****************************************/
1301 /*                                      */
1302 /*          read_stem_bitvec_WK         */
1303 /*                                      */
1304 /****************************************/
1305 /* First workproc after boolean_search().
1306  * Each iterative call loads one (save_stemno) real stem's bitvec.
1307  * After last stem bitvec loaded, sets up
1308  * call to next workproc in sequence.
1309  */
1310 static void     read_stem_bitvec_WK (void)
1311 {
1312     long        byteno;
1313     DB_ADDR     d99recno;
1314     float       weight;
1315
1316     if (got_USR_STOPSRCH())
1317         return;
1318
1319     /* Process collocation 'stems' */
1320     if (saveusr.stems [save_stemno] [0] == '@') {
1321         d99recno = get_colloc_bitvec();
1322         goto DONE_READING;
1323     }
1324
1325     for (       d99recno = read_d99 (&or_wordrecs [save_stemno]);
1326                 d99recno;
1327                 d99recno = read_d99 (NULL)) {
1328         if (d99recno == -1)     /* read error */
1329             break;
1330
1331         /* Save low byte 'statistical weight' value.
1332          * It can only be 0 - 255.
1333          */
1334         if (do_stat_sort)
1335             weight = (float) (d99recno & 0x000000ff) + 1.0;
1336
1337         d99recno = (d99recno >> 8) & 0x00ffffff;
1338
1339         /* Set correct bit in bitvec.
1340          * The byte number is the recno divided by 8.
1341          * The bit number is the remainder after division by 8.
1342          */
1343         if ((byteno = d99recno >> 3) >= bitveclen) {
1344             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 32,
1345                 "%s Database Error: %s '%s'\n"
1346                 "in database '%s' has invalid d99 record number %ld.") ,
1347                 PROGNAME"394",
1348                 (usrblk.search_type == 'W') ?
1349                         catgets(dtsearch_catd, MS_boolsrch, 33, "Word") :
1350                         catgets(dtsearch_catd, MS_boolsrch, 34, "Stem of"),
1351                 usrblk.stems [save_stemno],
1352                 usrblk.dblk->label,
1353                 d99recno);
1354             DtSearchAddMessage (msgbuf);
1355             d99recno = -1;      /* force error return */
1356             goto DONE_READING;
1357         }
1358         bitvecs [save_stemno] [byteno] |= 1 << (d99recno % 8);
1359
1360         /* Add to correct weight in weight vector.
1361          * IDF ranges between 1.0 and 20.0, and weight
1362          * is 1 - 256, so we're adding 1 - ~5000 to wtvec.
1363          */
1364         if (do_stat_sort)
1365             wtvec [d99recno] += weight * (float) idf [save_stemno];
1366
1367     } /* end loop that retrieves every d99recno for curr stem */
1368
1369 DONE_READING:
1370
1371     if (debugging_boolsrch) {
1372         int     i;
1373         if (debugging_boolsrch)
1374             fprintf (aa_stderr, PROGNAME"313 BITVEC[%ld]:\n", save_stemno);
1375         for  (i=0; i<bitveclen; i++) {
1376             fprintf (aa_stderr, " %02x", bitvecs[save_stemno][i]);
1377             if (i > 22)
1378                 break;
1379         }
1380         fputc ('\n', aa_stderr);
1381         fflush (aa_stderr);
1382     }
1383
1384     if (d99recno == 0) {
1385         /* Normal conclusion.  Increment to next stem.
1386          * If not all stems have been read,
1387          * this is still the next workproc.
1388          * Otherwise the next workproc is the one
1389          * merging all bitvectors into the final
1390          * result bitvec using the truth table.
1391          */
1392         usrblk.retncode = OE_SEARCHING;
1393         if (++save_stemno < saveusr.stemcount)
1394             usrblk.workproc = read_stem_bitvec_WK;
1395         else
1396             usrblk.workproc = calc_result_bitvec_WK;
1397     }
1398     else
1399         /* d99recno must be -1 */
1400         usrblk.retncode = OE_SYSTEM_STOP;
1401     return;
1402 } /* read_stem_bitvec_WK() */
1403
1404
1405 /****************************************/
1406 /*                                      */
1407 /*            boolean_search            */
1408 /*                                      */
1409 /****************************************/
1410 /* Called from Opera_Engine after successful boolean_parse().
1411  * Expects valid globals: saveusr.stems, saveusr.stemcount,
1412  * usrblk.stems (contains original unstemmed query terms for msgs),
1413  * usrblk.search_type, final_truthtab, qry_has_no_NOTs,
1414  * and qry_is_all_ANDs.
1415  * Based on parts of the function ve_word_search().
1416  * Upon return, usrblk.retncode, msglist, etc is appropriately loaded.
1417  * Upon successful return usrblk.stems, usrblk.stemcount,
1418  * and dittolist are also loaded.
1419  */
1420 void    boolean_search (void)
1421 {
1422     int         i, j;
1423     size_t      allocsz_needed;
1424
1425     /* Sanity checks */
1426     if (        saveusr.stemcount <= 0          ||
1427                 final_truthtab.pmsz <= 0        ||
1428                 final_truthtab.pmsz >= 256      ) {
1429         fprintf (aa_stderr, catgets(dtsearch_catd, MS_boolsrch, 35,
1430             "%s Program Error: stemct=%d pmsz=%d\n") ,
1431             PROGNAME"1404", saveusr.stemcount, final_truthtab.pmsz);
1432         DtSearchExit (14);
1433     }
1434
1435     /*---------- Init globals ----------*/
1436     if (!msgbuf)
1437         msgbuf = austext_malloc (500, PROGNAME"393", NULL);
1438     debugging_boolsrch =        (usrblk.debug & USRDBG_SRCHCMPL);
1439     need_zero_permute = (final_truthtab.permutes[0] == 0);
1440     do_stat_sort =      ((usrblk.flags & USR_SORT_WHITL) != 0);
1441     check_dates =       (usrblk.objdate1 || usrblk.objdate2);
1442     or_abstrsz =        usrblk.dblk->dbrec.or_abstrsz;
1443     or_fzkeysz =        usrblk.dblk->dbrec.or_fzkeysz;
1444     or_language =       usrblk.dblk->dbrec.or_language;
1445     or_maxdba =         usrblk.dblk->dbrec.or_maxdba;
1446     usrblk.flags &=     ~USR_STOPSRCH;  /* turn off stop button */
1447
1448     saveusr.vistano = vistano = usrblk.dblk->vistano;
1449     saveusr.dittolist =         NULL;
1450     saveusr.dittocount =        0L;
1451     saveusr.iterations =        INIT_ITERATIONS;
1452     /*
1453      * saveusr.ktchars is a string holding
1454      * first char of desired record ids.
1455      */
1456     all_key_types =             TRUE;
1457     for (i = 0, j = 0; i < usrblk.dblk->ktcount; i++) {
1458         if (usrblk.dblk->keytypes[i].is_selected)
1459             saveusr.ktchars[j++] = usrblk.dblk->keytypes[i].ktchar;
1460         else
1461             all_key_types =     FALSE;
1462     }
1463     saveusr.ktchars[j] = '\0';
1464
1465     or_recslots =       (long) (usrblk.dblk->dbrec.or_recslots);
1466     or_reccount =       usrblk.dblk->dbrec.or_reccount;
1467
1468     /* RECFRST is just to get the slot# (dba) of the
1469      * first real object record after the dbrec.
1470      * Currently the dbrec occupies only one slot,
1471      * the first (#1), so dba will usually be #2.
1472      */
1473 /********
1474     RECFRST(PROGNAME"2545", OR_OBJREC, saveusr.vistano);
1475     CRGET(PROGNAME"2546", &dba, saveusr.vistano);
1476     dba &= 0x00FFFFFF;
1477 ********/
1478     tot_addr_count =    ((usrblk.dblk->dbrec.or_maxdba + 1) / or_recslots) + 1;
1479     bitveclen =         (tot_addr_count >> 3) + 1;
1480
1481     if (debugging_boolsrch) {
1482         fprintf (aa_stderr, PROGNAME"360 "
1483             "boolean_search: typ='%c' needzpm?=%d sort?=%d maxhits=%d\n"
1484             "  maxdba=%ld recct=%ld recslts=%ld\n"
1485             "  totnmadr=%ld bvln=%ld allkts?=%d  ktchars='%s'\n"
1486             ,usrblk.search_type
1487             ,need_zero_permute
1488             ,do_stat_sort
1489             ,aa_maxhits
1490             ,usrblk.dblk->dbrec.or_maxdba
1491             ,or_reccount
1492             ,or_recslots
1493             ,tot_addr_count
1494             ,bitveclen
1495             ,all_key_types
1496             ,saveusr.ktchars
1497             );
1498         fflush (aa_stderr);
1499     }
1500
1501
1502     /*---------- Read vista btree ----------
1503      * Load or_wordrecs[] array for each term in saveusr.stems.
1504      */
1505     if (!load_or_wordrecs())
1506         return;
1507
1508     /* If statistically sorting final resultlist, calculate
1509      * idf (inverse document frequency) for each term using
1510      * the frequency data in or_wordrecs[].
1511      */
1512     if (do_stat_sort)
1513         calculate_idfs();
1514
1515     /* Bitvector allocation.  Number needed is one for each stem,
1516      * plus one extra to accumulate the result bitvector.
1517      */
1518     allocsz_needed = bitveclen * (saveusr.stemcount + 1);
1519     if (debugging_boolsrch)
1520         fprintf (aa_stderr, PROGNAME"430 "
1521             "bitvecs[] alloc needed=%ld (bvln=%ld stems=%d+1), have=%ld.\n",
1522             allocsz_needed, bitveclen, saveusr.stemcount, bitvec_allocsz);
1523     if (bitvec_allocsz < allocsz_needed) {
1524         if (bitvec_allocp)
1525             free (bitvec_allocp);
1526         bitvec_allocp = austext_malloc (allocsz_needed + 16,
1527             PROGNAME"508", NULL);
1528         if (debugging_boolsrch)
1529             fprintf (aa_stderr, PROGNAME"432 bitvecs[] realloc %ld-->%ld.\n",
1530                 bitvec_allocsz, allocsz_needed);
1531         bitvec_allocsz = allocsz_needed;
1532     }
1533
1534     /* Clear all bitvecs to zero and assign them */
1535     memset (bitvec_allocp, 0, allocsz_needed);
1536     for (i = 0;  i < saveusr.stemcount;  i++)
1537         bitvecs[i] = bitvec_allocp + (i * bitveclen);
1538     result_bitvec = bitvec_allocp + (i * bitveclen);
1539
1540     /* If sorting statistically, allocate weight vector.
1541      * One float for each db record.
1542      */
1543     if (wtvec) {
1544         free (wtvec);
1545         wtvec = NULL;
1546     }
1547     if (do_stat_sort) {
1548         wtvec = austext_malloc ((tot_addr_count + 4) * sizeof(float) + 4,
1549             PROGNAME"040", NULL);
1550         memset (wtvec, 0, (tot_addr_count + 4) * sizeof(float));
1551     }
1552
1553     /* The 'zero permute' is every record that has
1554      * NONE of the query terms in it.  It can only be
1555      * generated if a NOT operator was included in the query.
1556      */
1557     if (need_zero_permute) {
1558         sprintf (msgbuf, catgets (dtsearch_catd, MS_boolsrch, 15,
1559             "%s Your query requires retrieving every\n"
1560             "document in the database that does not have any of\n"
1561             "your query words.  This type of search may take an\n"
1562             "unusually long time."),
1563             PROGNAME"1536");
1564         DtSearchAddMessage (msgbuf);
1565     }
1566
1567     if (debugging_boolsrch)
1568         fflush (aa_stderr);
1569
1570     /* Searches may take a long time.  To allow gui to put a
1571      * a 'working' dialog box and a 'cancel' button,
1572      * we pass execution to workprocs.
1573      * If user cannot cancel search no matter how
1574      * long it may take, we call each of the subsequent
1575      * workproc functions directly from here.
1576      * Otherwise they will themselves setup each
1577      * subsequent call to usrblk.workproc(), as long as
1578      * the previous call returns OE_SEARCHING and the user
1579      * hasn't pushed USR_STOPSRCH.
1580      */
1581     usrblk.workproc = read_stem_bitvec_WK;
1582     save_stemno = 0;    /* global arg for first workproc */
1583     usrblk.workproc();  /* direct call to first workproc */
1584
1585     if ((usrblk.flags & USR_NO_ITERATE) != 0  &&
1586                 (usrblk.debug & USRDBG_ITERATE) == 0) {
1587         while (usrblk.retncode == OE_SEARCHING)
1588             usrblk.workproc();
1589     }
1590     return;
1591
1592 } /* boolean_search() */
1593
1594 /************************** BOOLSRCH.C **********************/