cde/lib/DtSearch/boolpars.c

   1 /*
   2  * CDE - Common Desktop Environment
   3  *
   4  * Copyright (c) 1993-2012, The Open Group. All rights reserved.
   5  *
   6  * These libraries and programs are free software; you can
   7  * redistribute them and/or modify them under the terms of the GNU
   8  * Lesser General Public License as published by the Free Software
   9  * Foundation; either version 2 of the License, or (at your option)
  10  * any later version.
  11  *
  12  * These libraries and programs are distributed in the hope that
  13  * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14  * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with these librararies and programs; if not, write
  20  * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21  * Floor, Boston, MA 02110-1301 USA
  22  */
  23 /* $XConsortium: boolpars.c /main/5 1996/11/25 18:49:27 drk $
  24  *
  25  * (c) Copyright 1996 Digital Equipment Corporation.
  26  * (c) Copyright 1996 Hewlett-Packard Company.
  27  * (c) Copyright 1996 International Business Machines Corp.
  28  * (c) Copyright 1996 Sun Microsystems, Inc.
  29  * (c) Copyright 1996 Novell, Inc.
  30  * (c) Copyright 1996 FUJITSU LIMITED.
  31  * (c) Copyright 1996 Hitachi.
  32  */
  33 /*
  34  *   COMPONENT_NAME: austext
  35  *
  36  *   FUNCTIONS: add_syntax_errmsg
  37  *              boolean_parse
  38  *              boolyac_AND
  39  *              boolyac_COLLOC
  40  *              boolyac_NOT
  41  *              boolyac_OR
  42  *              copy_final_truthtab
  43  *              copy_token
  44  *              creatett
  45  *              freett
  46  *              get_stem_truthtab
  47  *              main
  48  *              process_user_args
  49  *              yyerror
  50  *              yylex
  51  *
  52  *   ORIGINS: 27
  53  *
  54  *
  55  *   (C) COPYRIGHT International Business Machines Corp. 1996
  56  *   All Rights Reserved
  57  *   Licensed Materials - Property of IBM
  58  *   US Government Users Restricted Rights - Use, duplication or
  59  *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  60  */
  61 /********************* BOOLPARS.C ********************
  62  * $Id: boolpars.c /main/5 1996/11/25 18:49:27 drk $
  63  * February 1996.
  64  * AusText/DtSearch yacc-based boolean query parser.
  65  * Converts boolean query into stems array and truth table
  66  * for subsequent search.  Boolyac.y is the yacc source.
  67  * After processing by yacc, it becomes boolyac.c and boolyac.h.
  68  * This module contains all the related C source code: yylex,
  69  * yacc action functions, and the main AusText driver function, boolean_parse.
  70  * Additional information (format of TRUTHTAB) in header file boolpars.h.
  71  *
  72  * $Log$
  73  * Revision 1.4  1996/03/22  23:12:50  miker
  74  * Added string.h header and correctly cast strcspn() calls.
  75  *
  76  * Revision 1.3  1996/03/20  19:14:30  miker
  77  * Enable collocation expressions in stem (type 'S') searches.
  78  *
  79  * Revision 1.2  1996/03/13  22:35:59  miker
  80  * Changed char to UCHAR several places; similar typecasts.
  81  *
  82  * Revision 1.1  1996/03/05  15:52:06  miker
  83  * Initial revision
  84  */
  85 #include "SearchE.h"
  86 #include <string.h>
  87 #include "boolpars.h"
  88 #include "boolyac.h"
  89
  90 #if (DtSrMAX_STEMCOUNT != 8)
  91 #error DtSrMAX_STEMCOUNT is not defined to be 8.
  92 #endif
  93
  94 #define PROGNAME        "BOOLPARS"
  95 #define WORD_ENDERS     " \t\n\f()|@~&"
  96 #define MAX_YYERRORS    4
  97 #define MS_boolpars     28
  98
  99
 100 /****************************************/
 101 /*                                      */
 102 /*               GLOBALS                */
 103 /*                                      */
 104 /****************************************/
 105 int             qry_has_no_NOTs =       FALSE;
 106 int             qry_is_all_ANDs =       FALSE;
 107 TRUTHTAB        final_truthtab =        { 0 };
 108 int             parser_invalid_wordcount = 0;
 109
 110 static int      debugging_boolpars =    FALSE;
 111 static unsigned char
 112                 *final_permutes =       NULL;
 113 static int      last_token_was_boolop = TRUE;
 114 static char     *msgbuf =               NULL;
 115 static UCHAR    *next_lex_char =        NULL;
 116 static int      paren_count =           0;
 117 static TRUTHTAB *ttlist =               NULL;
 118 static int      yyerror_count =         0;
 119 static size_t   yyleng;         /* same as in lex API */
 120 static char     *yytext;        /* same as in lex API */
 121
 122
 123 /****************************************/
 124 /*                                      */
 125 /*           add_syntax_errmsg          */
 126 /*                                      */
 127 /****************************************/
 128 /* Action function called for yacc rules used to trap syntax errors.
 129  * Adds error message identified by msgno to user's msglist.
 130  */
 131 void    add_syntax_errmsg (int msgno)
 132 {
 133     switch (msgno) {
 134         case 1:
 135             /* Message #2 is called in two places */
 136             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 2,
 137                 "%s Query field is empty."),
 138                 PROGNAME"086");
 139             DtSearchAddMessage (msgbuf);
 140             break;
 141
 142         case 2:
 143             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 5,
 144                 "%s Boolean operators must be positioned\n"
 145                 "between words or expressions.  Two sequential words\n"
 146                 "without an operator are interpreted as being separated\n"
 147                 "by the AND operator (&)."),
 148                 PROGNAME"091");
 149             DtSearchAddMessage (msgbuf);
 150             break;
 151
 152         case 3:
 153             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 6,
 154                 "%s Expression in parentheses is missing."),
 155                 PROGNAME"093");
 156             DtSearchAddMessage (msgbuf);
 157             break;
 158
 159         case 4:
 160             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 7,
 161                 "%s NOT operator (~) must be positioned to\n"
 162                 "the left of the word or expression it qualifies."),
 163                 PROGNAME"098");
 164             DtSearchAddMessage (msgbuf);
 165             break;
 166
 167         case 5:
 168             /* Message #3 is called in two places */
 169             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 3,
 170                 "%s COLLOCATION operator (@) may\n"
 171                 "only be positioned between two words."),
 172                 PROGNAME"111");
 173             DtSearchAddMessage (msgbuf);
 174             break;
 175
 176         case 6:
 177             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 4,
 178                 "%s One or more words in your\n"
 179                 "query are not stored in database '%s'.") ,
 180                 PROGNAME"089", usrblk.dblk->label);
 181             DtSearchAddMessage (msgbuf);
 182             break;
 183
 184         default:
 185             sprintf (msgbuf,  catgets(dtsearch_catd, MS_boolpars, 8,
 186                 "%s Invalid boolean query.  Syntax Error #%d.") ,
 187                 PROGNAME"100", msgno);
 188             DtSearchAddMessage (msgbuf);
 189             break;
 190     }
 191     return;
 192 } /* add_syntax_errmsg() */
 193
 194
 195 /****************************************/
 196 /*                                      */
 197 /*              creatett                */
 198 /*                                      */
 199 /****************************************/
 200 /* Constructor for new truth table.
 201  * Allocates it, inits it, and links it into ttlist.
 202  */
 203 static TRUTHTAB *creatett (int stemno, int pmsz, unsigned char *permutes)
 204 {
 205     TRUTHTAB *newtt = austext_malloc (sizeof(TRUTHTAB) + pmsz + 4,
 206         PROGNAME"140", NULL);
 207     memset (newtt, 0, sizeof(TRUTHTAB));
 208     newtt->stemno = stemno;
 209     newtt->pmsz = pmsz;
 210     newtt->permutes = (unsigned char *) (newtt + 1);
 211     memcpy (newtt->permutes, permutes, pmsz);
 212     newtt->next = ttlist;
 213     ttlist = newtt;
 214     return newtt;
 215 } /* creatett() */
 216
 217
 218 /****************************************/
 219 /*                                      */
 220 /*               freett                 */
 221 /*                                      */
 222 /****************************************/
 223 /* Destructor of passed truth table.
 224  * Unlinks it from ttlist and frees it.
 225  */
 226 static void     *freett (TRUTHTAB *argtt)
 227 {
 228     TRUTHTAB    *tt;
 229     TRUTHTAB    **lastlink = &ttlist;
 230     for (tt = ttlist;  tt;  tt = tt->next) {
 231         if (tt == argtt) {
 232             *lastlink = tt->next;
 233             free (tt);
 234             break;
 235         }
 236         lastlink = &tt->next;
 237     }
 238     return;
 239 } /* freett() */
 240
 241
 242 /****************************************/
 243 /*                                      */
 244 /*         copy_final_truthtab          */
 245 /*                                      */
 246 /****************************************/
 247 /* Copys passed truth table into global final_truthtab.
 248  * Returns final_truthtab.
 249  */
 250 TRUTHTAB        *copy_final_truthtab (TRUTHTAB *tt)
 251 {
 252     memset (&final_truthtab, 0, sizeof(TRUTHTAB));
 253     if (!final_permutes)
 254         final_permutes = austext_malloc (300, PROGNAME"788", NULL);
 255     final_truthtab.pmsz =       tt->pmsz;
 256     final_truthtab.permutes =   final_permutes;
 257     memcpy (final_permutes, tt->permutes, final_truthtab.pmsz);
 258     return &final_truthtab;
 259 } /* copy_final_truthtab() */
 260
 261
 262 /****************************************/
 263 /*                                      */
 264 /*          get_stem_truthtab           */
 265 /*                                      */
 266 /****************************************/
 267 /* Subroutine of yylex.  Also used in yacc action functions.
 268  * Creates and returns truth table for passed stem.
 269  * If stem is new, adds it to saveusr.stems array, and adds
 270  * the original query word string to usrblk.stems for msgs.
 271  * Returns NULL and posts err msg if array is full
 272  * or has other error.
 273  */
 274 static TRUTHTAB *get_stem_truthtab (char *newstem, char *origword)
 275 {
 276     int                 i, stemno;
 277     unsigned char       bitmask;
 278     unsigned char       *pmp;
 279     unsigned char       new_permutes [128];
 280     TRUTHTAB            *newtt;
 281
 282     /* Check if stem is already in array */
 283     for (stemno = 0;  stemno < saveusr.stemcount;  stemno++)
 284         if (strcmp (newstem, saveusr.stems[stemno]) == 0)
 285             break;
 286
 287     /* Add new stem to array */
 288     if (stemno == saveusr.stemcount) {
 289         if (++saveusr.stemcount > DtSrMAX_STEMCOUNT) {
 290             sprintf (msgbuf, catgets (dtsearch_catd, MS_boolpars, 9,
 291                 "%s Too many terms in boolean query."),
 292                 PROGNAME"1513");
 293             DtSearchAddMessage (msgbuf);
 294             saveusr.stemcount--;
 295             return NULL;
 296         }
 297         strncpy (saveusr.stems[stemno], newstem, DtSrMAXWIDTH_HWORD);
 298         saveusr.stems [stemno] [DtSrMAXWIDTH_HWORD - 1] = 0;
 299         if (origword) {
 300             strncpy (usrblk.stems[stemno], origword, DtSrMAXWIDTH_HWORD);
 301             usrblk.stems [stemno] [DtSrMAXWIDTH_HWORD - 1] = 0;
 302         }
 303     }
 304
 305     /* Stemno now indicates correct term in saveusr.stems.
 306      * Truth table for a single term has 128 8-bit permutes,
 307      * the 1/2 of all 256 possible permutations that have
 308      * that term's bit switched on.
 309      */
 310     bitmask = 1 << stemno;      /* mask with only newstem's bit on */
 311     pmp = new_permutes;
 312     for (i=0; i<256; i++)
 313         if ((i & bitmask) != 0) {
 314             *pmp = i;
 315             pmp++;
 316         }
 317     newtt = creatett (stemno, 128, new_permutes);
 318     if (debugging_boolpars) {
 319         fprintf (aa_stderr, "   WORD: stem[%d]='%c%s' expr=%p pmsz=%d\n",
 320             stemno,
 321             (saveusr.stems[stemno][0] == STEM_CH) ?
 322                 '~'  :  saveusr.stems[stemno][0],
 323             &saveusr.stems[stemno][1],
 324             newtt, newtt->pmsz);
 325         fflush (aa_stderr);
 326     }
 327     return newtt;
 328 } /* get_stem_truthtab() */
 329
 330
 331 /****************************************/
 332 /*                                      */
 333 /*              boolyac_AND             */
 334 /*                                      */
 335 /****************************************/
 336 /* Action function for AND expression rule.
 337  * Returns set INTERSECTION of passed truth tables,
 338  * ie only the permutes they have in common.
 339  * Any truth table, input or output, can be the empty or
 340  * the universal set.  For example: "(A & B) & ~A" is empty.
 341  */
 342 TRUTHTAB        *boolyac_AND (TRUTHTAB *tt1, TRUTHTAB *tt2) {
 343     TRUTHTAB            *newtt;
 344     unsigned char       new_permutes [256];
 345     int                 pm1, pm2, newpm;
 346
 347     pm1 = pm2 = newpm = 0;
 348     while (pm1 < tt1->pmsz  &&  pm2 < tt2->pmsz) {
 349         if (tt1->permutes[pm1] < tt2->permutes[pm2])
 350             pm1++;
 351         else if (tt1->permutes[pm1] > tt2->permutes[pm2])
 352             pm2++;
 353         else {
 354             new_permutes [newpm++] = tt1->permutes [pm1];
 355             pm1++;
 356             pm2++;
 357         }
 358     }
 359
 360     /* Free old truthtabs, create new one. */
 361     freett (tt1);
 362     freett (tt2);
 363     newtt = creatett (-1, newpm, new_permutes);
 364     if (debugging_boolpars) {
 365         fprintf (aa_stderr, "    AND: exprs=%p,%p-->expr=%p pmsz=%d\n",
 366             tt1, tt2, newtt, newtt->pmsz);
 367         fflush (aa_stderr);
 368     }
 369     return newtt;
 370 } /* boolyac_AND() */
 371
 372
 373 /****************************************/
 374 /*                                      */
 375 /*              boolyac_OR              */
 376 /*                                      */
 377 /****************************************/
 378 /* Action function for OR expression rule.
 379  * Returns set UNION of passed truth tables.
 380  * Any truth table, input or output, can be the empty or
 381  * the universal set.  For example: "A | ~A" is universal.
 382  */
 383 TRUTHTAB        *boolyac_OR (TRUTHTAB *tt1, TRUTHTAB *tt2) {
 384     TRUTHTAB            *newtt;
 385     unsigned char       new_permutes [256];
 386     unsigned char       *permutes1 = tt1->permutes;
 387     unsigned char       *permutes2 = tt2->permutes;
 388     int                 pm1, pm2, newpm;
 389
 390     pm1 = pm2 = newpm = 0;
 391
 392     /* While neither permutes array is exhausted... */
 393     while (pm1 < tt1->pmsz  &&  pm2 < tt2->pmsz) {
 394         if (permutes1[pm1] < permutes2[pm2])
 395             new_permutes [newpm++] = permutes1[pm1++];
 396         else if (permutes2[pm2] < permutes1[pm1])
 397             new_permutes [newpm++] = permutes2[pm2++];
 398         else {
 399             new_permutes [newpm++] = permutes1[pm1++];
 400             pm2++;
 401         }
 402     }
 403     /* After one or both permutes arrays are exhausted... */
 404     while (pm1 < tt1->pmsz)
 405         new_permutes [newpm++] = permutes1[pm1++];
 406     while (pm2 < tt2->pmsz)
 407         new_permutes [newpm++] = permutes2[pm2++];
 408
 409     /* Free old truthtabs, create new one. */
 410     freett (tt1);
 411     freett (tt2);
 412     newtt = creatett (-1, newpm, new_permutes);
 413     if (debugging_boolpars) {
 414         fprintf (aa_stderr, "     OR: exprs=%p,%p-->expr=%p pmsz=%d\n",
 415             tt1, tt2, newtt, newtt->pmsz);
 416         fflush (aa_stderr);
 417     }
 418     return newtt;
 419 } /* boolyac_OR() */
 420
 421
 422 /****************************************/
 423 /*                                      */
 424 /*              boolyac_NOT             */
 425 /*                                      */
 426 /****************************************/
 427 /* Action function for NOT expression rule.
 428  * Returns set COMPLEMENT of passed truth table,
 429  * ie the universal set minus the passed set,
 430  * ie all possible permutes except those passed.
 431  * Either the old or the new truth table can be
 432  * the empty or the universal set.
 433  */
 434 TRUTHTAB        *boolyac_NOT (TRUTHTAB *oldtt) {
 435     TRUTHTAB            *newtt;
 436     unsigned char       new_permutes [256];
 437     int                 oldpm, newpm;
 438     int                 candidate;
 439
 440     oldpm = newpm = 0;
 441     for (candidate = 0;  candidate < 256;  candidate++) {
 442         if (oldpm >= oldtt->pmsz  ||  candidate < oldtt->permutes [oldpm]) {
 443             new_permutes [newpm++] = candidate;
 444         }
 445         /*
 446          * oldtt not done  &&  candidate == oldtt.
 447          * (candidate > oldtt not possible).
 448          */
 449         else {
 450             oldpm++;
 451         }
 452     }
 453     freett (oldtt);
 454     newtt = creatett (-1, newpm, new_permutes);
 455     if (debugging_boolpars) {
 456         fprintf (aa_stderr, "    NOT: expr=%p-->expr=%p pmsz=%d\n",
 457             oldtt, newtt, newtt->pmsz);
 458         fflush (aa_stderr);
 459     }
 460     return newtt;
 461 } /* boolyac_NOT() */
 462
 463
 464 /****************************************/
 465 /*                                      */
 466 /*            boolyac_COLLOC            */
 467 /*                                      */
 468 /****************************************/
 469 /* Action function for COLLOCATION expression rule.
 470  * The record set satisfying a collocation expression is
 471  * generated dynamically.  At the parse level it is equivalent
 472  * to a separate 'word' with its own (undetermined) record set.
 473  * So it's given its own slot in saveusr.stems.  The word
 474  * in saveusr.stems is formated "@ssttv[v...]" where ss and tt are
 475  * ascii numbers that index the original collocated words
 476  * in saveusr.stems, and v... is the collocation value integer.
 477  * For example, "@03005" represents the collocation of stem
 478  * number 3 and stem number 0, with collocation value 5.
 479  *
 480  * Returns NULL and errmsg on msglist if any problems.
 481  */
 482 TRUTHTAB        *boolyac_COLLOC (
 483                     TRUTHTAB    *word1tt,
 484                     int         colloc_val,
 485                     TRUTHTAB    *word2tt)
 486 {
 487     TRUTHTAB    *newtt;
 488     char        wordbuf [DtSrMAXWIDTH_HWORD];
 489
 490     if (word1tt->stemno < 0  ||  word2tt->stemno < 0) {
 491         /* Message #3 is called in two places */
 492         sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 3,
 493             "%s COLLOCATION operator (@) may\n"
 494             "only be positioned between two words."),
 495             PROGNAME"371");
 496         DtSearchAddMessage (msgbuf);
 497         return NULL;
 498     }
 499     if (word1tt->stemno == word2tt->stemno) {
 500         sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 12,
 501             "%s Collocation operator is not\n"
 502             "permitted between identical words."),
 503             PROGNAME"377");
 504         DtSearchAddMessage (msgbuf);
 505         return NULL;
 506     }
 507     sprintf (wordbuf, COLLOC_STEM_FORMAT,
 508         word1tt->stemno, word2tt->stemno, colloc_val);
 509     if ((newtt = get_stem_truthtab (wordbuf, wordbuf)) == NULL)
 510         return NULL;
 511     freett (word1tt);
 512     freett (word2tt);
 513     if (debugging_boolpars) {
 514         fprintf (aa_stderr, " COLLOC: exprs=%p,%p-->expr=%p pmsz=%d\n",
 515             word1tt, word2tt, newtt, newtt->pmsz);
 516         fflush (aa_stderr);
 517     }
 518     return newtt;
 519 } /* boolyac_COLLOC() */
 520
 521
 522 /****************************************/
 523 /*                                      */
 524 /*               yyerror                */
 525 /*                                      */
 526 /****************************************/
 527 /* Replaces standard yacc error routine. */
 528 void    yyerror (char *msg) {
 529     if (strcmp (msg, "syntax error") == 0) {
 530         if (DtSearchHasMessages())
 531             return;
 532         else if (parser_invalid_wordcount > 0)
 533             add_syntax_errmsg(6);
 534         else {
 535             sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 1,
 536                 "%s Your search string is an invalid\n"
 537                 "boolean query.  Please reformulate and try again."),
 538                 PROGNAME"001");
 539             DtSearchAddMessage (msgbuf);
 540         }
 541     }
 542     else
 543         DtSearchAddMessage (msg);
 544     return;
 545 } /* yyerror() */
 546
 547
 548 /****************************************/
 549 /*                                      */
 550 /*              copy_token              */
 551 /*                                      */
 552 /****************************************/
 553 /* Subroutine of yylex().  Copies passed substring
 554  * Into a zero-terminated buffer of its own.
 555  * Static buffer good until next call.
 556  */
 557 static char     *copy_token (UCHAR *tokenp, size_t toklen)
 558 {
 559     static char         *buf =  NULL;
 560     static size_t       bufsz = 0;
 561     if (toklen > bufsz) {
 562         if (buf)
 563             free (buf);
 564         bufsz = toklen + (toklen >> 1); /* 1.5 times size needed */
 565         buf = austext_malloc (bufsz + 4, PROGNAME"182", NULL);
 566     }
 567     strncpy (buf, (char *) tokenp, toklen);
 568     buf [toklen] = 0;
 569     return buf;
 570 } /* copy_token() */
 571
 572
 573 /****************************************/
 574 /*                                      */
 575 /*                 yylex                */
 576 /*                                      */
 577 /****************************************/
 578 /* Delivers tokens to yyparse() from usrblk.query */
 579 int     yylex (void)
 580 {
 581     int         retn_token;
 582     PARG        parg;
 583     char        *stembufp;
 584     char        mystembuf [DtSrMAXWIDTH_HWORD + 4];
 585
 586 GET_ANOTHER_TOKEN:
 587
 588     /* Skip white space */
 589     while (ascii_charmap[*next_lex_char] & WHITESPACE)
 590         next_lex_char++;
 591
 592     /* Terminating zero indicates end of query and end of parse.
 593      * Automatically close unbalanced parentheses.
 594      */
 595     if (*next_lex_char == 0) {
 596         if (paren_count > 0) {
 597             paren_count--;
 598             retn_token =        ')';
 599             yytext =            ")";
 600             yyleng =            1;
 601             goto DELIVER_TOKEN;
 602         }
 603         retn_token =            0;
 604         yytext =                "";
 605         yyleng =                0;
 606         goto DELIVER_TOKEN;
 607     }
 608
 609     switch (*next_lex_char) {
 610         case '|':       /* OR operator */
 611             last_token_was_boolop =     TRUE;
 612             retn_token =                '|';
 613             yytext =                    "|";
 614             yyleng =                    1;
 615             next_lex_char++;
 616             break;
 617
 618         case '~':       /* NOT operator */
 619             if (!last_token_was_boolop) {
 620                 /* Generate implied AND between words
 621                  * and parenthesized expressions.
 622                  * A NOT is not itself boolean; it must
 623                  * precede the next word or expression.
 624                  */
 625                 last_token_was_boolop = TRUE;
 626                 retn_token =            '&';
 627                 yytext =                "&";
 628                 yyleng =                1;
 629                 break;
 630             }
 631             last_token_was_boolop =     TRUE;
 632             retn_token =                '~';
 633             yytext =                    "~";
 634             yyleng =                    1;
 635             next_lex_char++;
 636             break;
 637
 638         case '&':       /* AND operator */
 639             if (last_token_was_boolop && qry_is_all_ANDs) {
 640                 /* Ignore multiple AND operators.
 641                  * These might occur if we silently
 642                  * discarded some invalid words.
 643                  */
 644                 next_lex_char++;
 645                 goto GET_ANOTHER_TOKEN;
 646             }
 647             last_token_was_boolop =     TRUE;
 648             retn_token =                '&';
 649             yytext =                    "&";
 650             yyleng =                    1;
 651             next_lex_char++;
 652             break;
 653
 654         case '(':       /* OPEN parentheses */
 655             if (!last_token_was_boolop) {
 656                 /* Generate implied AND between words
 657                  * and parenthesized expressions.
 658                  */
 659                 last_token_was_boolop = TRUE;
 660                 retn_token =            '&';
 661                 yytext =                "&";
 662                 yyleng =                1;
 663                 break;
 664             }
 665             paren_count++;
 666             retn_token =                '(';
 667             yytext =                    "(";
 668             yyleng =                    1;
 669             next_lex_char++;
 670             break;
 671
 672         case ')':       /* CLOSE parentheses */
 673             /* Just discard excessive right parentheses */
 674             if (--paren_count < 0) {
 675                 paren_count = 0;
 676                 next_lex_char++;
 677                 goto GET_ANOTHER_TOKEN;
 678             }
 679             last_token_was_boolop =     FALSE;
 680             retn_token =                ')';
 681             yytext =                    ")";
 682             yyleng =                    1;
 683             next_lex_char++;
 684             break;
 685
 686         case '@':       /* COLLOCATION operator */
 687             /* Collocation token:
 688              * Token is defined as the collocation char followed
 689              * by one or more numeric digits: "@#[#...]".
 690              * Syntactically it's a kind of an AND operator.
 691              * Semantically it's a pseudo word token
 692              * (it will occupy a slot in the stems array).
 693              * The yylval is the integer value following
 694              * the collocation character.
 695              */
 696             yyleng = strcspn ((char *) next_lex_char + 1, WORD_ENDERS) + 1;
 697             yytext = copy_token (next_lex_char, yyleng);
 698             next_lex_char += yyleng;
 699
 700             if ((usrblk.dblk->dbrec.or_dbaccess & ORA_BLOB) == 0) {
 701                 retn_token = ERROR_TOKEN;
 702                 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 10,
 703                     "%s Collocation searches not available for database '%s'."),
 704                     PROGNAME"2567", usrblk.dblk->label);
 705                 DtSearchAddMessage (msgbuf);
 706                 break;
 707             }
 708             yylval.int_val = atoi (yytext + 1);
 709             if (yylval.int_val <= 0) {
 710                 retn_token = ERROR_TOKEN;
 711                 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 11,
 712                     "%s Collocation operator '%.*s' is invalid.\n"
 713                     "Correct format is '@n' where n is greater than zero.") ,
 714                     PROGNAME"294", DtSrMAXWIDTH_HWORD, yytext);
 715                 DtSearchAddMessage (msgbuf);
 716                 break;
 717             }
 718             last_token_was_boolop =     TRUE;
 719             retn_token =                COLLOC_TOKEN;
 720             break;
 721
 722
 723         default:
 724             /* Presumed word token:
 725              * Token is all text chars until next whitespace,
 726              * next lex token, or end of string.
 727              * Linguistically parse it and optionally stem it.
 728              * The token value is the truth table for one
 729              * word: all permutes with only that word's
 730              * bits turned on.  If the word is already
 731              * in the stems array, then the permutes
 732              * position is the word's index in the array.
 733              * If the word is not in the array, it's added.
 734              * If the array is full, then an error is reported.
 735              */
 736             if (!last_token_was_boolop) {
 737                 /* Generate implied AND between words
 738                  * and parenthesized expressions.
 739                  */
 740                 last_token_was_boolop = TRUE;
 741                 retn_token =            '&';
 742                 yytext =                "&";
 743                 yyleng =                1;
 744                 break;
 745             }
 746             yyleng = strcspn ((char *) next_lex_char, WORD_ENDERS);
 747             yytext = copy_token (next_lex_char, yyleng);
 748             next_lex_char += yyleng;
 749             /*
 750              * Linguistically parse the token.
 751              * Failure can occur because word is too short
 752              * or too long, it's on the stoplist, etc.
 753              * Setting PA_MSGS causes parser to explain
 754              * invalid words with a msg.
 755              */
 756             memset (&parg, 0, sizeof(PARG));
 757             parg.dblk =         usrblk.dblk;
 758             parg.string =       yytext;
 759             /*****if (!qry_is_all_ANDs)********/
 760                 parg.flags =    PA_MSGS;
 761             stembufp = usrblk.dblk->parser (&parg);
 762             if (debugging_boolpars) {
 763                 fprintf (aa_stderr, "   lang: '%s' -> '%s'\n",
 764                     yytext, (stembufp)? stembufp : "<null>");
 765                 fflush (aa_stderr);
 766             }
 767             /*
 768              * If token is not a linguistically valid word,
 769              * one of two things can happen.  If the query
 770              * is all_ANDs (most common type) we silently
 771              * ignore the token.
 772              * Otherwise report error and quit now.
 773              */
 774             if (stembufp == NULL) {
 775                 parser_invalid_wordcount++;
 776                 if (qry_is_all_ANDs)
 777                     goto GET_ANOTHER_TOKEN;
 778                 retn_token = ERROR_TOKEN;
 779                 if (!DtSearchHasMessages()) {
 780                     sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 13,
 781                         "%s Word '%.*s' is invalid.") ,
 782                         PROGNAME"315", DtSrMAXWIDTH_HWORD, yytext);
 783                     DtSearchAddMessage (msgbuf);
 784                 }
 785                 break;
 786             }
 787             if (strlen(stembufp) != strlen(yytext)) {
 788                 retn_token =            ERROR_TOKEN;
 789                 sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 14,
 790                     "%s String '%.*s' is not a single word.") ,
 791                     PROGNAME"634", DtSrMAXWIDTH_HWORD, yytext);
 792                 DtSearchAddMessage (msgbuf);
 793                 break;
 794             }
 795             /*
 796              * If stemming, we must prefix term with
 797              * special stem char in the stems array.
 798              */
 799             if (usrblk.request == OE_SRCH_STEMS) {
 800                 stembufp = usrblk.dblk->stemmer (stembufp, usrblk.dblk);
 801                 if (debugging_boolpars) {
 802                     fprintf (aa_stderr, " stemer: -> '%s'\n", stembufp);
 803                     fflush (aa_stderr);
 804                 }
 805                 mystembuf[0] = STEM_CH;
 806                 strncpy (mystembuf + 1, stembufp, DtSrMAXWIDTH_HWORD);
 807                 mystembuf [DtSrMAXWIDTH_HWORD - 1] = 0;
 808                 stembufp = mystembuf;
 809             }
 810
 811             /* Load stem into stems arrays and return its truth table. */
 812             if (yylval.truthtab = get_stem_truthtab (stembufp, yytext)) {
 813                 retn_token =            WORD_TOKEN;
 814                 last_token_was_boolop = FALSE;
 815             }
 816             else
 817                 retn_token =            ERROR_TOKEN;
 818             break;
 819
 820     } /* switch on *next_lex_char */
 821
 822 DELIVER_TOKEN:
 823     if (debugging_boolpars) {
 824         fprintf (aa_stderr,
 825             "  yylex: op?=%d parct=%d tok#=%d lval=%p%sYYTEXT='%s'\n",
 826             last_token_was_boolop, paren_count,
 827             retn_token, yylval.truthtab,
 828             (retn_token == COLLOC_TOKEN)? "\t\t" : "\t",
 829             yytext);
 830         fflush (aa_stderr);
 831     }
 832     return retn_token;
 833
 834 } /* yylex() */
 835
 836
 837 /****************************************/
 838 /*                                      */
 839 /*             boolean_parse            */
 840 /*                                      */
 841 /****************************************/
 842 /* Called from Opera_Engine for boolean searches.
 843  * Driver for yyparse().
 844  * Expects usrblk.request == OE_SRCH_STEMS or OE_SRCH_WORDS.
 845  * If parse is completely successful (query is valid), outputs
 846  *   saveusr.stemcount,
 847  *   saveusr.stems (stemmed if necessary with STEM_CH as first char,
 848  *      and phony colloc words with '@' as first char),
 849  *   usrblk.stems (original unstemmed query terms for err msgs),
 850  *   final_truthtab,
 851  *   qry_has_no_NOTs,
 852  *   qry_is_all_ANDs,
 853  * and returns TRUE.  Truthtab allocation good until next call.
 854  * If parse fails, returns FALSE and err msg(s) on msglist.
 855  */
 856 int     boolean_parse (void)
 857 {
 858     int         i;
 859     char        *cptr;
 860     TRUTHTAB    *tt, *ttnext;
 861
 862     debugging_boolpars = (usrblk.debug & USRDBG_BOOL);
 863     if (!msgbuf)
 864         msgbuf = austext_malloc (300 + DtSrMAXWIDTH_HWORD,
 865                 PROGNAME"255", NULL);
 866
 867     /* Test for empty query */
 868     if (usrblk.query == NULL) {
 869 EMPTY_QUERY:
 870         /* Message #2 is called in two places */
 871         sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 2,
 872             "%s Query is empty."), PROGNAME"289");
 873         DtSearchAddMessage (msgbuf);
 874         return FALSE;
 875     }
 876     for  (cptr = usrblk.query;  *cptr;  cptr++) {
 877         if ((ascii_charmap[*cptr] & WHITESPACE) == 0)
 878             break;
 879     }
 880     if (*cptr == 0)
 881         goto EMPTY_QUERY;
 882
 883     /* Init globals for yylex and yyparse */
 884     next_lex_char =             (UCHAR *) usrblk.query;
 885     paren_count =               0;
 886     yyerror_count =             0;
 887     last_token_was_boolop =     TRUE;
 888     saveusr.stemcount =         0;
 889     parser_invalid_wordcount =  0;
 890
 891     /* Query "is all ANDS" if it has no ORs, NOTs, or COLLOCs.
 892      * Missing or linguistically invalid words will be silently
 893      * discarded for all_ANDs queries.
 894      * Query "has no NOTs" if it has no NOTs.
 895      * Results from queries without NOTs can be statistically sorted.
 896      */
 897     qry_has_no_NOTs = !strchr (usrblk.query, '~');
 898     qry_is_all_ANDs = !strpbrk (usrblk.query, "|~@");
 899
 900     if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
 901         fprintf (aa_stderr,
 902             "start boolean_parse: stem?=%d allANDs?=%d noNOTs?=%d\n"
 903             "  query: '%s'\n",
 904             (usrblk.request == OE_SRCH_STEMS),
 905             qry_is_all_ANDs, qry_has_no_NOTs, usrblk.query);
 906         fflush (aa_stderr);
 907     }
 908
 909     if (yyparse() != 0)
 910         return FALSE;
 911
 912     /* Free entire remaining ttlist.  Only you
 913      * can prevent forest fires and memory leaks.
 914      */
 915     tt = ttlist;
 916     while (tt) {
 917         ttnext = tt->next;
 918         free (tt);
 919         tt = ttnext;
 920     }
 921     ttlist = NULL;
 922
 923     if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
 924         print_stems (saveusr.stemcount, saveusr.stems,
 925             PROGNAME"815 end boolean_parse, syntax ok,");
 926         fprintf (aa_stderr, "  permutes=%d:", final_truthtab.pmsz);
 927         for (i=0;  i<16;  i++) {
 928             if (i >=  final_truthtab.pmsz)
 929                 break;
 930             fprintf (aa_stderr, " %02x", final_truthtab.permutes [i]);
 931         }
 932         fputc ('\n', aa_stderr);
 933         fflush (aa_stderr);
 934     }
 935
 936     if (final_truthtab.pmsz <= 0) {
 937         sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 15,
 938             "%s Your query cannot logically return\n"
 939             "any records.  Please reformulate and try again."),
 940             PROGNAME"334");
 941         DtSearchAddMessage (msgbuf);
 942         return FALSE;
 943     }
 944     if (final_truthtab.pmsz >= 256) {
 945         sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 16,
 946             "%s Your query will return entire database\n"
 947             "'%s'.  Please reformulate and try again.") ,
 948             PROGNAME"341", usrblk.dblk->label);
 949         DtSearchAddMessage (msgbuf);
 950         return FALSE;
 951     }
 952     return TRUE;
 953 } /* boolean_parse() */
 954
 955
 956 #ifdef TESTBOOL /*-----------------------------------------------*/
 957
 958 USRBLK          usrblk = { 0 };
 959 DBLK            dblk;
 960 SAVEUSR         saveusr = { 0 };
 961 extern int      debugging_teskey;
 962 extern int      debugging_paice;
 963 extern int      debugging_jpn;
 964
 965 /****************************************/
 966 /*                                      */
 967 /*          process_user_args           */
 968 /*                                      */
 969 /****************************************/
 970 /* Subroutine of main().  Validates and loads global
 971  * variables with values from command line arguments.
 972  */
 973 static void     process_user_args (int argc, char *argv[])
 974 {
 975     int         i;
 976     char        *argptr;
 977     char        *cptr;
 978     char        *src, *targ;
 979     int         oops = FALSE;
 980
 981     /* Each pass grabs new parm of "-xxx" format */
 982     argc--, argv++;
 983     while (argc > 0) {
 984         argptr = argv[0];
 985         if (*argptr != '-')
 986             break;
 987         switch (argptr[1]) {
 988             case 'm':
 989                 if (argptr[2] == 'x')
 990                     dblk.dbrec.or_maxwordsz = atoi (argptr + 3);
 991                 else if (argptr[2] == 'n')
 992                     dblk.dbrec.or_minwordsz = atoi (argptr + 3);
 993                 else
 994                     goto BAD_ARG;
 995                 break;
 996
 997             case 'l':
 998                 dblk.dbrec.or_language = atoi (argptr + 2);
 999                 break;
1000
1001             case 'd':
1002                 for (cptr = argptr+2;  *cptr != 0;  cptr++) {
1003                     switch (*cptr) {
1004                         case 't': debugging_teskey = TRUE; break;
1005                         case 'p': debugging_paice = TRUE; break;
1006                         case 'j': debugging_jpn = TRUE; break;
1007                         default:
1008                             oops = TRUE;
1009                             fprintf (aa_stderr,
1010                                 "%s Invalid debug option %c.\a\n",
1011                                 PROGNAME"049", *cptr);
1012                             break;
1013                     }
1014                 }
1015                 break;
1016
1017 BAD_ARG:
1018             default:
1019                 oops = TRUE;
1020                 fprintf (aa_stderr,
1021                     "%s Invalid command line argument '%s'.\a\n",
1022                     PROGNAME"059", argptr);
1023                 break;
1024         } /* end switch */
1025
1026         argc--, argv++;
1027     } /* main loop on each arg */
1028
1029
1030     if (oops) {
1031         fprintf (aa_stderr,
1032             "\nUSAGE: %s [options]\n"
1033             "  -mx#   maximum word size.\n"
1034             "  -mn#   minimum word size.\n"
1035             "  -dtpj  Debug: Teskey, Paice, Japanese.\n"
1036             "  -l#    language number.  Default 0.\n",
1037             aa_argv0);
1038         exit(2);
1039     }
1040     return;
1041 }  /* process_user_args() */
1042
1043
1044 /****************************************/
1045 /*                                      */
1046 /*                 main                 */
1047 /*                                      */
1048 /****************************************/
1049 int     main    (int argc, char *argv[])
1050 {
1051     int         i;
1052     int         valid_boolpars;
1053     char        *cptr;
1054     char        linebuf [1024];
1055
1056     /* Init global variables */
1057     aa_argv0 = argv[0];
1058
1059     memset (&usrblk, 0, sizeof(USRBLK));
1060     usrblk.dblk = &dblk;
1061     usrblk.debug |= USRDBG_BOOL;        /* set debugging_boolpars */
1062
1063     memset (&dblk, 0, sizeof(DBLK));
1064     strcpy (dblk.name, "testbool");
1065     dblk.label = dblk.name;
1066     dblk.dbrec.or_dbaccess |= ORA_BLOB; /* enable collocations */
1067
1068     /* Read command line args */
1069     process_user_args (argc, argv);
1070
1071     if (!load_language (&dblk, NULL)) {
1072         fprintf (aa_stderr,
1073             PROGNAME"140 load_language() failed.  Msgs:\n%s\n",
1074             DtSearchGetMessages());
1075         return 2;
1076     }
1077     fprintf (aa_stderr, "  lang=%d minwdsz=%d maxwdsz=%d.\n",
1078         dblk.dbrec.or_language,
1079         dblk.dbrec.or_minwordsz,
1080         dblk.dbrec.or_maxwordsz);
1081
1082     /* Main loop.  Each line is a boolean query. */
1083     printf ("Enter an AusText boolean query.  'q' or '.' to quit.\n"
1084         "If first char is '$', words will be stemmed:\n> ");
1085     fflush (stdout);
1086     while (fgets (linebuf, sizeof(linebuf), stdin) != NULL) {
1087
1088         linebuf [sizeof(linebuf) - 1] = 0;
1089         if (strcmp (linebuf, ".\n") == 0)
1090             break;
1091         if (strcmp (linebuf, "q\n") == 0)
1092             break;
1093         if (linebuf[0] == '\n')
1094             break;
1095         linebuf [strlen(linebuf) - 1] = 0;      /* overlay \n */
1096
1097         if (linebuf[0] == '$') {
1098             usrblk.query = linebuf + 1;
1099             usrblk.request = OE_SRCH_STEMS;
1100         }
1101         else {
1102             usrblk.query = linebuf;
1103             usrblk.request = OE_SRCH_WORDS;
1104         }
1105
1106         if (!boolean_parse())
1107             puts (PROGNAME"707 boolean_parse() returned FALSE (OE_BAD_QUERY).");
1108         if (DtSearchHasMessages()) {
1109             printf ("mmmmm Messages returned to user mmmmmmmmmmmmmmmmmm\n"
1110                 "%s\nmmmmm End of messages to user mmmmmmmmmmmmmmmmmmmm\n",
1111                 DtSearchGetMessages());
1112             DtSearchFreeMessages();
1113         }
1114
1115     printf ("--------------------------------\n> ");
1116     fflush (stdout);
1117     } /* main read loop for each query line */
1118     return 0;
1119 } /* main() */
1120
1121 #endif /* TESTBOOL */
1122
1123 /********************* BOOLPARS.C ********************/
1124