2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these libraries and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
23 /* $XConsortium: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
25 * (c) Copyright 1996 Digital Equipment Corporation.
26 * (c) Copyright 1996 Hewlett-Packard Company.
27 * (c) Copyright 1996 International Business Machines Corp.
28 * (c) Copyright 1996 Sun Microsystems, Inc.
29 * (c) Copyright 1996 Novell, Inc.
30 * (c) Copyright 1996 FUJITSU LIMITED.
31 * (c) Copyright 1996 Hitachi.
34 * COMPONENT_NAME: austext
36 * FUNCTIONS: count_words
42 * (C) COPYRIGHT International Business Machines Corp. 1994,1996
44 * Licensed Materials - Property of IBM
45 * US Government Users Restricted Rights - Use, duplication or
46 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
48 /*********************** DTSRKDUMP.C *************************
49 * $Id: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
51 * Dumps a DtSearch/AusText keyfile to stdout.
52 * Renamed from auskdump for DtSearch.
55 * Revision 2.3 1996/04/10 21:19:28 miker
56 * Program renamed from auskdump with minor cleanup.
59 * *** Log: auskdump.c,v ***
60 * Revision 2.2 1995/10/19 20:29:37 miker
61 * Permit accessing of read-only databases.
62 * Revision 2.1 1995/09/22 18:55:59 miker
63 * Freeze DtSearch 0.1, AusText 2.1.8
64 * Revision 1.11 1995/09/19 21:47:26 miker
65 * Added explanation of '*' in report.
66 * Revision 1.10 1995/09/06 14:18:33 miker
67 * Fixed bug: -p value incorrectly converted to double because
68 * atof() function prototype was not provided from stdlib.h.
69 * Revision 1.9 1995/09/01 23:58:57 miker
70 * Minor name changes for DtSearch.
71 * Print err msgs when databases fail to open.
72 * Revision 1.8 1995/05/30 18:40:12 miker
73 * Print progress dots and some additional dbrec info.
83 #define PROGNAME "DTSRKDUMP"
84 #define MIN_THRESHOLD 100L
85 #define KEYS_PER_DOT 1000
86 #define MS_dtsrkdump 25
88 /*----------------- GLOBALS -------------------*/
/* Tally array shared by main() and count_words(). main() allocates it
 * twice: once for the per-keytype object counts and once for the
 * stem/word counts. */
90 static long *counters = NULL; /* allocated array */
/* Verbose-mode flag; the usage text describes -v as listing every key. */
91 static int do_verbose = FALSE;
/* Minimum address count a word must have before count_words() prints
 * it. Starts at MIN_THRESHOLD; main() replaces it from -t<N> or derives
 * it from the -p<N> percentage and the database record count. */
93 static long min_threshold = MIN_THRESHOLD;
/* Largest database address; main() seeds it from dbrec.or_maxdba and
 * compares it against the low 24 bits of each object key's dba. */
94 static long maxdba = 0L;
96 static struct or_dbrec
99 /****************************************/
/*
 * count_words() scans one word-key field of the open database from the
 * first key to the last (KEYFRST, then KEYNEXT while db_status stays
 * S_OKAY). Each key is tallied into the global counters[] array, and a
 * key whose address count reaches min_threshold gets a detail line.
 *
 * index: base slot in counters[] for this scan. main() calls this with
 *        0 (short), 2 (long) and 4 (huge).
 *
 * NOTE(review): several source lines of this function are not visible
 * in this view (braces, the selector on 'index', some output
 * statements). The comments below describe only what is visible.
 */
103 /****************************************/
104 void count_words (int index)
/* NOTE(review): the local named 'free' hides the C library free()
 * for the rest of this function. */
108 DtSrINT32 offset, free, addrs;
/* Choose which key field to scan. The selecting construct is not
 * visible here; presumably it switches on 'index' -- confirm. */
114 vista_field = OR_SWORDKEY;
116 vista_field = OR_LWORDKEY;
118 vista_field = OR_HWORDKEY;
/* Error report; the condition that reaches it is not visible here
 * (presumably an unexpected 'index' value -- confirm). */
120 printf (catgets (dtsearch_catd, MS_dtsrkdump, 1,
121 "%s Program Error Abort.\a\n"),
/* Walk every key of the selected field. */
126 KEYFRST (PROGNAME"36", vista_field, 0);
127 while (db_status == S_OKAY) {
128 KEYREAD (PROGNAME"48", buf);
/* Tally the key. A first byte of STEM_CH marks a stem. Only the
 * counters[index + 1] increment is visible; presumably that is the
 * word (non-stem) branch and stems go to counters[index] -- confirm. */
129 if (buf[0] == STEM_CH)
132 (counters[index + 1])++;
/* Fetch the current record's database address. */
135 CRGET (PROGNAME"58", &dba, 0);
/* Read offset, free and address counts from the fields matching the
 * key size (short / long / huge). The selecting construct is not
 * visible here. */
139 CRREAD (PROGNAME"66", OR_SWOFFSET, &offset, 0);
140 CRREAD (PROGNAME"67", OR_SWFREE, &free, 0);
141 CRREAD (PROGNAME"68", OR_SWADDRS, &addrs, 0);
144 CRREAD (PROGNAME"76", OR_LWOFFSET, &offset, 0);
145 CRREAD (PROGNAME"77", OR_LWFREE, &free, 0);
146 CRREAD (PROGNAME"78", OR_LWADDRS, &addrs, 0);
149 CRREAD (PROGNAME"86", OR_HWOFFSET, &offset, 0);
150 CRREAD (PROGNAME"87", OR_HWFREE, &free, 0);
151 CRREAD (PROGNAME"88", OR_HWADDRS, &addrs, 0);
/* Detail line only for keys with at least min_threshold addresses. */
157 if (addrs >= min_threshold) {
/* Echo the key, showing any byte below 32 as a tilde. */
160 for (ptr = (UCHAR *) buf; *ptr != 0; ptr++) {
161 putchar ((*ptr >= 32) ? *ptr : '~');
/* Advance tabstop to 22; the loop body is not visible here
 * (presumably it emits padding -- confirm). */
165 while (tabstop++ < 22)
/* An asterisk flags a key whose address count is at least the
 * database record count. dba is printed as two parts: the bits above
 * bit 23 and the low 24 bits.
 * NOTE(review): offset, addrs and free are DtSrINT32 but are printed
 * with long conversions, and dba >> 24 with an int conversion. Those
 * types are declared elsewhere; confirm the conversions match. */
167 printf (catgets(dtsearch_catd, MS_dtsrkdump, 2,
168 "%c dba=%d:%-7ld ofs=%-9ld adr=%-6ld fre=%ld\n"),
169 (addrs >= dbrec.or_reccount) ? '*' : ' ',
170 dba >> 24, dba & 0xffffff, offset, addrs, free);
174 else { /* !verbose */
/* Progress indicator: triggers once per KEYS_PER_DOT keys, with extra
 * steps on every 10th and 50th trigger. The output statements are not
 * visible here. */
175 if (++keycount % KEYS_PER_DOT == 0) {
177 if (++dotcount % 10 == 0)
179 if (dotcount % 50 == 0) {
185 } /* end !verbose dot printing */
187 KEYNEXT (PROGNAME"98", vista_field, 0);
188 } /* end object key read loop */
193 } /* count_words() */
196 /****************************************/
/*
 * main() -- command-line driver. From the visible statements it:
 *   - prints a version banner, and prints the usage text in the branch
 *     opposite to the one marked "argc >= 2";
 *   - parses option tokens (the -p<N> and -t<N> handlers and the
 *     unknown-option error are visible; -o, -w and -v are described by
 *     the usage text);
 *   - splits the database argument into path and name and opens the
 *     database with austext_dopen();
 *   - in the do_objkeys section, scans OR_OBJKEY keys and reports
 *     counts grouped by the first byte of each key;
 *   - in the do_wordkeys section, calls count_words() for short, long
 *     and huge word keys and reports the six stem/word totals.
 *
 * NOTE(review): many source lines of this function, including its
 * closing brace, are not visible in this view. The comments added
 * below describe only the statements that are visible.
 */
200 /****************************************/
201 int main (int argc, char *argv[])
209 int do_objkeys = FALSE;
210 int do_wordkeys = FALSE;
/* Receives the second token of the RCS keyword string scanned below;
 * "/main/3" needs exactly these 8 bytes including the terminator. */
212 char rcs_revision [8];
215 double percent = 0.0;
216 int listing_most_words = FALSE;
/* printf formats for the six word counters, in counters[] order. */
218 static char *word_labels[6] =
220 "Short Stems = %8ld\n", "Short Words = %8ld\n",
221 "Long Stems = %8ld\n", "Long Words = %8ld\n",
222 "Huge Stems = %8ld\n", "Huge Words = %8ld\n"
/* Skip the first token of the keyword string and keep the second. */
227 sscanf ("$Revision: /main/3 $", "%*s %s", rcs_revision);
/* Take the locale from the environment and open the message catalog. */
229 setlocale (LC_ALL, "");
230 dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
/* Format a date/time string into buf for the banner (the time argument
 * is on a line not visible here). */
232 strftime (buf, sizeof (buf), "%m/%d/%Y, %I:%M %p",
/* Banner: program name, revision, engine version, timestamp. */
234 printf (catgets(dtsearch_catd, MS_dtsrkdump, 3,
235 "%s %s, engine %s. %s.\n"),
236 aa_argv0, rcs_revision, AUSAPI_VERSION, buf);
/* Usage text. */
240 printf (catgets(dtsearch_catd, MS_dtsrkdump, 4,
241 "\nUSAGE: %s -o|w|ow [-v] [-t<N> | -p<N>] dbname\n"
242 " Reads DtSearch key files and prints summary report.\n"
243 " -o Keys examined are OBJECT record keys.\n"
244 " -w Keys examined are inverted index WORDS.\n"
245 " -v VERBOSE mode, lists every key.\n"
246 " -t<N> Threshold. Sets w and v options, and lists only words\n"
247 " with >= <N> addresses. All words will be listed if <N> = 1.\n"
248 " -p<N> Another threshold. Same as -t except <N> is percent\n"
249 " of the entire database (<N> may include a decimal point).\n"
250 " For example -p99.9 prints out every word that occurs\n"
251 " in 99.9%% or more of the records--an excellent way to find\n"
252 " candidates for the stop list.\n"
253 " If w and v are set without threshold, default is -t%d.\n"
254 " <dbname> 1 - 8 character database name with optional path prefix.\n")
262 else { /* argc >= 2 */
264 /* each pass grabs new token with "-xxx" format */
268 break; /* no more tokens of any kind */
271 break; /* no more option tokens */
273 /* examine each char in this -xxx token */
274 while (*(++ptr) != 0) {
/* -p<N>: the rest of the token is a percentage, accepted only when
 * greater than 0 and at most 100. */
291 percent = atof (ptr + 1);
292 if (percent <= 0.0 || percent > 100.0) {
294 catgets (dtsearch_catd, MS_dtsrkdump, 5,
295 "%s Invalid percent value %lf.\a\n"),
296 PROGNAME"195", percent);
299 ptr[1] = 0; /* terminate parse */
/* -t<N>: the rest of the token is the threshold; must be positive. */
305 if ((min_threshold = atol (ptr + 1)) <= 0L) {
307 catgets (dtsearch_catd, MS_dtsrkdump, 53,
308 "%s Invalid threshold value.\a\n"),
312 ptr[1] = 0; /* terminate parse */
/* Any other option character is reported as an error. */
317 catgets (dtsearch_catd, MS_dtsrkdump, 55,
318 "%s Unknown command line argument '%c'.\a\n"),
319 PROGNAME"278", *ptr);
322 } /* end while-loop for each char of -xxx token */
323 } /* end for-loop for each -xxx token */
324 } /* end of options parse altogether */
/* Error report for a missing database name (its condition is on a line
 * not visible here). */
328 printf (catgets (dtsearch_catd, MS_dtsrkdump, 56,
329 "%s Missing required database name.\a\n"),
/* At least one of the two key kinds must have been requested. */
333 if (!do_wordkeys && !do_objkeys) {
334 printf (catgets (dtsearch_catd, MS_dtsrkdump, 57,
335 "%s Either -o or -w must be specified.\a\n"),
342 /* Database name may have a long path prefix.
343 * If so, we need to segregate the two.
344 * Set 'ptr' to just the 8 char dictionary name by moving
345 * it backwards until first non-alphanumeric character
346 * (such as a ":" in the dos drive id or a slash between directories),
347 * or to the beginning of string.
349 strncpy (dbpath, argv[0], sizeof (dbpath));
350 dbpath[sizeof (dbpath) - 1] = 0;
351 for (ptr = dbpath + strlen (dbpath) - 1; ptr >= dbpath; ptr--)
352 if (!isalnum (*ptr)) {
359 /* test for valid database name */
/* i is assigned on a line not visible here; presumably it is the
 * length of the name at ptr, matching the "1 - 8 character" rule in
 * the usage text -- confirm. */
361 if (i < 1 || i > 8) {
362 fprintf (stderr, catgets (dtsearch_catd, MS_dtsrkdump, 58,
363 "%s Invalid database name '%s'.\a\n"),
/* Save the bare name, then cut it off so dbpath holds only the path. */
367 strcpy (dbname, ptr);
368 *ptr = 0; /* truncate dbname off of full path/dbname */
370 /* Open database in read-only mode. */
/* On failure the accumulated DtSearch messages go to stderr. */
372 if (!austext_dopen (dbname, dbpath, NULL, 0, &dbrec)) {
373 fprintf (stderr, "%s\n", DtSearchGetMessages());
/* Seed the running maximum address from the database record. */
376 maxdba = dbrec.or_maxdba;
378 printf (catgets(dtsearch_catd, MS_dtsrkdump, 60,
379 "%s: '%s' reccount=%ld maxdba=%ld recslots=%hd minw=%hd maxw=%hd\n"),
380 aa_argv0, dbname, dbrec.or_reccount,
381 dbrec.or_maxdba, dbrec.or_recslots,
382 dbrec.or_minwordsz, dbrec.or_maxwordsz);
384 /* Adjust threshold if necessary */
/* Convert the percentage into a record count. The guarding condition
 * is on a line not visible here (presumably only when -p was given --
 * confirm). The result is then capped at the record count. */
386 min_threshold = (long)
387 ((float) percent * (float) dbrec.or_reccount / 100.0);
388 if (min_threshold > dbrec.or_reccount)
389 min_threshold = dbrec.or_reccount;
/* For a verbose word listing, announce what will be listed. */
390 if (do_wordkeys && do_verbose) {
394 if (min_threshold > 1 && min_threshold < dbrec.or_reccount) {
395 printf (catgets(dtsearch_catd, MS_dtsrkdump, 70,
396 "%s Will only list words occurring "
397 "in %ld or more records.\n"),
398 aa_argv0, min_threshold);
/* Tests whether the threshold exceeds 90 percent of the record count.
 * The assignment target is on a line not visible here (presumably
 * listing_most_words -- confirm). */
400 (float) min_threshold / (float) dbrec.or_reccount > .90;
403 printf (catgets(dtsearch_catd, MS_dtsrkdump, 80,
404 "%s: Listing all words in database.\n"),
406 listing_most_words = TRUE;
413 * Allocate and initialize an array of keytype counters, one for
414 * each possible ascii keytype char (256).
/* NOTE(review): 258 longs are allocated and zeroed, two more than the
 * 256 named in the comment above. The summary loop reads slots 0..255. */
416 counters = austext_malloc (258 * sizeof(long), PROGNAME"113", NULL);
417 memset (counters, 0, 258 * sizeof(long));
/* Walk every object key, tallying by the key's first byte. */
421 KEYFRST (PROGNAME"111", OR_OBJKEY, 0);
422 while (db_status == S_OKAY) {
423 KEYREAD (PROGNAME"288", buf);
/* NOTE(review): buf's declaration is not visible here. If it is plain
 * char and char is signed, a first byte of 0x80 or above gives a
 * negative index -- confirm, and consider an unsigned char cast. */
424 (counters[buf[0]])++;
/* Compare the low 24 bits of this record's address against the running
 * maximum. The statement executed on success is not visible here. */
426 CRGET (PROGNAME"251", &dba, 0);
427 if (maxdba < (dba & 0xffffff))
431 /* Mark control and nonascii chars with a period. */
/* NOTE(review): the test below uses bitwise | where logical || is
 * conventional. The result is the same because both operands are
 * comparisons yielding 0 or 1. */
434 for (ptr = buf; *ptr != 0; ptr++) {
435 if (*ptr < 32 | *ptr >= 127) {
/* Advance i up to DtSrMAX_DB_KEYSIZE; the loop body is not visible
 * here (presumably it emits padding -- confirm). */
445 while (i++ < DtSrMAX_DB_KEYSIZE)
/* Print the address in hex and in decimal. */
448 printf (catgets(dtsearch_catd, MS_dtsrkdump, 100,
449 "dba x%08lx, %6ld\n"), dba, dba);
452 else { /* !verbose */
/* Progress indicator: triggers once per KEYS_PER_DOT keys, with extra
 * steps on every 10th and 50th trigger. The output statements are not
 * visible here. */
453 if (++keycount % KEYS_PER_DOT == 0) {
455 if (++dotcount % 10 == 0)
457 if (dotcount % 50 == 0) {
463 } /* end !verbose dot printing */
465 KEYNEXT (PROGNAME"291", OR_OBJKEY, 0);
466 } /* end object key read loop */
468 /* Print objkey summary report */
/* Builds a suffix string in buf for the heading below. The format text
 * and arguments are on lines not visible here. */
474 sprintf (buf, catgets(dtsearch_catd, MS_dtsrkdump, 110,
476 printf (catgets(dtsearch_catd, MS_dtsrkdump, 120,
477 "Object Summary for '%s'%s:\n"), dbname, buf);
478 puts (catgets(dtsearch_catd, MS_dtsrkdump, 130,
479 "Object Count by Keytypes:"));
/* One line per first-byte value that occurred. Values strictly between
 * 32 and 127 print as a character; the hex form below covers the rest
 * (the branch keyword is not visible here). Because the test is i > 32,
 * the space character (32) takes the hex form. */
481 for (i = 0; i < 256; i++) {
482 if (counters[i] > 0L) {
483 total += counters[i];
484 if (i > 32 && i < 127)
485 printf (" '%c' %6ld\n", i, counters[i]);
487 printf (" x%02x %6ld\n", i, counters[i]);
490 printf (catgets(dtsearch_catd, MS_dtsrkdump, 160,
491 "TOTAL Objects Count = %ld\n"), total);
492 printf (catgets(dtsearch_catd, MS_dtsrkdump, 170,
493 "Largest Object DBA = %ld\n"), maxdba);
495 } /* end do_objkeys */
/* Explain the asterisk marker before the word listing. */
498 if (listing_most_words)
499 printf (catgets(dtsearch_catd, MS_dtsrkdump, 180,
500 "%s: * Words marked with asterisk occur in every record.\n"),
504 * Allocate and initialize word and stem counters. First is for
505 * short stems (those beginning with STEM_CH), next is for short
506 * words (everything else). Next are for long stems, long words,
507 * huge stems, and huge words (6 in all).
/* NOTE(review): 8 longs are allocated but only the first 6 are zeroed.
 * Only slots 0..5 are read by the summary loop further down. counters
 * is overwritten here, and no release of the earlier allocation is
 * visible in this view -- confirm. */
509 counters = austext_malloc (8 * sizeof (long), PROGNAME"113", NULL);
510 memset (counters, 0, 6 * sizeof(long));
512 count_words (0); /* short */
513 count_words (2); /* long */
514 count_words (4); /* huge */
516 /* print wordkey summary report */
518 putchar ('\n'); /* separate from last report */
/* Builds a suffix string in buf for the heading below. The format text
 * and arguments are on lines not visible here. */
522 sprintf (buf, catgets(dtsearch_catd, MS_dtsrkdump, 110,
524 printf (catgets(dtsearch_catd, MS_dtsrkdump, 200,
525 "Words Summary for '%s'%s:\n"), dbname, buf);
/* Print the six counters with their labels and accumulate the total. */
527 for (i = 0; i < 6; i++) {
528 printf (word_labels[i], counters[i]);
529 total += counters[i];
531 printf (catgets(dtsearch_catd, MS_dtsrkdump, 210,
532 "TOTAL Words Count = %ld\n"), total);
534 } /* end do_wordkeys */
539 /*********************** DTSRKDUMP.C *************************/