2 * COMPONENT_NAME: austext
4 * FUNCTIONS: dump_hashtab
11 * (C) COPYRIGHT International Business Machines Corp. 1993,1995
13 * Licensed Materials - Property of IBM
14 * US Government Users Restricted Rights - Use, duplication or
15 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
17 /******************* ISDUPREC.C *******************
18 * $XConsortium: isduprec.c /main/5 1996/05/07 13:37:35 drk $
20 * Is_duprec() returns 0 (FALSE) for every record id it is passed
21 * unless one is passed that duplicates a previous one,
22 * in which case it returns 1 (TRUE).
23 * It ensures that duplicate record ids in an .fzk file
24 * are not processed by either ravel or borodin.
25 * It does it by storing each recid into a hash table and
26 * searching the table before storing a new recid.
27 * Returns 2 on errors (malloc out of space, etc);
29 * Global 'duprec_hashsize' can be changed to any rational value
30 * for a hash table size (say 1000 to 30,000) prior to the first call
31 * of is_duprec(). It should be roughly => to the total number of
32 * different record ids expected to be passed to is_duprec().
33 * If initialized to 0 before the first call, that will disable
34 * duplicate checking, i.e. is_duprec() will allocate no memory
35 * and always return 0.
38 * Revision 2.2 1995/10/25 17:22:48 miker
41 * Revision 2.1 1995/09/22 20:56:44 miker
42 * Freeze DtSearch 0.1, AusText 2.1.8
44 * Revision 1.3 1995/09/05 18:11:45 miker
45 * Minor changes so ansi c compilers won't whine.
55 #define PROGNAME "ISDUPREC"
56 #define HASHSIZE 3000L
61 unsigned long duprec_hashsize = HASHSIZE;
63 /************************************************/
67 /************************************************/
68 /* The hash table is a HASHSIZE array of pointers to these structures.
69 * Each pointer is initialized to NULL.
70 * Additions are handled by filling in a HASHNODE pointed to
71 * by the table pointer. The 'recid' is NOT a char array of length
72 * 1, but a string whose length varies depending on the actual
73 * length of the passed record id. Each hashnode is malloced
74 * for exactly the right length. Collisions are handled by linking
75 * additional nodes off of the original one.
77 typedef struct hash_tag {
78 struct hash_tag *link;
79 char recid[2]; /* actual array size varies */
84 /************************************************/
88 /************************************************/
89 /* For debugging, prints out all recids in hashtab, skipping empty bkts */
90 static void dump_hashtab (HASHNODE ** hashtab)
94 printf (PROGNAME "67 dump_hashtab(%p):\n", hashtab);
95 for (i = 0, hpp = hashtab; i < duprec_hashsize; i++, hpp++) {
99 for (hp = *hpp; hp != NULL; hp = hp->link)
100 printf (" '%s'", hp->recid);
106 } /* dump_hashtab() */
111 /************************************************/
115 /************************************************/
116 /* Normal return is 0 indicating that passed record id is unique.
117 * Also immediately returns 0 if duplicate checking has been
118 * turned off by setting global 'duprec_hashsize' to zero.
119 * Returns 1 if record id is a duplicate.
120 * Returns 2 if out of memory.
121 * First call uses 'duprec_hashsize' to create hash table.
123 int is_duprec (char *recid)
125 static HASHNODE **hashtab = NULL;
126 static unsigned long primes[10] =
127 {1013, 1511, 2203, 3511, 5003, 10007, 15013, 20011, 25013, 30001};
134 if (duprec_hashsize == 0UL)
137 /* Generate hash table at first call only */
138 if (hashtab == NULL) {
140 * adjust table size upward to nearest preordained prime
143 for (i = 0; i < 9 && primes[i] < duprec_hashsize; i++);
144 duprec_hashsize = primes[i];
146 printf (PROGNAME "117 Create hash table, duprec_hashsize set = %ld.\n",
150 hashtab = malloc ((duprec_hashsize + 2L) * sizeof (HASHNODE **));
154 /* init table to all NULL pointers. */
156 for (i = duprec_hashsize + 2L; i > 0L; i--)
160 /*****dump_hashtab(hashtab);******/
162 /* HASH FUNCTION: H(recid) = (SUM(i*recid[i])) mod M,
163 * where M is table size (prime), and SUM is calculated
164 * for i=1 to end of recid. Multiplying the position by the character
165 * value at that position minimizes the influence of identical
166 * characters at the beginnings and ends of recids,
167 * and also usually yields a number larger than M.
168 * Not skipping over the first position (the keytype char) helps
169 * efficiently catch recids that are blank after the keytype.
175 sum += i++ * (*cp++);
176 hpp = &(hashtab[sum % duprec_hashsize]); /* hpp = head of linked
180 printf (PROGNAME "150 is_duprec('%s')=hashtab[%lu]=%p: ",
181 recid, sum % duprec_hashsize, *hpp);
186 /* Search linked list (if any) for hashnode containing recid */
187 for (hp = *hpp; hp != NULL; hp = hp->link) {
192 if (strcmp (hp->recid, recid) == 0) {
194 printf ("DUP!@listpos=%d\n", i);
198 hpp = &hp->link; /* now hpp = tail of linked list */
201 printf ("miss@listlen=%d\n", i);
204 /* Not a duplicate. Add current recid to hash table. */
205 if ((hp = malloc (sizeof (HASHNODE) + strlen (recid) + 2)) == NULL)
207 strcpy (hp->recid, recid);
209 /*****hp->link = *hpp;******/
216 /************************************************/
220 /************************************************/
221 main (int argc, char *argv[])
228 printf ("USAGE: %s <file> [n]\n"
229 "where file contains list of char strings\n"
230 "and optional n changes hash table size.\n",
234 if ((f = fopen (argv[1], "r")) == NULL) {
235 printf ("Can't open %s: %s\n", argv[1], strerror (errno));
239 duprec_hashsize = atol (argv[2]);
241 while (fgets (buf, sizeof (buf), f) != NULL) {
242 buf[sizeof (buf) - 1] = 0;
244 printf ("%s", buf); /* each buf should end in \n */
252 /******************* ISDUPREC.C *******************/