2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these librararies and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
24 * COMPONENT_NAME: austext
26 * FUNCTIONS: build_tree
39 * (C) COPYRIGHT International Business Machines Corp. 1990,1996
41 * Licensed Materials - Property of IBM
42 * US Government Users Restricted Rights - Use, duplication or
43 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
45 /************************** HUFFCODE.C ******************************
46 * $XConsortium: huffcode.c /main/9 1996/11/14 15:31:05 rcs $
48 * Counts frequency of occurrance of every possible byte value of input text.
49 * Creates Huffman Code Table based on byte frequencies and writes it
50 * in 2 formats to 2 different output files.
51 * The encode table (.huf) maintains the frequency counts and explicitly
52 * includes the huffman code strings. Generally speaking, the .huf file
53 * is intended for humans to read. The decode table (.c) is an array
54 * of integers meant to be compiled into an object module, then linked
55 * into the decode program. The .c format closely resembles the original
56 * huffman code tree in this program.
57 * By keeping the tree as an obscure array of integers,
58 * the huffman code can double as an encryption technique,
59 * and the decoding method kept somewhat proprietary.
61 * For a good discussion of Huffman codes and related algorithms,
62 * see "Data Compression Techniques and Applications Hardware and
63 * Software Considerations" by Gilbert Held and Thomas R. Marshall.
64 * The tree itself is balanced to minimize longest bitstring length
65 * per Eugene Schwartz, Information and Control 7, 37-44 (1964).
67 * At beginning of each new execution, the program tries to
68 * open the .huf table file and continue byte frequency counting
70 * If the .huf file doesn't exist, the table's counts are
71 * initialized to zeroes. The .c decode table is recomputed fresh
72 * each run, whether it existed before or not.
74 * If the input file is not specified then the frequencies in the table
75 * are not changed, and the huffman codes are recomputed with the
76 * existing frequencies.
78 * THIS PROGRAM DOES NOT CHECK .HUF FILE FORMAT!--it had better be correct.
80 * HUFFMAN ENCODE TABLE (.huf) FILE FORMAT:
81 * Each line represents each possible byte value (0 - 255),
82 * the huffman 'literal' character (#256), or comments.
83 * There are exactly 257 lines sorted by decreasing count.
84 * There are four fields, each separated by one or more tabs (\t).
86 * 1. CHARACTER. a number from 0 to 256.
88 * The 'character' represented by the number 256 is the literal.
89 * it represents all characters whose frequency is so low that
90 * there is no huffman code translation--this reduces the max
91 * length of the coded bit string when there are lots of zero
92 * or low frequency bytes in the input. For example,
93 * pure ascii text files only occasionally have byte values
94 * less than 32 (control chars) and rarely greater than 127
95 * (high order bit turned on).
97 * 2. HUFFMAN CODE. a string of binary digits (0's and 1's).
98 * Each string is unique to that character.
99 * This field will consist of a single blank, when the character
100 * will be coded by the huffman literal. If the code of
101 * the literal itself is blank, then literal coding is
102 * not used in this table--all characters are represented
103 * by complete huffman code strings.
105 * 3. COUNT. The number of times this character appeared in the text.
106 * The literal's count equals the sum of the counts of all the
107 * real characters which are represented by the literal.
108 * The literal's count may be 0 if all the characters it
109 * represents have zero frequencies.
111 * 4. COMMENTS. A label depicting the printable char or its description.
113 * HUFFMAN DECODE TABLE (.c) FILE FORMAT:
114 * A sequence of integers formatted as a C array of integer pairs
115 * and intended to be compiled and linked into the decode program.
116 * Each huffman tree node contains two integers.
117 * The root of the tree is the LAST integer pair.
118 * The first (left) integer contains the array index down the '0' branch,
119 * the right integer points down the '1' branch.
120 * However if an integer is negative, the decoding ceases and the
121 * resulting plaintext character is the negative integer + 257,
122 * and will always be in the range 0 - 255, or 256 for the literal code.
125 * Revision 2.3 1996/03/25 18:55:04 miker
126 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
128 * Revision 2.2 1995/10/25 17:50:34 miker
131 * Revision 2.1 1995/09/22 20:46:28 miker
132 * Freeze DtSearch 0.1, AusText 2.1.8
134 * Revision 1.4 1995/09/19 22:04:11 miker
135 * Print out nonascii chars in .huf file comments.
137 * Revision 1.3 1995/09/05 18:08:00 miker
138 * Name changes for DtSearch.
145 #include <sys/stat.h>
147 #define MS_huff 30 /* message catalog set number */
148 #define DELIMITERS "\t\n" /* betw fields in .huf file */
150 #define MAX_BITLEN 24
151 #define MAX_NODES 514
153 * 256 chars + 'literal' char = max leaves, therefore max treesize
157 /*----------------------- HCTREE --------------------------*/
158 /* tree is also a table so tree ptrs are table indexes:
159 * 0 - 255 = characters themselves (leaves at base of tree).
160 * 256 = literal code (special char/leaf).
161 * > 256 = higher nodes.
162 * Global 'last_node' = highest actual node alloc so far.
163 * When tree completed, last_node = root of tree.
167 char bit; /* '0' or '1' (assoc with link to
169 long count; /* freq of occurrance of char */
170 int sort; /* specifies output sort order */
171 int father; /* index points UP toward root of
173 int son0; /* index of '0' (left) subnode */
174 int son1; /* index of '1' (right) subnode */
177 /*------------------------ GLOBALS ---------------------------*/
178 static int last_node = 256;
180 long literal_threshold = 0L;
181 int debug_switch = FALSE;
182 int literal_coding_on = TRUE;
183 int input_file_specified; /* TRUE if user enters
185 int no_huffcode_file; /* TRUE if table file not
187 HCTREE hctree1[MAX_NODES];
188 char filename_input[_POSIX_PATH_MAX];
189 char filename_huf[_POSIX_PATH_MAX];
190 char filename_huc[_POSIX_PATH_MAX];
192 #ifndef TURBO_COMPILER
193 /****************************************/
197 /****************************************/
198 static char *strrev (char *string)
204 for (i = 0, j = strlen (string) - 1; i < j; i++, j--) {
206 string[i] = string[j];
212 #endif /* !TURBO_COMPILER */
214 /****************************************/
218 /****************************************/
219 /* Each call joins the two nodes with smallest count
220 * into a single higher level node. If there are more than 2 nodes with
221 * similar 'smallest' counts, then within that group the 2 nodes with the
222 * shortest current bitstring length are joined.
223 * Returns TRUE for each successful lower level join.
224 * Returns FALSE when final join is made at highest level (root).
226 static int build_tree (void)
235 /* find 2 lowest counts */
236 for (i = 0; i < 257; i++) {
237 /* skip over real chars with counts <= 'literal' threshold */
238 if (literal_coding_on
240 && hctree1[i].count <= literal_threshold) {
241 hctree1[i].sort = MAX_BITLEN + 1;
245 /* skip over literal if literal coding turned off */
246 if (i == 256 && !literal_coding_on) {
247 hctree1[256].sort = MAX_BITLEN + 1;
252 * Ascend to highest tree level for current table entry,
253 * putting length of bitstring into sort field. Save
254 * highest tree level in curr.
257 for (j = i; j != -1; j = hctree1[j].father) {
263 * sanity checks after ascending tree: 1. if bit strings
264 * have grown too large, quit. 2. if curr points to top
267 if (hctree1[i].sort > MAX_BITLEN) {
268 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 30,
269 "\n183 Bit strings have grown too large. You probably "
270 "have literals\n turned off with grossly unbalanced "
271 "character counts.\n\7"));
274 if (hctree1[curr].count >= total_count) {
275 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 31,
276 "\n191 Programming Error: Still trying to build\n"
277 " Huffman Code Tree after root created.\n\7"));
282 * if curr ptr already joins low0 or low1, try the next
285 if (curr == low0 || curr == low1)
289 * If curr count is less than low0, or if curr count = low0
290 * but curr bitstring length is less, replace both low0 and
291 * low1. (that way, we keep low0 always <= low1)
293 if (low0 == -1 || hctree1[curr].count < hctree1[low0].count ||
294 (hctree1[curr].count == hctree1[low0].count && hctree1[i].sort < len0)) {
298 len0 = hctree1[i].sort;
303 * At this point curr count is 'greater' than low0. If curr
304 * count is less than low1, or if curr count = low1 but
305 * curr bitstring length is less, replace only low1
307 if (low1 == -1 || hctree1[curr].count < hctree1[low1].count ||
308 (hctree1[curr].count == hctree1[low1].count && hctree1[i].sort < len1)) {
310 len1 = hctree1[i].sort;
315 * default: curr count is greater than BOTH low0 and low1,
316 * try next table entry
318 } /* end loop to find two lowest counts */
320 /* low0 and low1 now point to two lowest count nodes.
321 * link in low0 and low1 to next available new node.
324 hctree1[low0].bit = '0';
325 hctree1[low0].father = last_node;
326 hctree1[low1].bit = '1';
327 hctree1[low1].father = last_node;
328 hctree1[last_node].bit = LAST_BIT;
329 hctree1[last_node].father = -1;
330 hctree1[last_node].count = hctree1[low0].count + hctree1[low1].count;
331 hctree1[last_node].son0 = low0;
332 hctree1[last_node].son1 = low1;
335 printf ("%3d: low0=%6ld\tlow1=%6ld\tsum=%6ld\t(%ld)\n",
336 last_node, hctree1[low0].count, hctree1[low1].count,
337 hctree1[last_node].count, total_count);
339 if (hctree1[last_node].count < total_count)
343 } /* end of function build_tree */
345 /****************************************/
349 /****************************************/
350 static char *char_label (int x)
358 return "\\b (backspace)";
362 return "\\n (linefeed)";
364 return "\\v (vert tab)";
366 return "\\f (form feed)";
368 return "\\r (carr retn)";
370 return "CTRL-Z (EOF)";
372 return "CTRL-[ (ESC)";
376 return "SPACE (blank)";
380 return "_ (underscore)";
384 return "*** LITERAL CODE ***";
389 sprintf (buf, "'CTRL-%c'", 0x40 | x);
393 strcpy (buf, catgets(dtsearch_catd, MS_huff, 32,
394 "(nonascii char, high bit set)"));
398 sprintf (buf, "'%c'", x);
402 } /* end of function char_label */
404 /****************************************/
406 /* Next Sorted Node */
408 /****************************************/
409 /* Called repeatedly, returns the next treebase node in sorted order.
410 * Sort order is by length of Huffman Code String.
411 * Caller must pass index of last node returned (neg at first call).
412 * Lasti should never be larger than treebase.
414 static int next_sorted_node (int lasti)
418 long nextsortval = MAX_BITLEN + 2;
420 /* permanently mark last returned node as unavailable */
422 hctree1[lasti].sort = MAX_BITLEN + 2;
424 /* find next shortest string length */
425 for (i = 0; i < 257; i++)
426 if (hctree1[i].sort < nextsortval) {
427 nextsortval = hctree1[i].sort;
431 } /* end of function next_sorted_node */
433 /****************************************/
435 /* Initialize Treebase */
437 /****************************************/
438 /* 'Treebase' is original 257 character nodes (including literal code).
439 * If huffcode table file exists, initializes treebase with its values,
440 * else initializes treebase with zero counts.
442 static void init_treebase (void)
450 /* .huf table file does not exist--zero all counts */
451 if ((instream_huf = fopen (filename_huf, "r")) == NULL) {
452 no_huffcode_file = TRUE;
453 for (i = 0; i < 257; i++) {
454 hctree1[i].bit = LAST_BIT;
455 hctree1[i].count = 0L;
456 hctree1[i].father = -1;
457 hctree1[i].son0 = -1;
458 hctree1[i].son1 = -1;
462 /* Table file exists--init treebase with values from file.
463 * We are only interested in the character itself (i),
464 * and its current count. All other fields will be recreated
465 * at output time. FILE FORMAT IS NOT CHECKED--IT HAD BETTER BE CORRECT!
468 no_huffcode_file = FALSE;
469 fgets (filebuf, sizeof (filebuf) - 1, instream_huf);
470 /* discard this first line (don't need id stamp) */
471 while (fgets (filebuf, sizeof (filebuf) - 1, instream_huf)
473 i = atoi (strtok (filebuf, DELIMITERS)); /* char */
474 if (i < 0 || i > 256) {
475 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 33,
476 "366 Invalid file format for %s.\n"),
480 strtok (NULL, DELIMITERS); /* skip over current huff
482 hctree1[i].count = (i == 256) ?
483 0L : atol (strtok (NULL, DELIMITERS));
484 hctree1[i].bit = LAST_BIT;
485 hctree1[i].father = -1;
486 hctree1[i].son0 = -1;
487 hctree1[i].son1 = -1;
489 total_count += hctree1[i].count;
490 } /* endwhile loop that reads each table line */
491 fclose (instream_huf);
494 } /* end of function init_treebase */
496 /****************************************/
500 /****************************************/
501 /* determines correct huffman code based on current counts in tree,
502 * writes out all to both files overlaying previous values if they existed.
504 static void huffman_code (time_t idstamp)
506 int i; /* current char */
508 int j; /* ascends tree from i to build bit_string */
509 char bit_string[MAX_BITLEN + 4];
515 /* establish the 'literal' node (char #256) count
516 * equal to sum of all chars whose counts are less than threshold.
518 if (literal_coding_on) {
519 hctree1[256].count = 0L;
520 for (i = 0; i < 256; i++)
521 if (hctree1[i].count <= literal_threshold)
522 hctree1[256].count += hctree1[i].count;
525 /* build the Huffman Code tree, and determine root (last_node) */
526 while (build_tree ());
528 /* now that we know the total number of tree nodes (last_node),
529 * we are ready to write.
530 * Open both output files and verify they are not write protected.
532 if ((outstream_huc = fopen (filename_huc, "w")) == NULL) {
533 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 34,
534 "424 File '%s' failed to open for write. Is it read-only?\n"),
538 if ((outstream_huf = fopen (filename_huf, "w")) == NULL) {
539 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 34,
540 "439 File '%s' failed to open for write. Is it read-only?\n"),
545 /* create the .c decode file (tree as integer array) */
546 fprintf (outstream_huc,
547 "#include <time.h>\n"
548 "char *hctree_name =\t\"%s\";\n"
549 "time_t hctree_id =\t%ldL;\n"
550 "int hctree_root =\t%d;\n"
551 "static int hctree_array[] = {\n",
552 filename_huc, idstamp, last_node - 257);
553 for (i = 257; i <= last_node; i++) {
554 fprintf (outstream_huc, "\t%4d,\t%4d%c\t/* %3d */\n",
555 hctree1[i].son0 - 257, hctree1[i].son1 - 257,
556 (i == last_node) ? ' ' : ',', /* no comma after last
558 i - 257); /* comment contains node number */
560 fprintf (outstream_huc, "\t};\nint *hctree =\thctree_array;\n");
561 fclose (outstream_huc);
563 /* write out the tree base (0-256) in sorted order to .huf file */
564 fprintf (outstream_huf, "%ld\tHCTREE_ID\n", idstamp);
565 for (lasti = -1; (i = next_sorted_node (lasti)) >= 0; lasti = i) {
567 * Create huffman code digit string. j ascends tree from i
568 * to build string in reverse order.
571 for (j = i; j != -1; j = hctree1[j].father)
572 *bitptr++ = hctree1[j].bit;
573 *bitptr = '\0'; /* terminate reversed string */
574 strrev (bit_string); /* reverse the string order */
575 if (bit_string[1] == 0)
576 strcpy (bit_string, " ");
577 if (strlen (bit_string) < 9)
578 strcat (bit_string, "\t");
580 /* write out the line for this char */
581 sprintf (sprintbuf, "%d\t%s\t%ld\t%s\n",
583 bit_string + 1, /* hop over LAST_BIT */
586 fprintf (outstream_huf, sprintbuf);
588 } /* end forloop printing out each tree base entry */
590 fclose (outstream_huf);
592 } /* end of function huffman_code */
594 /****************************************/
598 /****************************************/
599 static void print_usage (void)
601 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 35,
602 "USAGE: huffcode [-lN | -l-] [-o] <huffname> [<infile>]\n"
603 " -l<N> specifies the 'literal' threshold count. Any character occurring\n"
604 " <= <N> times will be coded with the Huffman literal. Default is -l0,\n"
605 " literal coding only for bytes with counts of zero.\n"
606 " -l- turns off literal coding. Turning off literal coding in unbalanced\n"
607 " trees leads to EXTREMELY LONG bit string codes--don't do it unless\n"
608 " the input is known to be a well balanced binary file.\n"
609 " -o preauthorizes overwriting any currently existing decode file.\n"
610 " <huffname> is the filename prefix for the Huffman Code files.\n"
611 " If the encode file (%s) already exists, byte counts from infile will\n"
612 " be added to it, otherwise it will be newly created.\n"
613 " The decode file (%s) is always newly created each run.\n"
614 " <infile> is an input file containing bytes to be counted.\n"
615 " It may be omitted if the encode file already exists.\n"),
616 EXT_HUFFCODE, EXT_HDECODE);
618 } /* end of function print_usage */
620 /********************************************************/
622 /* USER_ARGS_PROCESSOR */
624 /********************************************************/
625 /* handles command line arguments for 'main' */
626 static void user_args_processor (int argc, char **argv)
629 int OK_to_overwrite = FALSE;
632 if (argc <= 1) { /* user just wants to see usage msg */
637 /* each pass grabs new parm of "-xxx" format */
638 while (--argc > 0 && (*++argv)[0] == '-') {
640 argptr[1] = tolower (argptr[1]);
642 case 'l': /* literal threshold */
645 else if (argptr[2] == '-')
646 literal_coding_on = FALSE;
648 literal_threshold = atoi (argptr + 2);
651 case 'o': /* OK_to_overwrite .c file if it already
653 OK_to_overwrite = TRUE;
656 case 'v': /* verbose mode = debug switch */
662 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 36,
663 "'%s' is invalid argument.\n"), argptr);
665 exit (2); /* ABORT program */
668 } /* endwhile for cmd line '-'processing */
670 /* test for required tree file name */
672 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 37,
673 "576 Missing Huffman Code file names prefix.\n"));
677 /* create 2 output file names from passed argument */
678 strncpy (filename_huf, argv[0], _POSIX_PATH_MAX);
679 filename_huf[_POSIX_PATH_MAX - 6] = 0;
680 strcat (filename_huf, EXT_HUFFCODE);
681 strncpy (filename_huc, argv[0], _POSIX_PATH_MAX);
682 filename_huc[_POSIX_PATH_MAX - 6] = 0;
683 strcat (filename_huc, EXT_HDECODE);
685 /* Since the decode file is a C source code file (.c extension),
686 * we want to be sure not to erase somebody's source program.
687 * So if the .c file already exists, and the user didn't specify
688 * overwrite in a command line argument, ask him now if it's OK to
689 * blow away the old file.
691 if (!OK_to_overwrite)
692 if ((stream = fopen (filename_huc, "r")) != NULL) {
694 printf (catgets(dtsearch_catd, MS_huff, 38,
695 "Decode file '%s' already exists. "
696 "Is it OK to overwrite it? [y/n] "),
698 if (toupper (getchar ()) != 'Y')
702 /* test for optional input file name */
704 input_file_specified = FALSE;
706 input_file_specified = TRUE;
707 strncpy (filename_input, argv[1], _POSIX_PATH_MAX);
708 filename_input[_POSIX_PATH_MAX - 1] = 0;
712 } /* end of function user_args_processor */
714 /****************************************/
718 /****************************************/
719 int main (int argc, char *argv[])
722 struct stat fstat_input;
725 time_t now, start_stamp;
727 setlocale (LC_ALL, "");
728 dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
729 printf (catgets(dtsearch_catd, MS_huff, 40,
730 "HUFFCODE Version %s\n"), AUSAPI_VERSION);
732 /* validate user's command line arguments */
733 user_args_processor (argc, argv);
735 /* initialize tree table, using the table file if it exists */
737 if (total_count == 0L)
738 printf (catgets(dtsearch_catd, MS_huff, 41,
739 "Huffman Code Tables will be newly created.\n"));
741 printf (catgets(dtsearch_catd, MS_huff, 42,
742 "Table '%s' already contains %ld Kbytes from previous runs.\n"),
743 filename_huf, total_count / 1000L);
745 if (!input_file_specified && no_huffcode_file) {
746 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 43,
747 "645 Input file not specified and '%s' table file\n"
748 " doesn't exist--nothing to do!\n"),
754 /* read the input file and count its bytes */
755 if (input_file_specified) {
756 if ((instream = fopen (filename_input, "rb")) == NULL) {
758 fprintf (stderr, catgets(dtsearch_catd, MS_huff, 44,
759 "Could not open input file '%s' or access status: %s\n"),
760 filename_input, strerror (errno));
763 if (fstat (fileno (instream), &fstat_input) == -1)
765 printf (catgets(dtsearch_catd, MS_huff, 45,
766 "Input file '%s' contains about %ld Kbytes.\n"),
767 filename_input, fstat_input.st_size / 1000L);
770 while ((mychar = getc (instream)) != EOF) {
771 hctree1[mychar].count++;
774 /* echo progress to user every so often */
775 if (!(++bytes_in % 10000L))
776 printf (catgets(dtsearch_catd, MS_huff, 46,
777 "\r%ld%% done. %2ld Kbytes read. "
778 "Estimate %3ld seconds to completion. "),
779 (bytes_in * 100L) / fstat_input.st_size,
781 (fstat_input.st_size - bytes_in) *
782 (time (NULL) - start_stamp) / bytes_in);
783 } /* end read loop for each char in input file */
787 } /* endif that processes input file */
789 /* build huffman code tree, write out files */
790 time (&now); /* this will be the official tree id time
792 printf (catgets(dtsearch_catd, MS_huff, 47,
793 "Identifying timestamp will be '%ld'.\n"
794 "%s Huffman Code Tables in '%s' and '%s'..."),
797 catgets(dtsearch_catd, MS_huff, 48, "Creating") :
798 catgets(dtsearch_catd, MS_huff, 49, "Rebuilding"),
804 } /* end of function main */
806 /************************** HUFFCODE.C ******************************/