1 /* $XConsortium: lzss.cc /main/5 1996/07/18 16:00:08 drk $ */
3 #include "compression/lzss.h"
7 Adapted from LDS (lossless datacompression sources) Version 1.1 by
14 /**************************************************************
15 LZSS.C -- A Data Compression Program
17 ***************************************************************
18 4/6/1989 Haruhiko Okumura
19 Use, distribute, and modify this program freely.
20 Please send me your improved versions.
24 **************************************************************/
30 #define N 4096 /* size of ring buffer */
31 #define F 18 /* upper limit for match_length */
32 #define THRESHOLD 2 /* encode string into position and length
33 if match_length is greater than this */
34 #define NIL N /* index for root of binary search trees */
37 textsize = 0, /* text size counter */
38 codesize = 0, /* code size counter */
39 printcount = 0; /* counter for reporting progress every 1K bytes */
41 text_buf[N + F - 1]; /* ring buffer of size N,
42 with extra F-1 bytes to facilitate string comparison */
43 int match_position, match_length, /* of longest match. These are
44 set by the InsertNode() procedure. */
45 lson[N + 1], rson[N + 257], dad[N + 1]; /* left & right children &
46 parents -- These constitute binary search trees. */
47 //FILE *infile, *outfile; /* input & output files */
49 void InitTree(void) /* initialize trees */
53 /* For i = 0 to N - 1, rson[i] and lson[i] will be the right and
54 left children of node i. These nodes need not be initialized.
55 Also, dad[i] is the parent of node i. These are initialized to
56 NIL (= N), which stands for 'not used.'
57 For i = 0 to 255, rson[N + i + 1] is the root of the tree
58 for strings that begin with character i. These are initialized
59 to NIL. Note there are 256 trees. */
61 for (i = N + 1; i <= N + 256; i++) rson[i] = NIL;
62 for (i = 0; i < N; i++) dad[i] = NIL;
65 void InsertNode(int r)
66 /* Inserts string of length F, text_buf[r..r+F-1], into one of the
67 trees (text_buf[r]'th tree) and returns the longest-match position
68 and length via the global variables match_position and match_length.
69 If match_length = F, then removes the old node in favor of the new
70 one, because the old one will be deleted sooner.
71 Note r plays double role, as tree node and position in buffer. */
76 cmp = 1; key = &text_buf[r]; p = N + 1 + key[0];
77 rson[r] = lson[r] = NIL; match_length = 0;
80 if (rson[p] != NIL) p = rson[p];
81 else { rson[p] = r; dad[r] = p; return; }
83 if (lson[p] != NIL) p = lson[p];
84 else { lson[p] = r; dad[r] = p; return; }
86 for (i = 1; i < F; i++)
87 if ((cmp = key[i] - text_buf[p + i]) != 0) break;
88 if (i > match_length) {
90 if ((match_length = i) >= F) break;
93 dad[r] = dad[p]; lson[r] = lson[p]; rson[r] = rson[p];
94 dad[lson[p]] = r; dad[rson[p]] = r;
95 if (rson[dad[p]] == p) rson[dad[p]] = r;
96 else lson[dad[p]] = r;
97 dad[p] = NIL; /* remove p */
100 void DeleteNode(int p) /* deletes node p from tree */
104 if (dad[p] == NIL) return; /* not in tree */
105 if (rson[p] == NIL) q = lson[p];
106 else if (lson[p] == NIL) q = rson[p];
109 if (rson[q] != NIL) {
110 do { q = rson[q]; } while (rson[q] != NIL);
111 rson[dad[q]] = lson[q]; dad[lson[q]] = dad[q];
112 lson[q] = lson[p]; dad[lson[p]] = q;
114 rson[q] = rson[p]; dad[rson[p]] = q;
117 if (rson[dad[p]] == p) rson[dad[p]] = q; else lson[dad[p]] = q;
121 void lzss::compress(const buffer& uncompressed, buffer& compressed)
123 if ( compressed.buf_sz() < uncompressed.buf_sz() )
124 compressed.expand_chunk(uncompressed.buf_sz());
126 ////////////////////////////////////////////
127 ////////////////////////////////////////////
128 int i, c, len, r, s, last_match_length, code_buf_ptr;
129 unsigned char code_buf[17], mask;
131 InitTree(); /* initialize trees */
132 code_buf[0] = 0; /* code_buf[1..16] saves eight units of code, and
133 code_buf[0] works as eight flags, "1" representing that the unit
134 is an unencoded letter (1 byte), "0" a position-and-length pair
135 (2 bytes). Thus, eight units require at most 16 bytes of code. */
137 code_buf_ptr = mask = 1;
139 for (i = s; i < r; i++) text_buf[i] = ' '; /* Clear the buffer with
140 any character that will appear often. */
142 char* unc_str = uncompressed.get_base();
143 int unc_str_len = uncompressed.content_sz();
146 for (len = 0; len < F; len++) {
148 if ( unc_str_ptr == unc_str_len )
151 c = unc_str[unc_str_ptr++];
154 text_buf[r + len] = c; /* Read F bytes into the last F bytes of
158 if ((textsize = len) == 0) return; /* text of size zero */
160 for (i = 1; i <= F; i++) InsertNode(r - i); /* Insert the F strings,
161 each of which begins with one or more 'space' characters. Note
162 the order in which these strings are inserted. This way,
163 degenerate trees will be less likely to occur. */
165 InsertNode(r); /* Finally, insert the whole string just read. The
166 global variables match_length and match_position are set. */
169 if (match_length > len) match_length = len; /* match_length
170 may be spuriously long near the end of text. */
173 if (match_length <= THRESHOLD) {
175 match_length = 1; /* Not long enough match. Send one byte. */
176 code_buf[0] |= mask; /* 'send one byte' flag */
177 code_buf[code_buf_ptr++] = text_buf[r]; /* Send uncoded. */
181 code_buf[code_buf_ptr++] = (unsigned char) match_position;
182 code_buf[code_buf_ptr++] = (unsigned char)
183 (((match_position >> 4) & 0xf0)
184 | (match_length - (THRESHOLD + 1))); /* Send position and
187 length pair. Note match_length > THRESHOLD. */
190 if ((mask <<= 1) == 0) { /* Shift mask left one bit. */
193 // for (i = 0; i < code_buf_ptr; i++) { /* Send at most 8 units of */
194 // putc(code_buf[i], outfile); /* code together */
197 compressed.put((char*)code_buf, code_buf_ptr, true);
199 codesize += code_buf_ptr;
200 code_buf[0] = 0; code_buf_ptr = mask = 1;
203 last_match_length = match_length;
205 for (i = 0; i < last_match_length; i++) {
207 if ( unc_str_ptr == unc_str_len )
210 c = unc_str[unc_str_ptr++];
212 DeleteNode(s); /* Delete old strings and */
213 text_buf[s] = c; /* read new bytes */
215 if (s < F - 1) text_buf[s + N] = c; /* If the position is
216 near the end of buffer, extend the buffer to make
217 string comparison easier. */
219 //s = (s + 1) & (N - 1); r = (r + 1) & (N - 1);
220 s++; s &= (N - 1); r++; r &= (N - 1);
221 /* Since this is a ring buffer, increment the position
224 InsertNode(r); /* Register the string in text_buf[r..r+F-1] */
227 // if ((textsize += i) > printcount) {
228 // printf("%12ld\r", textsize); printcount += 1024;
229 // /* Reports progress each time the textsize exceeds
230 // multiples of 1024. */
233 while (i++ < last_match_length) {/* After the end of text, */
234 DeleteNode(s); /* no need to read, but */
236 //s = (s + 1) & (N - 1); r = (r + 1) & (N - 1);
237 s++; s &= (N - 1); r++; r &= (N - 1);
239 if (--len) InsertNode(r); /* buffer may not be empty. */
242 } while (len > 0); /* until length of string to be processed is zero */
244 if (code_buf_ptr > 1) { /* Send remaining code. */
246 // for (i = 0; i < code_buf_ptr; i++) {
247 // //putc(code_buf[i], outfile);
248 // compressed.put(code_buf[i], true);
251 compressed.put((char*)code_buf, code_buf_ptr, true);
252 codesize += code_buf_ptr;
256 //printf("In : %ld bytes\n", textsize); /* Encoding is done. */
257 //printf("Out: %ld bytes\n", codesize);
258 //printf("Out/In: %.3f\n", (double)codesize / textsize);
261 void lzss::decompress(buffer& compressed, buffer& uncompressed)
266 for (i = 0; i < N - F; i++) text_buf[i] = ' ';
267 r = N - F; flags = 0;
269 if (((flags >>= 1) & 256) == 0) {
271 // if ((c = getc(infile)) == EOF) break;
273 if ( compressed.content_sz() == 0 )
276 compressed.getusc(c);
278 flags = c | 0xff00; /* uses higher byte cleverly */
279 } /* to count eight */
282 //if ((c = getc(infile)) == EOF) break;
283 //putc(c, outfile); text_buf[r++] = c; r &= (N - 1);
285 if ( compressed.content_sz() == 0 )
288 compressed.getusc(c);
290 //debug(cerr, char(c));
291 uncompressed.put(char(c), true); text_buf[r++] = c; r &= (N - 1);
294 //if ((i = getc(infile)) == EOF) break;
295 //if ((j = getc(infile)) == EOF) break;
297 if ( compressed.content_sz() == 0 )
300 compressed.getusc(i);
303 if ( compressed.content_sz() == 0 )
306 compressed.getusc(j);
309 i |= ((j & 0xf0) << 4); j = (j & 0x0f) + THRESHOLD;
311 for (k = 0; k <= j; k++) {
312 c = text_buf[(i + k) & (N - 1)];
315 //debug(cerr, char(c));
316 uncompressed.put(char(c), true);
318 text_buf[r++] = c; r &= (N - 1);
327 io_status lzss::build_dict(lex_func_t, getchar_func_t)