00001 /******************************************************************************
00002 * lzsscomprs.cpp - code for class 'LZSSCompress'- a driver class that
00003 * provides LZSS compression
00004 */
00005
00006 #include <string.h>
00007 #include <stdlib.h>
00008 #include <lzsscomprs.h>
00009
00010
00011 /******************************************************************************
00012 * LZSSCompress Statics
00013 */
00014
00015 // m_ring_buffer is a text buffer. It contains "nodes" of
00016 // uncompressed text that can be indexed by position. That is,
00017 // a substring of the ring buffer can be indexed by a position
00018 // and a length. When decoding, the compressed text may contain
00019 // a position in the ring buffer and a count of the number of
00020 // bytes from the ring buffer that are to be moved into the
00021 // uncompressed buffer.
00022 //
00023 // This ring buffer is not maintained as part of the compressed
00024 // text. Instead, it is reconstructed dynamically. That is,
00025 // it starts out empty and gets built as the text is decompressed.
00026 //
00027 // The ring buffer contain N bytes, with an additional F - 1 bytes
00028 // to facilitate string comparison.
00029
00030 unsigned char LZSSCompress::m_ring_buffer[N + F - 1];
00031
00032 // m_match_position and m_match_length are set by InsertNode().
00033 //
00034 // These variables indicate the position in the ring buffer
00035 // and the number of characters at that position that match
00036 // a given string.
00037
00038 short int LZSSCompress::m_match_position;
00039 short int LZSSCompress::m_match_length;
00040
00041 // m_lson, m_rson, and m_dad are the Japanese way of referring to
00042 // a tree structure. The dad is the parent and it has a right and
00043 // left son (child).
00044 //
00045 // For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right
00046 // and left children of node i.
00047 //
00048 // For i = 0 to N-1, m_dad[i] is the parent of node i.
00049 //
00050 // For i = 0 to 255, rson[N + i + 1] is the root of the tree for
00051 // strings that begin with the character i. Note that this requires
00052 // one byte characters.
00053 //
00054 // These nodes store values of 0...(N-1). Memory requirements
00055 // can be reduces by using 2-byte integers instead of full 4-byte
00056 // integers (for 32-bit applications). Therefore, these are
00057 // defined as "short ints."
00058
00059 short int LZSSCompress::m_lson[N + 1];
00060 short int LZSSCompress::m_rson[N + 257];
00061 short int LZSSCompress::m_dad[N + 1];
00062
00063
00064 /******************************************************************************
00065 * LZSSCompress Constructor - Initializes data for instance of LZSSCompress
00066 *
00067 */
00068
00069 LZSSCompress::LZSSCompress() : SWCompress() {
00070 }
00071
00072
00073 /******************************************************************************
00074 * LZSSCompress Destructor - Cleans up instance of LZSSCompress
00075 */
00076
00077 LZSSCompress::~LZSSCompress() {
00078 }
00079
00080
00081 /******************************************************************************
00082 * LZSSCompress::InitTree - This function initializes the tree nodes to
00083 * "empty" states.
00084 */
00085
00086 void LZSSCompress::InitTree(void) {
00087 int i;
00088
00089 // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right
00090 // and left children of node i. These nodes need not be
00091 // initialized. However, for debugging purposes, it is nice to
00092 // have them initialized. Since this is only used for compression
00093 // (not decompression), I don't mind spending the time to do it.
00094 //
00095 // For the same range of i, m_dad[i] is the parent of node i.
00096 // These are initialized to a known value that can represent
00097 // a "not used" state.
00098
00099 for (i = 0; i < N; i++) {
00100 m_lson[i] = NOT_USED;
00101 m_rson[i] = NOT_USED;
00102 m_dad[i] = NOT_USED;
00103 }
00104
00105 // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree
00106 // for strings that begin with the character i. This is why
00107 // the right child array is larger than the left child array.
00108 // These are also initialzied to a "not used" state.
00109 //
00110 // Note that there are 256 of these, one for each of the possible
00111 // 256 characters.
00112
00113 for (i = N + 1; i <= (N + 256); i++) {
00114 m_rson[i] = NOT_USED;
00115 }
00116 }
00117
00118
00119 /******************************************************************************
00120 * LZSSCompress::InsertNode - This function inserts a string from the ring
00121 * buffer into one of the trees. It loads the
00122 * match position and length member variables
00123 * for the longest match.
00124 *
00125 * The string to be inserted is identified by
00126 * the parameter Pos, A full F bytes are
00127 * inserted. So,
00128 * m_ring_buffer[Pos ... Pos+F-1]
00129 * are inserted.
00130 *
00131 * If the matched length is exactly F, then an
00132 * old node is removed in favor of the new one
00133 * (because the old one will be deleted
00134 * sooner).
00135 *
00136 * Note that Pos plays a dual role. It is
00137 * used as both a position in the ring buffer
00138 * and also as a tree node.
00139 * m_ring_buffer[Pos] defines a character that
00140 * is used to identify a tree node.
00141 *
00142 * ENT: pos - position in the buffer
00143 */
00144
00145 void LZSSCompress::InsertNode(short int Pos)
00146 {
00147 short int i;
00148 short int p;
00149 int cmp;
00150 unsigned char * key;
00151
00152 /*
00153 ASSERT(Pos >= 0);
00154 ASSERT(Pos < N);
00155 */
00156
00157 cmp = 1;
00158 key = &(m_ring_buffer[Pos]);
00159
00160 // The last 256 entries in m_rson contain the root nodes for
00161 // strings that begin with a letter. Get an index for the
00162 // first letter in this string.
00163
00164 p = (short int) (N + 1 + key[0]);
00165
00166 // Set the left and right tree nodes for this position to "not
00167 // used."
00168
00169 m_lson[Pos] = NOT_USED;
00170 m_rson[Pos] = NOT_USED;
00171
00172 // Haven't matched anything yet.
00173
00174 m_match_length = 0;
00175
00176 for ( ; ; ) {
00177 if (cmp >= 0) {
00178 if (m_rson[p] != NOT_USED) {
00179 p = m_rson[p];
00180 }
00181 else {
00182 m_rson[p] = Pos;
00183 m_dad[Pos] = p;
00184 return;
00185 }
00186 }
00187 else {
00188 if (m_lson[p] != NOT_USED) {
00189 p = m_lson[p];
00190 }
00191 else {
00192 m_lson[p] = Pos;
00193 m_dad[Pos] = p;
00194 return;
00195 }
00196 }
00197
00198 // Should we go to the right or the left to look for the
00199 // next match?
00200
00201 for (i = 1; i < F; i++) {
00202 cmp = key[i] - m_ring_buffer[p + i];
00203 if (cmp != 0)
00204 break;
00205 }
00206
00207 if (i > m_match_length) {
00208 m_match_position = p;
00209 m_match_length = i;
00210
00211 if (i >= F)
00212 break;
00213 }
00214 }
00215
00216 m_dad[Pos] = m_dad[p];
00217 m_lson[Pos] = m_lson[p];
00218 m_rson[Pos] = m_rson[p];
00219
00220 m_dad[ m_lson[p] ] = Pos;
00221 m_dad[ m_rson[p] ] = Pos;
00222
00223 if (m_rson[ m_dad[p] ] == p) {
00224 m_rson[ m_dad[p] ] = Pos;
00225 }
00226 else {
00227 m_lson[ m_dad[p] ] = Pos;
00228 }
00229
00230 // Remove "p"
00231
00232 m_dad[p] = NOT_USED;
00233 }
00234
00235
00236 /******************************************************************************
00237 * LZSSCompress::DeleteNode - This function removes the node "Node" from the
00238 * tree.
00239 *
00240 * ENT: node - node to be removed
00241 */
00242
00243 void LZSSCompress::DeleteNode(short int Node)
00244 {
00245 short int q;
00246
00247 /*
00248 ASSERT(Node >= 0);
00249 ASSERT(Node < (N+1));
00250 */
00251
00252 if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do
00253 return;
00254 }
00255
00256 if (m_rson[Node] == NOT_USED) {
00257 q = m_lson[Node];
00258 }
00259 else if (m_lson[Node] == NOT_USED) {
00260 q = m_rson[Node];
00261 }
00262 else {
00263 q = m_lson[Node];
00264 if (m_rson[q] != NOT_USED) {
00265 do {
00266 q = m_rson[q];
00267 } while (m_rson[q] != NOT_USED);
00268
00269 m_rson[ m_dad[q] ] = m_lson[q];
00270 m_dad[ m_lson[q] ] = m_dad[q];
00271 m_lson[q] = m_lson[Node];
00272 m_dad[ m_lson[Node] ] = q;
00273 }
00274
00275 m_rson[q] = m_rson[Node];
00276 m_dad[ m_rson[Node] ] = q;
00277 }
00278
00279 m_dad[q] = m_dad[Node];
00280
00281 if (m_rson[ m_dad[Node] ] == Node) {
00282 m_rson[ m_dad[Node] ] = q;
00283 }
00284 else {
00285 m_lson[ m_dad[Node] ] = q;
00286 }
00287
00288 m_dad[Node] = NOT_USED;
00289 }
00290
00291
00292 /******************************************************************************
00293 * LZSSCompress::Encode - This function "encodes" the input stream into the
00294 * output stream.
00295 * The GetChars() and SendChars() functions are
00296 * used to separate this method from the actual
00297 * i/o.
00298 * NOTE: must set zlen for parent class to know length of
00299 * compressed buffer.
00300 */
00301
00302 void LZSSCompress::Encode(void)
00303 {
00304 short int i; // an iterator
00305 short int r; // node number in the binary tree
00306 short int s; // position in the ring buffer
00307 unsigned short int len; // len of initial string
00308 short int last_match_length; // length of last match
00309 short int code_buf_pos; // position in the output buffer
00310 unsigned char code_buf[17]; // the output buffer
00311 unsigned char mask; // bit mask for byte 0 of out buf
00312 unsigned char c; // character read from string
00313
00314 // Start with a clean tree.
00315
00316 InitTree();
00317 direct = 0; // set direction needed by parent [Get|Send]Chars()
00318
00319 // code_buf[0] works as eight flags. A "1" represents that the
00320 // unit is an unencoded letter (1 byte), and a "0" represents
00321 // that the next unit is a <position,length> pair (2 bytes).
00322 //
00323 // code_buf[1..16] stores eight units of code. Since the best
00324 // we can do is store eight <position,length> pairs, at most 16
00325 // bytes are needed to store this.
00326 //
00327 // This is why the maximum size of the code buffer is 17 bytes.
00328
00329 code_buf[0] = 0;
00330 code_buf_pos = 1;
00331
00332 // Mask iterates over the 8 bits in the code buffer. The first
00333 // character ends up being stored in the low bit.
00334 //
00335 // bit 8 7 6 5 4 3 2 1
00336 // | |
00337 // | first sequence in code buffer
00338 // |
00339 // last sequence in code buffer
00340
00341 mask = 1;
00342
00343 s = 0;
00344 r = (short int) N - (short int) F;
00345
00346 // Initialize the ring buffer with spaces...
00347
00348 // Note that the last F bytes of the ring buffer are not filled.
00349 // This is because those F bytes will be filled in immediately
00350 // with bytes from the input stream.
00351
00352 memset(m_ring_buffer, ' ', N - F);
00353
00354 // Read F bytes into the last F bytes of the ring buffer.
00355 //
00356 // This function loads the buffer with X characters and returns
00357 // the actual amount loaded.
00358
00359 len = GetChars((char *) &(m_ring_buffer[r]), F);
00360
00361 // Make sure there is something to be compressed.
00362
00363 if (len == 0)
00364 return;
00365
00366 // Insert the F strings, each of which begins with one or more
00367 // 'space' characters. Note the order in which these strings
00368 // are inserted. This way, degenerate trees will be less likely
00369 // to occur.
00370
00371 for (i = 1; i <= F; i++) {
00372 InsertNode((short int) (r - i));
00373 }
00374
00375 // Finally, insert the whole string just read. The
00376 // member variables match_length and match_position are set.
00377
00378 InsertNode(r);
00379
00380 // Now that we're preloaded, continue till done.
00381
00382 do {
00383
00384 // m_match_length may be spuriously long near the end of
00385 // text.
00386
00387 if (m_match_length > len) {
00388 m_match_length = len;
00389 }
00390
00391 // Is it cheaper to store this as a single character? If so,
00392 // make it so.
00393
00394 if (m_match_length < THRESHOLD) {
00395 // Send one character. Remember that code_buf[0] is the
00396 // set of flags for the next eight items.
00397
00398 m_match_length = 1;
00399 code_buf[0] |= mask;
00400 code_buf[code_buf_pos++] = m_ring_buffer[r];
00401 }
00402
00403 // Otherwise, we do indeed have a string that can be stored
00404 // compressed to save space.
00405
00406 else {
00407 // The next 16 bits need to contain the position (12 bits)
00408 // and the length (4 bits).
00409
00410 code_buf[code_buf_pos++] = (unsigned char) m_match_position;
00411 code_buf[code_buf_pos++] = (unsigned char) (
00412 ((m_match_position >> 4) & 0xf0) |
00413 (m_match_length - THRESHOLD) );
00414 }
00415
00416 // Shift the mask one bit to the left so that it will be ready
00417 // to store the new bit.
00418
00419 mask = (unsigned char) (mask << 1);
00420
00421 // If the mask is now 0, then we know that we have a full set
00422 // of flags and items in the code buffer. These need to be
00423 // output.
00424
00425 if (!mask) {
00426 // code_buf is the buffer of characters to be output.
00427 // code_buf_pos is the number of characters it contains.
00428
00429 SendChars((char *) code_buf, code_buf_pos);
00430
00431 // Reset for next buffer...
00432
00433 code_buf[0] = 0;
00434 code_buf_pos = 1;
00435 mask = 1;
00436 }
00437
00438 last_match_length = m_match_length;
00439
00440 // Delete old strings and read new bytes...
00441
00442 for (i = 0; i < last_match_length; i++) {
00443 // Get next character...
00444
00445 if (GetChars((char *) &c, 1) != 1)
00446 break;
00447
00448 // Delete "old strings"
00449
00450 DeleteNode(s);
00451
00452 // Put this character into the ring buffer.
00453 //
00454 // The original comment here says "If the position is near
00455 // the end of the buffer, extend the buffer to make
00456 // string comparison easier."
00457 //
00458 // That's a little misleading, because the "end" of the
00459 // buffer is really what we consider to be the "beginning"
00460 // of the buffer, that is, positions 0 through F.
00461 //
00462 // The idea is that the front end of the buffer is duplicated
00463 // into the back end so that when you're looking at characters
00464 // at the back end of the buffer, you can index ahead (beyond
00465 // the normal end of the buffer) and see the characters
00466 // that are at the front end of the buffer wihtout having
00467 // to adjust the index.
00468 //
00469 // That is...
00470 //
00471 // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234
00472 // | | |
00473 // position 0 end of buffer |
00474 // |
00475 // duplicate of front of buffer
00476
00477 m_ring_buffer[s] = c;
00478
00479 if (s < F - 1) {
00480 m_ring_buffer[s + N] = c;
00481 }
00482
00483 // Increment the position, and wrap around when we're at
00484 // the end. Note that this relies on N being a power of 2.
00485
00486 s = (short int) ( (s + 1) & (N - 1) );
00487 r = (short int) ( (r + 1) & (N - 1) );
00488
00489 // Register the string that is found in
00490 // m_ring_buffer[r..r+F-1].
00491
00492 InsertNode(r);
00493 }
00494
00495 // If we didn't quit because we hit the last_match_length,
00496 // then we must have quit because we ran out of characters
00497 // to process.
00498
00499 while (i++ < last_match_length) {
00500 DeleteNode(s);
00501
00502 s = (short int) ( (s + 1) & (N - 1) );
00503 r = (short int) ( (r + 1) & (N - 1) );
00504
00505 // Note that len hitting 0 is the key that causes the
00506 // do...while() to terminate. This is the only place
00507 // within the loop that len is modified.
00508 //
00509 // Its original value is F (or a number less than F for
00510 // short strings).
00511
00512 if (--len) {
00513 InsertNode(r); /* buffer may not be empty. */
00514 }
00515 }
00516
00517 // End of do...while() loop. Continue processing until there
00518 // are no more characters to be compressed. The variable
00519 // "len" is used to signal this condition.
00520 } while (len > 0);
00521
00522 // There could still be something in the output buffer. Send it
00523 // now.
00524
00525 if (code_buf_pos > 1) {
00526 // code_buf is the encoded string to send.
00527 // code_buf_ptr is the number of characters.
00528
00529 SendChars((char *) code_buf, code_buf_pos);
00530 }
00531
00532
00533 // must set zlen for parent class to know length of compressed buffer
00534 zlen = zpos;
00535 }
00536
00537
00538 /******************************************************************************
00539 * LZSSCompress::Decode - This function "decodes" the input stream into the
00540 * output stream.
00541 * The GetChars() and SendChars() functions are
00542 * used to separate this method from the actual
00543 * i/o.
00544 */
00545
00546 void LZSSCompress::Decode(void)
00547 {
00548 int k;
00549 int r; // node number
00550 unsigned char c[F]; // an array of chars
00551 unsigned char flags; // 8 bits of flags
00552 int flag_count; // which flag we're on
00553 short int pos; // position in the ring buffer
00554 short int len; // number of chars in ring buffer
00555 unsigned long totalLen = 0;
00556
00557 direct = 1; // set direction needed by parent [Get|Send]Chars()
00558
00559 // Initialize the ring buffer with a common string.
00560 //
00561 // Note that the last F bytes of the ring buffer are not filled.
00562
00563 memset(m_ring_buffer, ' ', N - F);
00564
00565 r = N - F;
00566
00567 flags = (char) 0;
00568 flag_count = 0;
00569
00570 for ( ; ; ) {
00571
00572 // If there are more bits of interest in this flag, then
00573 // shift that next interesting bit into the 1's position.
00574 //
00575 // If this flag has been exhausted, the next byte must
00576 // be a flag.
00577
00578 if (flag_count > 0) {
00579 flags = (unsigned char) (flags >> 1);
00580 flag_count--;
00581 }
00582 else {
00583 // Next byte must be a flag.
00584
00585 if (GetChars((char *) &flags, 1) != 1)
00586 break;
00587
00588 // Set the flag counter. While at first it might appear
00589 // that this should be an 8 since there are 8 bits in the
00590 // flag, it should really be a 7 because the shift must
00591 // be performed 7 times in order to see all 8 bits.
00592
00593 flag_count = 7;
00594 }
00595
00596 // If the low order bit of the flag is now set, then we know
00597 // that the next byte is a single, unencoded character.
00598
00599 if (flags & 1) {
00600 if (GetChars((char *) c, 1) != 1)
00601 break;
00602
00603 if (SendChars((char *) c, 1) != 1) {
00604 totalLen++;
00605 break;
00606 }
00607
00608 // Add to buffer, and increment to next spot. Wrap at end.
00609
00610 m_ring_buffer[r] = c[0];
00611 r = (short int) ( (r + 1) & (N - 1) );
00612 }
00613
00614 // Otherwise, we know that the next two bytes are a
00615 // <position,length> pair. The position is in 12 bits and
00616 // the length is in 4 bits.
00617
00618 else {
00619 // Original code:
00620 // if ((i = getc(infile)) == EOF)
00621 // break;
00622 // if ((j = getc(infile)) == EOF)
00623 // break;
00624 // i |= ((j & 0xf0) << 4);
00625 // j = (j & 0x0f) + THRESHOLD;
00626 //
00627 // I've modified this to only make one input call, and
00628 // have changed the variable names to something more
00629 // obvious.
00630
00631 if (GetChars((char *) c, 2) != 2)
00632 break;
00633
00634 // Convert these two characters into the position and
00635 // length. Note that the length is always at least
00636 // THRESHOLD, which is why we're able to get a length
00637 // of 18 out of only 4 bits.
00638
00639 pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) );
00640
00641 len = (short int) ( (c[1] & 0x0f) + THRESHOLD );
00642
00643 // There are now "len" characters at position "pos" in
00644 // the ring buffer that can be pulled out. Note that
00645 // len is never more than F.
00646
00647 for (k = 0; k < len; k++) {
00648 c[k] = m_ring_buffer[(pos + k) & (N - 1)];
00649
00650 // Add to buffer, and increment to next spot. Wrap at end.
00651
00652 m_ring_buffer[r] = c[k];
00653 r = (short int) ( (r + 1) & (N - 1) );
00654 }
00655
00656 // Add the "len" :characters to the output stream.
00657
00658 if (SendChars((char *) c, len) != (unsigned int)len) {
00659 totalLen += len;
00660 break;
00661 }
00662 }
00663 }
00664 slen = totalLen;
00665 }
1.2.15