/* */
/* Compress using Huffman trees */
/* simple_Huffman v.1.0 (c) 2010 Jan Mojzis */
/* public class: simple_Huffman */
/* main functions: int Cmpress(input, length, output /you allocate/) */
/* int Decompress(input, length, output /DO NOT allocate/) */
/* compressed buffer structure (sizes in bytes) */
/* [1] trees count */
/* trees count +1 x { [1] char [4] ref. count } */
/* [4] uncompressed size */
/* [1-?] compressed stream of bits */
/* Notes: */
/* x program is fast */
/* x last update: 29.7.2010 - comments, class rename */
/* x errors: none known */
/* x program works correctly */
#include <mem.h> // memset
#include <stdlib.h> // qsort
typedef unsigned char BYTE;
class node{
public:
node() {memset(this, 0, sizeof(node));} // constructor, memset is faster than individ.
~node() {if (left) delete left; if (right) delete right;}
node *left, *right;
int count; // ref. count
int code; // code in bits
int codelength; // code length
BYTE chr; // ASCII char, serves in compression and then in writing of frequencies
};
class simple_Huffman{
public:
simple_Huffman() {for (register int i = 0; i < 256; i++) {trees[i] = new node(); leaves[i] = trees[i]; } empty = new node();}
~simple_Huffman() {delete empty; for (register int i = 0; i < treescount; i++) delete trees[i];}
node *trees[256]; // array of trees, here will be only one tree at pos. 0 when finished with merging
node *leaves[256]; // array of leaves, each contains ASCII char,
// ref. count, later code and codelength too. It is a reference to original trees
node *empty; // pseudoleaf, serving in sorting and merging as empty leaf
// avoiding testing on NULL
int Compress(BYTE *input, int inputlength, BYTE *output);
int Decompress(BYTE *input, int inputlength, BYTE *&output); // output is self-allocated!!!
// DO NOT attempt to allocate it!!!
int treescount;
void InitializeLeaves(node *, int, int); // initialization of leaves with codes and code lengths
};
int comparefreq(const void *A, const void *B){ // sorting, we want to sort descending
if((*(node**)A)->count == (*(node**)B)->count) // A->count == B->count ?
return 0;
return (*(node**)A)->count < (*(node**)B)->count ? 1 : -1;// no? so if A->count < B->count give 1 else -1
}
// leaves initialization - set code and codelength to leaves
void simple_Huffman::InitializeLeaves(node *n, int path, int level){ // enter level is 0, enter path is 0
bool leaf = true;
if (n->right){
InitializeLeaves(n->right, path ^ (1 << level), level+1); // recursive part
leaf = false;
}
if (n->left){
InitializeLeaves(n->left, path, level+1); // recursive part
leaf = false;
}
if (leaf){
n->codelength = level; // important in compression and decompression
n->code = path;
}
}
int simple_Huffman::Compress(BYTE *input, int inputlength, BYTE *output){
BYTE *inptr = input, *stop = input+inputlength; // stop - tam je koniec
BYTE *outptr = output;
//get the individ. characters count
while (inptr != stop){
trees[(*inptr)]->count++; // each one char is one tree in fact
trees[(*inptr)]->chr = (*inptr);
inptr++;
}
qsort(trees, 256, sizeof(node*), comparefreq); // sort by references count
for (register int i = 0; i < 256; i++) // get the trees count
if (trees[i]->count > 0)
treescount++;
else break;
(*outptr) = treescount-1; // write down trees count to 1 BYTE so, that 1=0, 256 = 255
outptr++; // go forward
int count; // references count
for (register int i = 0; i < treescount; i++){
(*outptr) = trees[i]->chr; // write char
count = trees[i]->count;
(*(outptr+1)) = count >> 24; // write references c.
(*(outptr+2)) = count >> 16;
(*(outptr+3)) = count >> 8;
(*(outptr+4)) = count;
outptr+=5; // go forward
}
(*outptr) = inputlength >> 24; // write original stream's length
(*(outptr+1)) = inputlength >> 16;
(*(outptr+2)) = inputlength >> 8;
(*(outptr+3)) = inputlength;
outptr+=4;
node *n;
// merge all chars trees into one big (Huffman) tree, on pos. 0 will be this one main tree
while (treescount > 1){ // while trees count is > than > 1
n = new node(); // allocate a new tree
if (trees[treescount-1]->count > trees[treescount-2]->count){
// to the left going the tree with a fewer ref. count
n->right = trees[treescount-1];
n->left = trees[treescount-2];
}
else{
// to the left going the tree with a fewer ref. count
n->right = trees[treescount-2];
n->left = trees[treescount-1];
}
// references count from both trees (and their subtrees) are summed and assigned to new tree
n->count = n->right->count + n->left->count;
trees[treescount-1] = empty; // one tree goes "out", so its position will be assigned to
// default default tree with 0 ref. count
trees[treescount-2] = n;
treescount--; // trees count is decreased by 1
qsort(trees, treescount, sizeof(node*), comparefreq); // again sort
// new sorting must be here, imagine situation, when there is situation: ref. count on the last >
// ref. count on the last but one, in this case, they have to be swapped
}
InitializeLeaves(trees[0], 0,0); // initialize leaves (for writing down the ref. count table & compression)
// compression
inptr = input;
int bitpath, bitswritten, outbitswritten = 0, byteswritten;
register int i, bitcount;
while (inptr != stop){
bitpath = leaves[(*inptr)]->code;
bitcount = leaves[(*inptr)]->codelength;
bitswritten =0;
for (i = 0; i < bitcount; i++){
// bits are written right to left, from LSB to MSB
// when byte is filled up, it cames to next one
(*outptr) ^= ((bitpath >> bitswritten)&1) << outbitswritten;
bitswritten++;
(*(outptr+1)) = '\0';
outptr += ((outbitswritten >> 2) & (outbitswritten >> 1) & outbitswritten) &1; // 0..6 =0, 7=1
outbitswritten++; // bytes written growing with each 8 written bits ^^
outbitswritten&=7;
}
inptr++;
}
byteswritten = outptr - output;
if (bitswritten > 0) // if necessary, increase the bytes written
byteswritten++; // e.g. if written 1 bit, even 1 bit needs whole byte
return byteswritten;
}
int simple_Huffman::Decompress(BYTE *input, int inputlength, BYTE *&output)
{
BYTE *stop = input + inputlength;
BYTE *inptr = input;
treescount = (int)(*inptr); // najprv nacitame treecount
treescount++; // trees count is always +1
inptr++;
// reading table of chars with their ref./freq. count char [1] byte, count [4] bytes
int count;
BYTE znak;
for (register int i = 0; i < treescount; i++)
{
znak = (*(inptr)); // read char from table
count = (int)0^((*(inptr+1))) << 24; // read count from table
count^= ((int)0^((*(inptr+2))) << 16);
count^= ((int)0^((*(inptr+3))) << 8);
count^= (int)(*(inptr+4));
trees[i]->count = count; // set the corresponding tree
trees[i]->chr = znak;
inptr+=5; // go forward
}
// read orig. stream size
int outsize;
outsize = (int)0^(*inptr) << 24; // read uncompressed stream s.
outsize ^= ((int)0^(*(inptr+1)) << 16);
outsize ^= ((int)0^(*(inptr+2)) << 8);
outsize ^= ((int)*(inptr+3));
inptr+=4;
node *n;
while (treescount > 1) // zlucuj stromy, kym neostane len 1
{
n = new node();
if (trees[treescount-1]->count > trees[treescount-2]->count){
// to the left going the tree with a fewer ref. count
n->right = trees[treescount-1];
n->left = trees[treescount-2];
}
else{
// to the left going the tree with a fewer ref. count
n->right = trees[treescount-2];
n->left = trees[treescount-1];
}
n->count = n->right->count + n->left->count;
trees[treescount-1] = empty; // one tree goes "out", so its position will be assigned to
// default default tree with 0 ref. count
trees[treescount-2] = n;
treescount--; // decrease trees count by 1
qsort(trees, treescount, sizeof(node*), comparefreq);
}
InitializeLeaves(trees[0], 0,0); // initialize leaves - set their codes and code lengths
output = new BYTE[outsize]; // allocate output
BYTE *outptr = output;
int bit = 0;
node *nptr ;
int b;
int outbitswritten = 0;
while(inptr != stop){ // dekompress
nptr = trees[0];
while(nptr->codelength == 0){
b = ((*inptr) >> bit) &1;
nptr = (b > 0) ? nptr->right : nptr->left;
inptr+=( (bit >> 2) & (bit >> 1) & bit) & 1;
bit++;
bit&=7;
}
(*outptr) = nptr->chr;
outptr ++;
if (outptr-output == outsize) return outptr-output;
}
return outptr-output;
}