Click here to Skip to main content
15,895,011 members
Articles / Programming Languages / C

Fast and Simple Huffman Compressor

Rate me:
Please Sign up or sign in to vote.
4.80/5 (16 votes)
2 Apr 2012CPOL5 min read 76.2K   4.8K   38  
Fast and simple huffman compressor
/*                                                                            */
/*    Compress using Huffman trees                                            */
/*    simple_Huffman                v.1.0     (c) 2010 Jan Mojzis             */
/*    public class: simple_Huffman                                            */
/*    main functions: int Cmpress(input, length, output /you allocate/)       */
/*                    int Decompress(input, length, output /DO NOT allocate/) */
/*    compressed buffer structure (sizes in bytes)                            */
/*       [1] trees count                                                      */
/*       trees count +1 x { [1] char [4] ref. count }                         */
/*       [4] uncompressed size                                                */
/*       [1-?] compressed stream of bits                                      */
/*    Notes:                                                                  */
/*       x program is fast                                                    */
/*       x last update:  29.7.2010 - comments, class rename                   */
/*       x errors: none known                                                 */
/*       x program works correctly                                            */

#include <mem.h>     // memset
#include <stdlib.h>  // qsort
typedef unsigned char BYTE;
class node{
  public:
    node() {memset(this, 0, sizeof(node));} // constructor, memset is faster than individ.
    ~node() {if (left) delete left; if (right) delete right;}
  node *left, *right;
  int count;       // ref. count
  int code;        // code in bits
  int codelength;  // code length
  BYTE chr;        // ASCII char, serves in compression and then in writing of frequencies
};
class simple_Huffman{
  public:    
    simple_Huffman() {for (register int i = 0; i < 256; i++) {trees[i] = new node(); leaves[i] = trees[i]; } empty = new node();}
    ~simple_Huffman() {delete empty; for (register int i = 0; i < treescount; i++) delete trees[i];}
  node *trees[256];  // array of trees, here will be only one tree at pos. 0 when finished with merging 
  node *leaves[256]; // array of leaves, each contains ASCII char,
                     // ref. count, later code and codelength too. It is a reference to original trees
  node *empty;       // pseudoleaf, serving in sorting and merging as empty leaf
                     // avoiding testing on NULL
  int Compress(BYTE *input, int inputlength, BYTE *output);
  int Decompress(BYTE *input, int inputlength, BYTE *&output); // output is self-allocated!!!
                                                               // DO NOT attempt to allocate it!!!
  int treescount;
  void InitializeLeaves(node *, int, int); // initialization of leaves with codes and code lengths
};
int comparefreq(const void *A, const void *B){ // sorting, we want to sort descending
	if((*(node**)A)->count == (*(node**)B)->count)     // A->count == B->count ?
		return 0;
	return (*(node**)A)->count < (*(node**)B)->count ? 1 : -1;// no? so if A->count < B->count give 1 else -1
}
// leaves initialization - set code and codelength to leaves
void simple_Huffman::InitializeLeaves(node *n, int path, int level){ // enter level is 0, enter path is 0
  bool leaf = true;
  if (n->right){      
     InitializeLeaves(n->right, path ^ (1 << level), level+1); // recursive part
     leaf = false;                                             
  }       
  if (n->left){
     InitializeLeaves(n->left, path, level+1);   // recursive part
     leaf = false;                               
  }  
  if (leaf){
      n->codelength = level;  // important in compression and decompression
      n->code = path;         
  }               
}
int simple_Huffman::Compress(BYTE *input, int inputlength, BYTE *output){
  BYTE *inptr = input, *stop = input+inputlength;  // stop - tam je koniec
  BYTE *outptr = output;
  //get the individ. characters count
  while (inptr != stop){
    trees[(*inptr)]->count++;    // each one char is one tree in fact
    trees[(*inptr)]->chr = (*inptr);        
    inptr++;
  }
  qsort(trees, 256, sizeof(node*), comparefreq); // sort by references count
  for (register int i = 0; i < 256; i++) // get the trees count
   if (trees[i]->count > 0)
      treescount++;      
   else break; 
  (*outptr) = treescount-1; // write down trees count to 1 BYTE so, that 1=0, 256 = 255
  outptr++;  // go forward
  int count; // references count
  for (register int i = 0; i < treescount; i++){
    (*outptr) = trees[i]->chr;  // write char
    count = trees[i]->count;
    (*(outptr+1)) =  count >> 24;  // write references c.
    (*(outptr+2)) =  count >> 16;
    (*(outptr+3)) =  count >> 8;
    (*(outptr+4)) =  count;
    outptr+=5;         // go forward
  }  
  (*outptr)     =  inputlength >> 24;  // write original stream's length
  (*(outptr+1)) =  inputlength >> 16;
  (*(outptr+2)) =  inputlength >> 8;
  (*(outptr+3)) =  inputlength;
  
  outptr+=4;
  node *n;
  // merge all chars trees into one big (Huffman) tree, on pos. 0 will be this one main tree
  while (treescount > 1){ // while trees count is > than > 1
      n = new node(); // allocate a new tree
      if (trees[treescount-1]->count > trees[treescount-2]->count){
        // to the left going the tree with a fewer ref. count
         n->right = trees[treescount-1];
         n->left = trees[treescount-2];
      }   
      else{
        // to the left going the tree with a fewer ref. count
         n->right = trees[treescount-2];
         n->left = trees[treescount-1];
      }
      // references count from both trees (and their subtrees) are summed and assigned to new tree
      n->count = n->right->count + n->left->count;      
      trees[treescount-1] = empty;  // one tree goes "out", so its position will be assigned to
                                    // default default tree with 0 ref. count
      trees[treescount-2] = n;
      treescount--;               // trees count is decreased by 1
      qsort(trees, treescount, sizeof(node*), comparefreq); // again sort
      // new sorting must be here, imagine situation, when there is situation: ref. count on the last > 
      // ref. count on the last but one, in this case, they have to be swapped
  }
  InitializeLeaves(trees[0], 0,0); // initialize leaves (for writing down the ref. count table & compression)
  
  // compression
  inptr = input;  
  int bitpath, bitswritten, outbitswritten = 0, byteswritten;
  register int i, bitcount;
  while (inptr != stop){
    bitpath = leaves[(*inptr)]->code;
    bitcount = leaves[(*inptr)]->codelength;
    bitswritten =0;
    for (i = 0; i < bitcount; i++){
       // bits are written right to left, from LSB to MSB
       // when byte is filled up, it cames to next one
       (*outptr) ^= ((bitpath >> bitswritten)&1) << outbitswritten;
       bitswritten++;
       (*(outptr+1)) = '\0';
       outptr += ((outbitswritten >> 2) & (outbitswritten >> 1) & outbitswritten) &1; // 0..6 =0,  7=1
       outbitswritten++; // bytes written growing with each 8 written bits ^^
       outbitswritten&=7;
    }         
    inptr++;
  }
  byteswritten = outptr - output;
  if (bitswritten > 0) // if necessary, increase the bytes written
     byteswritten++;   // e.g. if written 1 bit, even 1 bit needs whole byte
  return byteswritten;   
}
int simple_Huffman::Decompress(BYTE *input, int inputlength, BYTE *&output)
{
  BYTE *stop = input + inputlength;
  BYTE *inptr = input;
  treescount = (int)(*inptr);  // najprv nacitame treecount  
  treescount++; // trees count is always +1
  inptr++;
  // reading table of chars with their ref./freq. count char [1] byte, count [4] bytes
  int count;
  BYTE znak;
  for (register int i = 0; i < treescount; i++)
  {      
    znak = (*(inptr));  // read char from table
    count = (int)0^((*(inptr+1))) << 24; // read count from table
    count^= ((int)0^((*(inptr+2))) << 16);
    count^= ((int)0^((*(inptr+3))) << 8);
    count^= (int)(*(inptr+4));
    trees[i]->count = count;  // set the corresponding tree
    trees[i]->chr = znak;
    inptr+=5; // go forward
  }
  // read orig. stream size
  int outsize;
  outsize = (int)0^(*inptr) << 24;  // read uncompressed stream s.
  outsize ^= ((int)0^(*(inptr+1)) << 16);
  outsize ^= ((int)0^(*(inptr+2)) << 8);
  outsize ^= ((int)*(inptr+3));
  inptr+=4;
  node *n;
  while (treescount > 1)  // zlucuj stromy, kym neostane len 1
  {
      n = new node();
      if (trees[treescount-1]->count > trees[treescount-2]->count){
        // to the left going the tree with a fewer ref. count
         n->right = trees[treescount-1];
         n->left = trees[treescount-2];
      }   
      else{
        // to the left going the tree with a fewer ref. count
         n->right = trees[treescount-2];
         n->left = trees[treescount-1];
      }
      n->count = n->right->count + n->left->count;      
      trees[treescount-1] = empty;  // one tree goes "out", so its position will be assigned to
                                    // default default tree with 0 ref. count
      trees[treescount-2] = n;
      treescount--;   // decrease trees count by 1   
      qsort(trees, treescount, sizeof(node*), comparefreq);
  }
  InitializeLeaves(trees[0], 0,0);  // initialize leaves - set their codes and code lengths
  output = new BYTE[outsize]; // allocate output
  BYTE *outptr = output;
  int bit = 0;
  node *nptr ;
  int b;
  int outbitswritten = 0;
  while(inptr != stop){  // dekompress
      nptr = trees[0];
      while(nptr->codelength == 0){
        b = ((*inptr) >> bit) &1;
        nptr = (b > 0) ? nptr->right :  nptr->left;
        inptr+=( (bit >> 2) & (bit >> 1) & bit) & 1;
        bit++;        
        bit&=7;
      }  
      (*outptr) = nptr->chr;      
      outptr ++;
      if (outptr-output == outsize) return outptr-output;
  }
  return outptr-output;  
}   


By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer (Junior) SOMI Systems
Slovakia Slovakia
Got 1st and 2nd degree from computer science in 2011 on university of Matej Bel, city of Banska Bystrica. Currently work for computer-oriented company SOMI Systems in Slovakia.
Working for SOMI Systems a.s., http://somi.sk/

Comments and Discussions