Click here to Skip to main content
15,885,546 members
Articles / Programming Languages / C++

Patricia and Huffman, Sitting in a Trie

Rate me:
Please Sign up or sign in to vote.
4.54/5 (24 votes)
27 Dec 200410 min read 137.2K   2.2K   39  
Article demonstrates a text-based Patricia trie and adds new text-compression features.
/*

"Text-Based Patricia Trie"

Copyright (C) 2004 by Michael Walter Jaworski

Licensed under the Academic Free License version 1.2

*/

#include "pat.h"

//STATIC HUFFMAN CODE - ONLY CREATED ONCE
//*****************************************************************
//build huffman tree
//based on common letter usage in the English language
//source: UNIX manual pages (find, ls, mv, cp, ...) all added together
//
//e t a o i n s r h l d  c  u  m  f  p  g  w  y  b  v  k  x  j  q  z
//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
//
char PatriciaTrieC::huffman_code [] = 
{
/* A-D */
2,			//A
19,			//B
11,			//C
10,			//D

/* E-H */
0,			//E
14,			//F
16,			//G
8,			//H

/* I-L */
4,			//I
23,			//J
21,			//K
9,			//L

/* M-P */
13,			//M
5,			//N
3,			//O
15,			//P

/* Q-T */
24,			//Q
6,			//S
7,			//R
1,			//T

/* U-X */
12,			//U
20,			//V
17,			//W
22,			//X

/* Y-Z */
18,			//Y
25			//Z
};
//*****************************************************************


//////////////////////////////////////////////////////////////////////////
keyType PatriciaTrieC::BuildKey (char* _txt)
{
	keyType key = 0;														//key being built up
	register char bit_count = MAX_KEY_BITS;			//how many bits have been filled
	register char txt_count = 0;								//which character are we on
	char shift_bits = 0;												//how many bits to shift to make space
	register char huff_code = 0;								//the current huffman code to add on

	//for security we copy text string into our own buffer
	char txt [MAX_STRING_FOR_KEY] = "\0";

	//copy everything up to the end of string or end of buffer
	//we ensure that there will be one end of string character at the end
	//so we can just check for that instead of the txt_count guy... easier
	//
	//also, no buffer overrun crap will be allowed
	for (char i = 0; i < (MAX_STRING_FOR_KEY-1) && *_txt != '\0'; i++, _txt++)
		txt [i] = *_txt;

	//convert text to integer
	do
	{
		//if we get to the end of the text string before the key is filled that is okay
		//we will just use what we have!
		if (txt [txt_count] == END_OF_STRING)
			break;

		//if big caps, then make it small
		if (txt [txt_count] >= 'a' && txt [txt_count] <= 'z')
			txt [txt_count] -= LOWER_TO_UPPER_FACTOR;

		//skip all weird ass characters (only normal letters are handled)
		if (txt [txt_count] < 'A' || txt [txt_count] > 'Z')
		{
			//go to next character
			txt_count += 1;

			//try next character
			continue;
		}

		//get number we are adding [HUFF]
		huff_code = huffman_code [txt [txt_count]-CONV_FACTOR_ASCII_TO_INT];

		//calculate how many bits to shift on to key
		do
		{
			huff_code >>= 1;				//shift bits off until we get to zero
			shift_bits += 1;				//number of bits we need to shift before the add
			bit_count -= 1;					//track how many bits have been shifted so far
		} 
		//quit when we have counted how many bits are used to represent number
		while (huff_code > 0);

		//if we do not have enough bits in the key, then quit with what we have
		if (bit_count < 0)
			break;

		//make space to add the next character on (shift zero to add to)
		key <<= shift_bits;

		//add the next character on
		key += huffman_code [txt [txt_count]-CONV_FACTOR_ASCII_TO_INT];

		//we added one letter
		txt_count += 1;
	}
	//we will probably quit with the break, but just to make sure we have this
	//to kill the loop, ...when we have added as many bits as possible
	//
	//notice that if bit_count is 0 it will not break, but we should not continue
	//so this little check saves us some stupidity
	while (bit_count > 0);

	//return the formed key
	return key;
}

//_n - the current node to insert at
//_payload - the class to associate with the string of text
//_key - the integer key to store (or text)
//_bp - the position of the bit in the key to check against
//////////////////////////////////////////////////////////////////////////
PatriciaNodeC* PatriciaTrieC::insert (PatriciaNodeC*& _n, ASSOCIATED_CLASS*& _payload, keyType _key, bitPositionType _bp)
{
	//empty pointer from an internal node
	if (!_n)
	{
		//auto leaf
		_n = new PatriciaNodeC (_payload, _key);
		return _n;
	}
	
	//if no more bits to discern, then you cannot insert
	else if (_bp > MAX_KEY_BITS)
	{
		//was already inserted or we do not have the appropriate number of bits to differentiate from
		//what ever was stored (obviously there is something else down there that took the max bits
		//to store and we need more bits to tell them apart!)
		return EMPTY_NODE;
	}

	//we hit a leaf and need to store the leaf and the new node too
	else if (_n->is_leaf)
	{

		//if you tried to insert duplicates, then do not insert anything else
		//
		//without this duplicates with create an immediate depth of the number of bits in our key
		//because they will be the same until _bp is greater than MAX_KEY_BITS (above) so even though
		//it is a whole key comparison, it will save on the average case (a lot)
		//
		if (_n->leaf.key == _key)
			return EMPTY_NODE;

		register PatriciaNodeC* _original = _n;

		//split leaf to internal
		_n = new PatriciaNodeC ();

		//save what was there in the appropriate child
		if (IsBitOn (_original->leaf.key, _bp))
			_n->internal.right = _original;
			
		else
			_n->internal.left = _original;

		//we placed the leaf in an appropriate position and will
		//now continue with our new internal node.
	}

	//try left (if last bit on key is not zero)
	if (IsBitOn (_key, _bp))
		return insert (_n->internal.right, _payload, _key, (bitPositionType)(_bp+1));
	
	//if last bit on key is zero then go right
	else
		return insert (_n->internal.left, _payload, _key, (bitPositionType)(_bp+1));
}

//////////////////////////////////////////////////////////////////////////
ASSOCIATED_CLASS* PatriciaTrieC::search (PatriciaNodeC* _n, keyType _key, bitPositionType _bp)
{
	//if tree is empty
	if (!_n) 
		return EMPTY_NODE;

	//if we found a leaf, then this is either it or it does not exist
	else if (_n->is_leaf)
	{
		//check if we found it or not
		if (_n->leaf.key == _key)
			//found it
			return _n->leaf.payload;

		else
			//does not exist
			return EMPTY_NODE;
	}

	//try left (if last bit on key is one)
	else if (IsBitOn (_key, _bp))
		return search (_n->internal.right, _key, (bitPositionType)(_bp+1));

	//if last bit on key is zero then go right
	else
		return search (_n->internal.left, _key, (bitPositionType)(_bp+1));
}

//////////////////////////////////////////////////////////////////////////
ASSOCIATED_CLASS* PatriciaTrieC::remove (PatriciaNodeC*& _n, keyType _key, bitPositionType _bp)
{
	//to remember payload so we can also tail recurse through
	//the list and remove any internal nodes that are not used
	//in the tree
	ASSOCIATED_CLASS* tmp = EMPTY_NODE;

	//if tree is empty
	if (!_n) 
		return EMPTY_NODE;

	else if (_n->is_leaf)
	{
		ASSOCIATED_CLASS* send_payload_back = _n->leaf.payload;

		//because in this case we do not want the payload deleted
		//do the node class will not wax it, we are passing it back
		_n->leaf.payload = EMPTY_NODE;
		delete _n;
		_n = EMPTY_NODE;
		return send_payload_back;
	}

	//try left (if last bit on key is one)
	else if (IsBitOn (_key, _bp))
		//save so we can tail recurse
		tmp = remove (_n->internal.right, _key, (bitPositionType)(_bp+1));

	//if last bit on key is zero then go right
	else
		//save so we can tail recurse
		tmp = remove (_n->internal.left, _key, (bitPositionType)(_bp+1));

	//clean up any unused internal nodes on your way up
	////////////////////////////////////////////////////////////////
	if (!_n->is_leaf && !_n->internal.left && !_n->internal.right)
	{
		delete _n;
		_n = EMPTY_NODE;
	}
	////////////////////////////////////////////////////////////////

	//return the payload
	return tmp;
}

//////////////////////////////////////////////////////////////////////////
void PatriciaTrieC::clear (PatriciaNodeC*& kill_me)
{
	//quit if NULL
	if (!kill_me)
	{
		return;
	}
	//go down every branch
	else 
	{
		if (!kill_me->is_leaf)
		{
			if (kill_me->internal.left)
				clear (kill_me->internal.left);
			
			if (kill_me->internal.right)
				clear (kill_me->internal.right);
		}
		
		delete kill_me;
		kill_me = EMPTY_NODE;
	}
	//we assume anything they wanted they got
	//we cannot assume we are allowed to delete the payload, could be
	//static..!
}


//
//
//
//
//
//
//
//
//
//
//
//
#ifdef DEBUG

	//////////////////////////////////////////////////////////////////////////
	void PatriciaTrieC::print (PatriciaNodeC* _n, keyType _bp)
	{
		if (!_n)
			return;

		register unsigned int i = 0;

		for (i = 0; i < _bp; i++)
			cout << '.';

		cout << "root " << _n << endl;

		if (_n->is_leaf)
			cout << _n->leaf.key << endl;

		else
		{
			for (i = 0; i < _bp; i++)
				cout << '.';
			cout << "left " << _n->internal.left << endl;

			print (_n->internal.left, _bp+1);

			for (i = 0; i < _bp; i++)
				cout << '.';
			cout << "right " << _n->internal.right << endl;

			print (_n->internal.right, _bp+1);
		}
	}

#endif

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Software Developer
United Kingdom United Kingdom
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions