/*
"Text-Based Patricia Trie"
Copyright (C) 2004 by Michael Walter Jaworski
Licensed under the Academic Free License version 1.2
*/
#include "pat.h"
//STATIC HUFFMAN CODE - ONLY CREATED ONCE
//*****************************************************************
//build huffman tree
//based on common letter usage in the English language
//source: UNIX manual pages (find, ls, mv, cp, ...) all added together
//
//e t a o i n s r h l d c u m f p g w y b v k x j q z
//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
//
char PatriciaTrieC::huffman_code [] =
{
/* A-D */
2, //A
19, //B
11, //C
10, //D
/* E-H */
0, //E
14, //F
16, //G
8, //H
/* I-L */
4, //I
23, //J
21, //K
9, //L
/* M-P */
13, //M
5, //N
3, //O
15, //P
/* Q-T */
24, //Q
6, //S
7, //R
1, //T
/* U-X */
12, //U
20, //V
17, //W
22, //X
/* Y-Z */
18, //Y
25 //Z
};
//*****************************************************************
//////////////////////////////////////////////////////////////////////////
keyType PatriciaTrieC::BuildKey (char* _txt)
{
keyType key = 0; //key being built up
register char bit_count = MAX_KEY_BITS; //how many bits have been filled
register char txt_count = 0; //which character are we on
char shift_bits = 0; //how many bits to shift to make space
register char huff_code = 0; //the current huffman code to add on
//for security we copy text string into our own buffer
char txt [MAX_STRING_FOR_KEY] = "\0";
//copy everything up to the end of string or end of buffer
//we ensure that there will be one end of string character at the end
//so we can just check for that instead of the txt_count guy... easier
//
//also, no buffer overrun crap will be allowed
for (char i = 0; i < (MAX_STRING_FOR_KEY-1) && *_txt != '\0'; i++, _txt++)
txt [i] = *_txt;
//convert text to integer
do
{
//if we get to the end of the text string before the key is filled that is okay
//we will just use what we have!
if (txt [txt_count] == END_OF_STRING)
break;
//if big caps, then make it small
if (txt [txt_count] >= 'a' && txt [txt_count] <= 'z')
txt [txt_count] -= LOWER_TO_UPPER_FACTOR;
//skip all weird ass characters (only normal letters are handled)
if (txt [txt_count] < 'A' || txt [txt_count] > 'Z')
{
//go to next character
txt_count += 1;
//try next character
continue;
}
//get number we are adding [HUFF]
huff_code = huffman_code [txt [txt_count]-CONV_FACTOR_ASCII_TO_INT];
//calculate how many bits to shift on to key
do
{
huff_code >>= 1; //shift bits off until we get to zero
shift_bits += 1; //number of bits we need to shift before the add
bit_count -= 1; //track how many bits have been shifted so far
}
//quit when we have counted how many bits are used to represent number
while (huff_code > 0);
//if we do not have enough bits in the key, then quit with what we have
if (bit_count < 0)
break;
//make space to add the next character on (shift zero to add to)
key <<= shift_bits;
//add the next character on
key += huffman_code [txt [txt_count]-CONV_FACTOR_ASCII_TO_INT];
//we added one letter
txt_count += 1;
}
//we will probably quit with the break, but just to make sure we have this
//to kill the loop, ...when we have added as many bits as possible
//
//notice that if bit_count is 0 it will not break, but we should not continue
//so this little check saves us some stupidity
while (bit_count > 0);
//return the formed key
return key;
}
//_n - the current node to insert at
//_payload - the class to associate with the string of text
//_key - the integer key to store (or text)
//_bp - the position of the bit in the key to check against
//////////////////////////////////////////////////////////////////////////
PatriciaNodeC* PatriciaTrieC::insert (PatriciaNodeC*& _n, ASSOCIATED_CLASS*& _payload, keyType _key, bitPositionType _bp)
{
//empty pointer from an internal node
if (!_n)
{
//auto leaf
_n = new PatriciaNodeC (_payload, _key);
return _n;
}
//if no more bits to discern, then you cannot insert
else if (_bp > MAX_KEY_BITS)
{
//was already inserted or we do not have the appropriate number of bits to differentiate from
//what ever was stored (obviously there is something else down there that took the max bits
//to store and we need more bits to tell them apart!)
return EMPTY_NODE;
}
//we hit a leaf and need to store the leaf and the new node too
else if (_n->is_leaf)
{
//if you tried to insert duplicates, then do not insert anything else
//
//without this duplicates with create an immediate depth of the number of bits in our key
//because they will be the same until _bp is greater than MAX_KEY_BITS (above) so even though
//it is a whole key comparison, it will save on the average case (a lot)
//
if (_n->leaf.key == _key)
return EMPTY_NODE;
register PatriciaNodeC* _original = _n;
//split leaf to internal
_n = new PatriciaNodeC ();
//save what was there in the appropriate child
if (IsBitOn (_original->leaf.key, _bp))
_n->internal.right = _original;
else
_n->internal.left = _original;
//we placed the leaf in an appropriate position and will
//now continue with our new internal node.
}
//try left (if last bit on key is not zero)
if (IsBitOn (_key, _bp))
return insert (_n->internal.right, _payload, _key, (bitPositionType)(_bp+1));
//if last bit on key is zero then go right
else
return insert (_n->internal.left, _payload, _key, (bitPositionType)(_bp+1));
}
//////////////////////////////////////////////////////////////////////////
ASSOCIATED_CLASS* PatriciaTrieC::search (PatriciaNodeC* _n, keyType _key, bitPositionType _bp)
{
//if tree is empty
if (!_n)
return EMPTY_NODE;
//if we found a leaf, then this is either it or it does not exist
else if (_n->is_leaf)
{
//check if we found it or not
if (_n->leaf.key == _key)
//found it
return _n->leaf.payload;
else
//does not exist
return EMPTY_NODE;
}
//try left (if last bit on key is one)
else if (IsBitOn (_key, _bp))
return search (_n->internal.right, _key, (bitPositionType)(_bp+1));
//if last bit on key is zero then go right
else
return search (_n->internal.left, _key, (bitPositionType)(_bp+1));
}
//////////////////////////////////////////////////////////////////////////
ASSOCIATED_CLASS* PatriciaTrieC::remove (PatriciaNodeC*& _n, keyType _key, bitPositionType _bp)
{
//to remember payload so we can also tail recurse through
//the list and remove any internal nodes that are not used
//in the tree
ASSOCIATED_CLASS* tmp = EMPTY_NODE;
//if tree is empty
if (!_n)
return EMPTY_NODE;
else if (_n->is_leaf)
{
ASSOCIATED_CLASS* send_payload_back = _n->leaf.payload;
//because in this case we do not want the payload deleted
//do the node class will not wax it, we are passing it back
_n->leaf.payload = EMPTY_NODE;
delete _n;
_n = EMPTY_NODE;
return send_payload_back;
}
//try left (if last bit on key is one)
else if (IsBitOn (_key, _bp))
//save so we can tail recurse
tmp = remove (_n->internal.right, _key, (bitPositionType)(_bp+1));
//if last bit on key is zero then go right
else
//save so we can tail recurse
tmp = remove (_n->internal.left, _key, (bitPositionType)(_bp+1));
//clean up any unused internal nodes on your way up
////////////////////////////////////////////////////////////////
if (!_n->is_leaf && !_n->internal.left && !_n->internal.right)
{
delete _n;
_n = EMPTY_NODE;
}
////////////////////////////////////////////////////////////////
//return the payload
return tmp;
}
//////////////////////////////////////////////////////////////////////////
void PatriciaTrieC::clear (PatriciaNodeC*& kill_me)
{
//quit if NULL
if (!kill_me)
{
return;
}
//go down every branch
else
{
if (!kill_me->is_leaf)
{
if (kill_me->internal.left)
clear (kill_me->internal.left);
if (kill_me->internal.right)
clear (kill_me->internal.right);
}
delete kill_me;
kill_me = EMPTY_NODE;
}
//we assume anything they wanted they got
//we cannot assume we are allowed to delete the payload, could be
//static..!
}
//
//
//
//
//
//
//
//
//
//
//
//
#ifdef DEBUG
//////////////////////////////////////////////////////////////////////////
void PatriciaTrieC::print (PatriciaNodeC* _n, keyType _bp)
{
if (!_n)
return;
register unsigned int i = 0;
for (i = 0; i < _bp; i++)
cout << '.';
cout << "root " << _n << endl;
if (_n->is_leaf)
cout << _n->leaf.key << endl;
else
{
for (i = 0; i < _bp; i++)
cout << '.';
cout << "left " << _n->internal.left << endl;
print (_n->internal.left, _bp+1);
for (i = 0; i < _bp; i++)
cout << '.';
cout << "right " << _n->internal.right << endl;
print (_n->internal.right, _bp+1);
}
}
#endif