Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

How to build an in-memory state engine at runtime

, 28 Oct 2007 CPOL
A C++ template for an efficient in-memory state engine.
tag_tree.zip
cp
Debug
TagTree.exe
sources
algorithm
container
general
mpl
tag_tree.pdf
//	Copyright � 2007, solosTec
//  All rights reserved.
//---------------------------------------------------------------------
//	License for redistribution is the GNU General Public License v2.
//	See the included readme.txt for details.
//---------------------------------------------------------------------
//	developed by solosTec
//	http://www.solostec.de
//---------------------------------------------------------------------
//
//	$Author: $
//	$Revision: $
/*	$Log: $
*/
//---------------------------------------------------------------------
/**
*	Fast detecting of of tags or keywords.
 */
#pragma once
#include <boost/noncopyable.hpp>
#include <string>
#include <memory>
#include <boost/static_assert.hpp>
#include <boost/assert.hpp>
#include <boost/ptr_container/ptr_list.hpp>
#include <boost/ptr_container/ptr_array.hpp>
#include <iostream>
#include <algorithm>
#include <algorithm/hash.h>
#include <general/initialize.h>
#include <ostream>

#pragma warning( push )
#pragma warning( disable : 4996 )	//	unsafe methods _SCL_SECURE_NO_WARNINGS

namespace turban {
namespace container {

//	+-----------------------------------------------------------------+
//	| tag_tree_base [declaration]
//	+-----------------------------------------------------------------+
/**
*	Define some <code>typedefs</code> and access methods for all node
*	values.
*	The <code>tag_tree_base</code> is not restricted to strings. You
*	can use any string-like container which defines 
*	a <code>const_iterator</code> and a <code>value_type</code> type.
*
*	@see turban::general::initialize
*/
template 
< 
	typename TString,
	typename TValue,
	typename TReset
>
class tag_tree_base 
	: private boost::noncopyable
{
public:
	typedef TString								String_;
	typedef typename String_::const_iterator	char_const_iterator;
	typedef typename String_::value_type		char_type;
	typedef std::basic_ostream< char_type >		ostream_type_;
	typedef TValue								value_type;
	typedef TReset								ValueReset_;

public:
	tag_tree_base( char_type );
	~tag_tree_base();
	value_type& getValue();
	const value_type& getValue() const;

protected:
	bool isGlyph( char_type ) const;
	bool isLeaf() const;
	char_type getGlyph() const;
	bool resetValue( const value_type& );

private:
	const char_type		glyph_;
	value_type			value_;
	bool				leaf_;	//!	marks this node as leaf e.g. contains a valid value.

	static ValueReset_	valueReset_;
};

//	+-----------------------------------------------------------------+
//	| tag_tree_base [initialization]
//	+-----------------------------------------------------------------+
template < typename TString, typename TValue, typename TReset >
typename tag_tree_base< TString, TValue, TReset >::ValueReset_	
tag_tree_base< TString, TValue, TReset >::valueReset_ = tag_tree_base< TString, TValue, TReset >::ValueReset_();

//	+-----------------------------------------------------------------+
//	| tag_tree_base [implementation]
//	+-----------------------------------------------------------------+
/**
*	@param c The character this node is representing.
*/
template < typename TString, typename TValue, typename TReset >
tag_tree_base< TString, TValue, TReset >::tag_tree_base( char_type c )
	: glyph_( c )
	, leaf_( false )
{
	valueReset_( value_ );
#ifdef _DEBUG
	//std::cout
	//	<< "tag_tree_base( "
	//	<< glyph_
	//	<< " )"
	//	<< std::endl
	//	;
#endif
}

template < typename TString, typename TValue, typename TReset >
tag_tree_base< TString, TValue, TReset >::~tag_tree_base()
{
#ifdef _DEBUG
	//std::cout
	//	<< "clear: "
	//	<< glyph_
	//	<< " = "
	//	<< value_
	//	<< std::endl
	//	;
#endif
}

template < typename TString, typename TValue, typename TReset >
bool tag_tree_base< TString, TValue, TReset >::isGlyph( char_type c ) const
{
	return glyph_ == c;
}

template < typename TString, typename TValue, typename TReset >
bool tag_tree_base< TString, TValue, TReset >::isLeaf() const
{
	return leaf_;
}

template < typename TString, typename TValue, typename TReset >
typename tag_tree_base< TString, TValue, TReset >::char_type tag_tree_base< TString, TValue, TReset >::getGlyph() const
{
	return glyph_;
}

template < typename TString, typename TValue, typename TReset >
bool tag_tree_base< TString, TValue, TReset >::resetValue( const value_type& value )
{
	if (!leaf_)
	{
		valueReset_( value_, value );
		leaf_ = true;
		return true;
	}
	return false;
}

/**
*	@return value of this node.
*/
template < typename TString, typename TValue, typename TReset >
typename tag_tree_base< TString, TValue, TReset >::value_type& tag_tree_base< TString, TValue, TReset >::getValue()
{
	return value_;
}

/**
*	@see getValue()
*/
template < typename TString, typename TValue, typename TReset >
const typename tag_tree_base< TString, TValue, TReset >::value_type& tag_tree_base< TString, TValue, TReset >::getValue() const
{
	return value_;
}

//	+-----------------------------------------------------------------+
//	| tag_tree [declaration]
//	+-----------------------------------------------------------------+
/**
*	tag_tree is desiged for usage in a streaming parser. tag_tree can 
*	help you to build large state engines in memory, since each node 
*	is representing a unique parser state.
*
*	@see turban::general::initialize
*	@see turban::algorithm::hash
*/
template 
< 
	typename TString	= std::string, 
	typename TValue		= unsigned, 
	typename THash		= algorithm::hash< TString >, 
	typename TReset		= general::initialize< TValue >, 
	typename TAlloc		= std::allocator< TString > 
>
class tag_tree 
	: public tag_tree_base< TString, TValue, TReset >
{
public:
	typedef tag_tree< TString, TValue, THash, TReset, TAlloc >		MyType_;
	typedef tag_tree_base< TString, TValue, TReset >				MyBase_;
	typedef MyType_								*MyPtr_;
	typedef MyType_								&MyRef_;
	typedef THash								Policy_;

	typedef typename TAlloc::template
		rebind<MyType_>::other					MyAlloc_;

	struct MyHeapCloneAllocator
	{
		typedef MyType_	AllocOwner_;

        template< class U >
        static void deallocate_clone( const U* r )
        {
			boost::delete_clone( r );
			BOOST_STATIC_ASSERT( false );
        }
        template<>
        static void deallocate_clone<MyType_>( const MyType_* r )
        {
			MyType_*x = const_cast <MyType_*> (r);
			AllocOwner_::alloc_.deallocate( x, 1 );
        }
	};

	typedef boost::ptr_array< MyType_, Policy_::MAP_SIZE, MyHeapCloneAllocator >	NodeMap_;
	typedef typename NodeMap_::iterator												NodeMapIter_;
	typedef typename NodeMap_::const_iterator										NodeMapConstIter_;
	typedef boost::ptr_list< MyType_, MyHeapCloneAllocator, MyAlloc_ >				NodeList_;
	typedef typename NodeList_::iterator											NodeListIter_;
	typedef typename NodeList_::const_iterator										NodeListConstIter_;


private:
	class MatchingPredicate
	{
	public:
		MatchingPredicate( char_type );
		MatchingPredicate( const MatchingPredicate& );
		bool operator()( const MyType_& ) const;
	private:
		char_type	c_;
	};

public:
	tag_tree( char_type = std::string::npos );
	~tag_tree();
	bool insertKeyWord( char_const_iterator, char_const_iterator, const value_type& );
	bool insertKeyWord( const String_&, const value_type& );
	MyPtr_ findNode( char_type );
	const MyPtr_ findNode( char_type ) const;

	bool empty() const;
	size_t leaf_count( bool = false ) const;
	void dump( ostream_type_&, size_t = 0L, size_t = 0L ) const;

	//	the following methods are O(n) -> slow
	value_type lookUp( char_const_iterator, char_const_iterator ) const;
	value_type lookUp( const String_& ) const;

private:
	MyPtr_ getNode( char_type );
	MyPtr_ searchNode( char_type, bool );
	MyPtr_ createNode( char_type );

private:
	NodeMap_			nodeMap_;
	NodeList_			nodeList_;

	static Policy_		policy_;
	static MyAlloc_		alloc_;

public:
#ifdef _UNIT_TEST
	static bool unit_test();
#endif	//	_UNIT_TEST
};

//	+-----------------------------------------------------------------+
//	| tag_tree [initialization]
//	+-----------------------------------------------------------------+
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::Policy_	
tag_tree< TString, TValue, THash, TReset, TAlloc >::policy_;

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyAlloc_	
tag_tree< TString, TValue, THash, TReset, TAlloc >::alloc_;

//	+-----------------------------------------------------------------+
//	| tag_tree [definition]
//	+-----------------------------------------------------------------+
/**
*	Setting all pointers to NULL is done by container.
*
*	@param c The character this node is representing.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::tag_tree( char_type c )
	: MyBase_( c )
{}

/**
*	Deallocating of memory will be done by the containers.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::~tag_tree()
{
	//	free memory is done by container
}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::insertKeyWord( char_const_iterator from, char_const_iterator to, const value_type& value )
{
	BOOST_ASSERT(from != to);

#ifdef _DEBUG
	//std::cout
	//	<< *from
	//	<< std::endl;
#endif

	MyPtr_ p = getNode( *from );
	BOOST_ASSERT( p != NULL );
	BOOST_ASSERT( p->isGlyph( *from ));

	if ((from + 1) == to)
	{
#ifdef _DEBUG
		//std::cout
		//	<< "insert value: "
		//	<< *from
		//	<< " = "
		//	<< value
		//	<< std::endl;
#endif
		//	insert value
		return p->resetValue( value );
	}

	return p->insertKeyWord( ++from, to, value );
}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::insertKeyWord( const String_& s, const value_type& value )
{
#ifdef _DEBUG
		//std::cout
		//	<< "insert "
		//	<< s.c_str()
		//	<< std::endl
		//	;
#endif
	return insertKeyWord( s.begin(), s.end(), value );
}

/**
*	@param c character to find.
*	@return a pointer to a node containing the specified character <code>c</code> or NULL.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::findNode( char_type c )
{
	size_t index( policy_( c ));
	if (nodeMap_.is_null( index ))
	{
		return NULL;
	}

	MyRef_ r = nodeMap_[ index ];
	return (r.isGlyph( c ))
		? &r
		: r.searchNode( c, false );

}

/**
*	Calls the <code>non-const</code> variant of this method.
*
*	@param c character to find.
*	@return a pointer to a node containing the specified character <code>c</code> or NULL.
*	@see findNode( char_type ) const
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
const typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::findNode( char_type c ) const
{
	//	prevent recursive calls
	const MyPtr_ This( this );
	return const_cast <MyPtr_>( This->findNode( c ) );
}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::getNode( char_type c )
{
	size_t index( policy_( c ));
	if (nodeMap_.is_null( index ))
	{
		//	create element
		MyPtr_ p = alloc_.allocate( 1 );
		new( p ) MyType_( c );
		nodeMap_.replace( index, p );
		return p;
	}
	else
	{
		MyRef_ r = nodeMap_[ index ];
		return (r.isGlyph( c ))
			? &r
			: r.searchNode( c, true );	// collision
	}
}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::searchNode( char_type c, bool force )
{
	NodeListIter_ pos = std::find_if( nodeList_.begin(), nodeList_.end(), MatchingPredicate( c ));
	return (pos == nodeList_.end())
		? ( force ? createNode( c ) : NULL )
		: pos.operator->();
}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::createNode( char_type c )
{
	BOOST_ASSERT( std::find_if( nodeList_.begin(), nodeList_.end(), MatchingPredicate( c )) == nodeList_.end());

	//	nothing found, create entry
	MyPtr_ p = alloc_.allocate( 1 );
	new( p ) MyType_( c );

	nodeList_.push_back( p );
#ifdef _DEBUG
	//std::cout
	//	<< "create node with: "
	//	<< c
	//	<< ", node list contains "
	//	<< nodeList_.size()
	//	<< " elements."
	//	<< std::endl;
#endif
	return p;

}

/**
*	Semantic is similar to STL - container empty() method.
*
*	@remark If the node array <code>nodeMap_</code> contains no pointers then 
*	<code>nodeList_</code> should not do also. And it should be a leaf.
*	Otherwise something is going wrong.
*
*	@return <code>true</code> if there are no outgoing node pointers.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::empty() const
{
	for (size_t idx = 0L; idx < Policy_::MAP_SIZE; idx++)
	{
		if (!nodeMap_.is_null( idx ))
		{
			return false;
		}
	}
	BOOST_ASSERT( nodeList_.empty());
	BOOST_ASSERT( nodeList_.isLeaf());
	return true;
}

/**
*	@return count of keywords starting from this point.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
size_t tag_tree< TString, TValue, THash, TReset, TAlloc >::leaf_count( bool branch ) const
{
	size_t count( isLeaf() ? 1L : 0L );
	for (size_t idx = 0L; idx < Policy_::MAP_SIZE; idx++)
	{
		if (!nodeMap_.is_null( idx ))
		{
			count += nodeMap_[ idx ].leaf_count( true );
		}
	}

	if (branch)
	{
		for (NodeListConstIter_ pos = nodeList_.begin(); pos != nodeList_.end(); ++pos)
		{
			count += pos->leaf_count( true );
		}
	}
	return count;
}

/**
*	Prints a plain list of all nodes and there attributes. Usefull for debugging purposes.
*
*	@param os an open output stream. std::cout e.g.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
void tag_tree< TString, TValue, THash, TReset, TAlloc >::dump( ostream_type_& os, size_t depth, size_t branch ) const
{
	os
		<< "dump( "
		<< depth
		<< "/"
		<< branch
		<< ": "
		<< getGlyph()
		<< " => "
		<< getValue()
		<< " )"
		<< std::endl
		;

	for (size_t idx = 0L; idx < Policy_::MAP_SIZE; idx++)
	{
		if (!nodeMap_.is_null( idx ))
		{
			nodeMap_[ idx ].dump( os, depth + 1, branch );
		}
	}

	for (NodeListConstIter_ pos = nodeList_.begin(); pos != nodeList_.end(); ++pos)
	{
		branch++;
		pos->dump( os, depth, branch );
	}
}

/**
*	Looks up for the specified keyword.
*
*	@param from An input iterator addressing the position of the first element in the keyword.
*	@param to An input iterator addressing the position that is one past the final element in the keyword.
*	@return value of the keyword. If the keyword does not exists the null value will
*	be returned.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::value_type tag_tree< TString, TValue, THash, TReset, TAlloc >::lookUp( char_const_iterator from, char_const_iterator to ) const
{
	//const MyType_* const prev( this );
	MyPtr_ prev = const_cast <MyPtr_> ( this );
	for (char_const_iterator pos = from; pos != to; ++pos)
	{
		MyPtr_ p = prev->findNode( *pos );
		if (p == NULL)
		{
			break;
		}
		prev = p;
	}
	return (prev != NULL)
		? prev->getValue()
		: value_type();
}

/**
*	Looks up for the specified keyword.
*
*	@param s The keyword.
*	@see lookUp( char_const_iterator from, char_const_iterator to ) const

*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::value_type tag_tree< TString, TValue, THash, TReset, TAlloc >::lookUp( const String_& s ) const
{
	return lookUp( s.begin(), s.end());
}

//	+-----------------------------------------------------------------+
//	| tag_tree::MatchingPredicate [definition]
//	+-----------------------------------------------------------------+
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::MatchingPredicate::MatchingPredicate( char_type c )
	: c_( c )
{}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::MatchingPredicate::MatchingPredicate( const MatchingPredicate& p )
	: c_( p.c_ )
{}

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::MatchingPredicate::operator()( const MyType_& node ) const
{
	return node.isGlyph( c_ );
}

//	+-----------------------------------------------------------------+
//	| tag_tree [_UNIT_TEST]
//	+-----------------------------------------------------------------+

#ifdef _UNIT_TEST
//#pragma warning( push )
//#pragma warning( disable : 4996 )	//	unsafe methods _SCL_SECURE_NO_WARNINGS

template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::unit_test()
{
	std::cout
		<< "hash map size: "
		<< algorithm::hash< TString >::MAP_SIZE
		<< std::endl
		;


	if (boost::is_same< String_, std::string >::value)
	{
		tag_tree < std::string, value_type > kt;

		//	with MAP_SIZE = 8 'P' and 'H' have overlapping indexes.
		std::string key1 = "POST", key2 = "HOST", key3 = "XONN";
		kt.insertKeyWord( key1.begin(), key1.end(), 17 );
		kt.insertKeyWord( key2, 21 );
		kt.insertKeyWord( key3, 22 );
		kt.insertKeyWord( "PONY", 2000 );
		kt.insertKeyWord( "POSTING", 2001 );
		kt.insertKeyWord( "POSTER", 2002 );
		kt.insertKeyWord( "PORT", 2003 );

		//	lookup
		value_type g = kt.lookUp( key2 );
		BOOST_ASSERT( g == 21 );
		value_type f = kt.lookUp( key1 );
		BOOST_ASSERT( f == 17 );
		BOOST_ASSERT( kt.lookUp( key3 ) == 22 );
		BOOST_ASSERT( kt.lookUp( "PONY" ) == 2000 );
		BOOST_ASSERT( kt.lookUp( "POSTING" ) == 2001 );
		BOOST_ASSERT( kt.lookUp( "POSTER" ) == 2002 );
		BOOST_ASSERT( kt.lookUp( "PORT" ) == 2003 );

		kt.dump( std::cout );
	}
	else if(boost::is_same< String_, std::wstring >::value)
	{
		tag_tree < std::wstring, value_type > kt;

		std::wstring key1 = L"POST", key2 = L"HOST", key3 = L"XONN";
		kt.insertKeyWord( key1.begin(), key1.end(), 17 );
		kt.insertKeyWord( key2, 21 );
		kt.insertKeyWord( key3, 22 );
		kt.insertKeyWord( L"PONY", 2000 );
		kt.insertKeyWord( L"POSTING", 2001 );
		kt.insertKeyWord( L"POSTER", 2002 );

		//	lookup
		value_type g = kt.lookUp( key2 );
		BOOST_ASSERT( g == 21 );
		value_type f = kt.lookUp( key1 );
		BOOST_ASSERT( f == 17 );
		BOOST_ASSERT( kt.lookUp( key3 ) == 22 );
		BOOST_ASSERT( kt.lookUp( L"PONY" ) == 2000 );
		BOOST_ASSERT( kt.lookUp( L"POSTING" ) == 2001 );
		BOOST_ASSERT( kt.lookUp( L"POSTER" ) == 2002 );

		kt.dump( std::wcout );
	}
	else
	{
//		BOOST_STATIC_ASSERT( false );
		BOOST_ASSERT( false );
	}

	return true;
}
//#pragma warning( pop )
#endif	//	_UNIT_TEST

}	//	namespace parser
}	//	namespace turban

#pragma warning( pop )

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

osy
Software Developer (Senior)
Germany Germany
No Biography provided

| Advertise | Privacy | Terms of Use | Mobile
Web02 | 2.8.150327.1 | Last Updated 28 Oct 2007
Article Copyright 2007 by osy
Everything else Copyright © CodeProject, 1999-2015
Layout: fixed | fluid