// Copyright � 2007, solosTec
// All rights reserved.
//---------------------------------------------------------------------
// License for redistribution is the GNU General Public License v2.
// See the included readme.txt for details.
//---------------------------------------------------------------------
// developed by solosTec
// http://www.solostec.de
//---------------------------------------------------------------------
//
// $Author: $
// $Revision: $
/* $Log: $
*/
//---------------------------------------------------------------------
/**
* Fast detecting of of tags or keywords.
*/
#pragma once
#include <boost/noncopyable.hpp>
#include <string>
#include <memory>
#include <boost/static_assert.hpp>
#include <boost/assert.hpp>
#include <boost/ptr_container/ptr_list.hpp>
#include <boost/ptr_container/ptr_array.hpp>
#include <iostream>
#include <algorithm>
#include <algorithm/hash.h>
#include <general/initialize.h>
#include <ostream>
#pragma warning( push )
#pragma warning( disable : 4996 ) // unsafe methods _SCL_SECURE_NO_WARNINGS
namespace turban {
namespace container {
// +-----------------------------------------------------------------+
// | tag_tree_base [declaration]
// +-----------------------------------------------------------------+
/**
* Define some <code>typedefs</code> and access methods for all node
* values.
* The <code>tag_tree_base</code> is not restricted to strings. You
* can use any string-like container which defines
* a <code>const_iterator</code> and a <code>value_type</code> type.
*
* @see turban::general::initialize
*/
template
<
typename TString,
typename TValue,
typename TReset
>
class tag_tree_base
: private boost::noncopyable
{
public:
typedef TString String_;
typedef typename String_::const_iterator char_const_iterator;
typedef typename String_::value_type char_type;
typedef std::basic_ostream< char_type > ostream_type_;
typedef TValue value_type;
typedef TReset ValueReset_;
public:
tag_tree_base( char_type );
~tag_tree_base();
value_type& getValue();
const value_type& getValue() const;
protected:
bool isGlyph( char_type ) const;
bool isLeaf() const;
char_type getGlyph() const;
bool resetValue( const value_type& );
private:
const char_type glyph_;
value_type value_;
bool leaf_; //! marks this node as leaf e.g. contains a valid value.
static ValueReset_ valueReset_;
};
// +-----------------------------------------------------------------+
// | tag_tree_base [initialization]
// +-----------------------------------------------------------------+
template < typename TString, typename TValue, typename TReset >
typename tag_tree_base< TString, TValue, TReset >::ValueReset_
tag_tree_base< TString, TValue, TReset >::valueReset_ = tag_tree_base< TString, TValue, TReset >::ValueReset_();
// +-----------------------------------------------------------------+
// | tag_tree_base [implementation]
// +-----------------------------------------------------------------+
/**
* @param c The character this node is representing.
*/
template < typename TString, typename TValue, typename TReset >
tag_tree_base< TString, TValue, TReset >::tag_tree_base( char_type c )
: glyph_( c )
, leaf_( false )
{
valueReset_( value_ );
#ifdef _DEBUG
//std::cout
// << "tag_tree_base( "
// << glyph_
// << " )"
// << std::endl
// ;
#endif
}
template < typename TString, typename TValue, typename TReset >
tag_tree_base< TString, TValue, TReset >::~tag_tree_base()
{
#ifdef _DEBUG
//std::cout
// << "clear: "
// << glyph_
// << " = "
// << value_
// << std::endl
// ;
#endif
}
template < typename TString, typename TValue, typename TReset >
bool tag_tree_base< TString, TValue, TReset >::isGlyph( char_type c ) const
{
return glyph_ == c;
}
template < typename TString, typename TValue, typename TReset >
bool tag_tree_base< TString, TValue, TReset >::isLeaf() const
{
return leaf_;
}
template < typename TString, typename TValue, typename TReset >
typename tag_tree_base< TString, TValue, TReset >::char_type tag_tree_base< TString, TValue, TReset >::getGlyph() const
{
return glyph_;
}
template < typename TString, typename TValue, typename TReset >
bool tag_tree_base< TString, TValue, TReset >::resetValue( const value_type& value )
{
if (!leaf_)
{
valueReset_( value_, value );
leaf_ = true;
return true;
}
return false;
}
/**
* @return value of this node.
*/
template < typename TString, typename TValue, typename TReset >
typename tag_tree_base< TString, TValue, TReset >::value_type& tag_tree_base< TString, TValue, TReset >::getValue()
{
return value_;
}
/**
* @see getValue()
*/
template < typename TString, typename TValue, typename TReset >
const typename tag_tree_base< TString, TValue, TReset >::value_type& tag_tree_base< TString, TValue, TReset >::getValue() const
{
return value_;
}
// +-----------------------------------------------------------------+
// | tag_tree [declaration]
// +-----------------------------------------------------------------+
/**
* tag_tree is desiged for usage in a streaming parser. tag_tree can
* help you to build large state engines in memory, since each node
* is representing a unique parser state.
*
* @see turban::general::initialize
* @see turban::algorithm::hash
*/
template
<
typename TString = std::string,
typename TValue = unsigned,
typename THash = algorithm::hash< TString >,
typename TReset = general::initialize< TValue >,
typename TAlloc = std::allocator< TString >
>
class tag_tree
: public tag_tree_base< TString, TValue, TReset >
{
public:
typedef tag_tree< TString, TValue, THash, TReset, TAlloc > MyType_;
typedef tag_tree_base< TString, TValue, TReset > MyBase_;
typedef MyType_ *MyPtr_;
typedef MyType_ &MyRef_;
typedef THash Policy_;
typedef typename TAlloc::template
rebind<MyType_>::other MyAlloc_;
struct MyHeapCloneAllocator
{
typedef MyType_ AllocOwner_;
template< class U >
static void deallocate_clone( const U* r )
{
boost::delete_clone( r );
BOOST_STATIC_ASSERT( false );
}
template<>
static void deallocate_clone<MyType_>( const MyType_* r )
{
MyType_*x = const_cast <MyType_*> (r);
AllocOwner_::alloc_.deallocate( x, 1 );
}
};
typedef boost::ptr_array< MyType_, Policy_::MAP_SIZE, MyHeapCloneAllocator > NodeMap_;
typedef typename NodeMap_::iterator NodeMapIter_;
typedef typename NodeMap_::const_iterator NodeMapConstIter_;
typedef boost::ptr_list< MyType_, MyHeapCloneAllocator, MyAlloc_ > NodeList_;
typedef typename NodeList_::iterator NodeListIter_;
typedef typename NodeList_::const_iterator NodeListConstIter_;
private:
class MatchingPredicate
{
public:
MatchingPredicate( char_type );
MatchingPredicate( const MatchingPredicate& );
bool operator()( const MyType_& ) const;
private:
char_type c_;
};
public:
tag_tree( char_type = std::string::npos );
~tag_tree();
bool insertKeyWord( char_const_iterator, char_const_iterator, const value_type& );
bool insertKeyWord( const String_&, const value_type& );
MyPtr_ findNode( char_type );
const MyPtr_ findNode( char_type ) const;
bool empty() const;
size_t leaf_count( bool = false ) const;
void dump( ostream_type_&, size_t = 0L, size_t = 0L ) const;
// the following methods are O(n) -> slow
value_type lookUp( char_const_iterator, char_const_iterator ) const;
value_type lookUp( const String_& ) const;
private:
MyPtr_ getNode( char_type );
MyPtr_ searchNode( char_type, bool );
MyPtr_ createNode( char_type );
private:
NodeMap_ nodeMap_;
NodeList_ nodeList_;
static Policy_ policy_;
static MyAlloc_ alloc_;
public:
#ifdef _UNIT_TEST
static bool unit_test();
#endif // _UNIT_TEST
};
// +-----------------------------------------------------------------+
// | tag_tree [initialization]
// +-----------------------------------------------------------------+
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::Policy_
tag_tree< TString, TValue, THash, TReset, TAlloc >::policy_;
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyAlloc_
tag_tree< TString, TValue, THash, TReset, TAlloc >::alloc_;
// +-----------------------------------------------------------------+
// | tag_tree [definition]
// +-----------------------------------------------------------------+
/**
* Setting all pointers to NULL is done by container.
*
* @param c The character this node is representing.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::tag_tree( char_type c )
: MyBase_( c )
{}
/**
* Deallocating of memory will be done by the containers.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::~tag_tree()
{
// free memory is done by container
}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::insertKeyWord( char_const_iterator from, char_const_iterator to, const value_type& value )
{
BOOST_ASSERT(from != to);
#ifdef _DEBUG
//std::cout
// << *from
// << std::endl;
#endif
MyPtr_ p = getNode( *from );
BOOST_ASSERT( p != NULL );
BOOST_ASSERT( p->isGlyph( *from ));
if ((from + 1) == to)
{
#ifdef _DEBUG
//std::cout
// << "insert value: "
// << *from
// << " = "
// << value
// << std::endl;
#endif
// insert value
return p->resetValue( value );
}
return p->insertKeyWord( ++from, to, value );
}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::insertKeyWord( const String_& s, const value_type& value )
{
#ifdef _DEBUG
//std::cout
// << "insert "
// << s.c_str()
// << std::endl
// ;
#endif
return insertKeyWord( s.begin(), s.end(), value );
}
/**
* @param c character to find.
* @return a pointer to a node containing the specified character <code>c</code> or NULL.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::findNode( char_type c )
{
size_t index( policy_( c ));
if (nodeMap_.is_null( index ))
{
return NULL;
}
MyRef_ r = nodeMap_[ index ];
return (r.isGlyph( c ))
? &r
: r.searchNode( c, false );
}
/**
* Calls the <code>non-const</code> variant of this method.
*
* @param c character to find.
* @return a pointer to a node containing the specified character <code>c</code> or NULL.
* @see findNode( char_type ) const
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
const typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::findNode( char_type c ) const
{
// prevent recursive calls
const MyPtr_ This( this );
return const_cast <MyPtr_>( This->findNode( c ) );
}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::getNode( char_type c )
{
size_t index( policy_( c ));
if (nodeMap_.is_null( index ))
{
// create element
MyPtr_ p = alloc_.allocate( 1 );
new( p ) MyType_( c );
nodeMap_.replace( index, p );
return p;
}
else
{
MyRef_ r = nodeMap_[ index ];
return (r.isGlyph( c ))
? &r
: r.searchNode( c, true ); // collision
}
}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::searchNode( char_type c, bool force )
{
NodeListIter_ pos = std::find_if( nodeList_.begin(), nodeList_.end(), MatchingPredicate( c ));
return (pos == nodeList_.end())
? ( force ? createNode( c ) : NULL )
: pos.operator->();
}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::MyPtr_ tag_tree< TString, TValue, THash, TReset, TAlloc >::createNode( char_type c )
{
BOOST_ASSERT( std::find_if( nodeList_.begin(), nodeList_.end(), MatchingPredicate( c )) == nodeList_.end());
// nothing found, create entry
MyPtr_ p = alloc_.allocate( 1 );
new( p ) MyType_( c );
nodeList_.push_back( p );
#ifdef _DEBUG
//std::cout
// << "create node with: "
// << c
// << ", node list contains "
// << nodeList_.size()
// << " elements."
// << std::endl;
#endif
return p;
}
/**
* Semantic is similar to STL - container empty() method.
*
* @remark If the node array <code>nodeMap_</code> contains no pointers then
* <code>nodeList_</code> should not do also. And it should be a leaf.
* Otherwise something is going wrong.
*
* @return <code>true</code> if there are no outgoing node pointers.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::empty() const
{
for (size_t idx = 0L; idx < Policy_::MAP_SIZE; idx++)
{
if (!nodeMap_.is_null( idx ))
{
return false;
}
}
BOOST_ASSERT( nodeList_.empty());
BOOST_ASSERT( nodeList_.isLeaf());
return true;
}
/**
* @return count of keywords starting from this point.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
size_t tag_tree< TString, TValue, THash, TReset, TAlloc >::leaf_count( bool branch ) const
{
size_t count( isLeaf() ? 1L : 0L );
for (size_t idx = 0L; idx < Policy_::MAP_SIZE; idx++)
{
if (!nodeMap_.is_null( idx ))
{
count += nodeMap_[ idx ].leaf_count( true );
}
}
if (branch)
{
for (NodeListConstIter_ pos = nodeList_.begin(); pos != nodeList_.end(); ++pos)
{
count += pos->leaf_count( true );
}
}
return count;
}
/**
* Prints a plain list of all nodes and there attributes. Usefull for debugging purposes.
*
* @param os an open output stream. std::cout e.g.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
void tag_tree< TString, TValue, THash, TReset, TAlloc >::dump( ostream_type_& os, size_t depth, size_t branch ) const
{
os
<< "dump( "
<< depth
<< "/"
<< branch
<< ": "
<< getGlyph()
<< " => "
<< getValue()
<< " )"
<< std::endl
;
for (size_t idx = 0L; idx < Policy_::MAP_SIZE; idx++)
{
if (!nodeMap_.is_null( idx ))
{
nodeMap_[ idx ].dump( os, depth + 1, branch );
}
}
for (NodeListConstIter_ pos = nodeList_.begin(); pos != nodeList_.end(); ++pos)
{
branch++;
pos->dump( os, depth, branch );
}
}
/**
* Looks up for the specified keyword.
*
* @param from An input iterator addressing the position of the first element in the keyword.
* @param to An input iterator addressing the position that is one past the final element in the keyword.
* @return value of the keyword. If the keyword does not exists the null value will
* be returned.
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::value_type tag_tree< TString, TValue, THash, TReset, TAlloc >::lookUp( char_const_iterator from, char_const_iterator to ) const
{
//const MyType_* const prev( this );
MyPtr_ prev = const_cast <MyPtr_> ( this );
for (char_const_iterator pos = from; pos != to; ++pos)
{
MyPtr_ p = prev->findNode( *pos );
if (p == NULL)
{
break;
}
prev = p;
}
return (prev != NULL)
? prev->getValue()
: value_type();
}
/**
* Looks up for the specified keyword.
*
* @param s The keyword.
* @see lookUp( char_const_iterator from, char_const_iterator to ) const
*/
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
typename tag_tree< TString, TValue, THash, TReset, TAlloc >::value_type tag_tree< TString, TValue, THash, TReset, TAlloc >::lookUp( const String_& s ) const
{
return lookUp( s.begin(), s.end());
}
// +-----------------------------------------------------------------+
// | tag_tree::MatchingPredicate [definition]
// +-----------------------------------------------------------------+
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::MatchingPredicate::MatchingPredicate( char_type c )
: c_( c )
{}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
tag_tree< TString, TValue, THash, TReset, TAlloc >::MatchingPredicate::MatchingPredicate( const MatchingPredicate& p )
: c_( p.c_ )
{}
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::MatchingPredicate::operator()( const MyType_& node ) const
{
return node.isGlyph( c_ );
}
// +-----------------------------------------------------------------+
// | tag_tree [_UNIT_TEST]
// +-----------------------------------------------------------------+
#ifdef _UNIT_TEST
//#pragma warning( push )
//#pragma warning( disable : 4996 ) // unsafe methods _SCL_SECURE_NO_WARNINGS
template < typename TString, typename TValue, typename THash, typename TReset, typename TAlloc >
bool tag_tree< TString, TValue, THash, TReset, TAlloc >::unit_test()
{
std::cout
<< "hash map size: "
<< algorithm::hash< TString >::MAP_SIZE
<< std::endl
;
if (boost::is_same< String_, std::string >::value)
{
tag_tree < std::string, value_type > kt;
// with MAP_SIZE = 8 'P' and 'H' have overlapping indexes.
std::string key1 = "POST", key2 = "HOST", key3 = "XONN";
kt.insertKeyWord( key1.begin(), key1.end(), 17 );
kt.insertKeyWord( key2, 21 );
kt.insertKeyWord( key3, 22 );
kt.insertKeyWord( "PONY", 2000 );
kt.insertKeyWord( "POSTING", 2001 );
kt.insertKeyWord( "POSTER", 2002 );
kt.insertKeyWord( "PORT", 2003 );
// lookup
value_type g = kt.lookUp( key2 );
BOOST_ASSERT( g == 21 );
value_type f = kt.lookUp( key1 );
BOOST_ASSERT( f == 17 );
BOOST_ASSERT( kt.lookUp( key3 ) == 22 );
BOOST_ASSERT( kt.lookUp( "PONY" ) == 2000 );
BOOST_ASSERT( kt.lookUp( "POSTING" ) == 2001 );
BOOST_ASSERT( kt.lookUp( "POSTER" ) == 2002 );
BOOST_ASSERT( kt.lookUp( "PORT" ) == 2003 );
kt.dump( std::cout );
}
else if(boost::is_same< String_, std::wstring >::value)
{
tag_tree < std::wstring, value_type > kt;
std::wstring key1 = L"POST", key2 = L"HOST", key3 = L"XONN";
kt.insertKeyWord( key1.begin(), key1.end(), 17 );
kt.insertKeyWord( key2, 21 );
kt.insertKeyWord( key3, 22 );
kt.insertKeyWord( L"PONY", 2000 );
kt.insertKeyWord( L"POSTING", 2001 );
kt.insertKeyWord( L"POSTER", 2002 );
// lookup
value_type g = kt.lookUp( key2 );
BOOST_ASSERT( g == 21 );
value_type f = kt.lookUp( key1 );
BOOST_ASSERT( f == 17 );
BOOST_ASSERT( kt.lookUp( key3 ) == 22 );
BOOST_ASSERT( kt.lookUp( L"PONY" ) == 2000 );
BOOST_ASSERT( kt.lookUp( L"POSTING" ) == 2001 );
BOOST_ASSERT( kt.lookUp( L"POSTER" ) == 2002 );
kt.dump( std::wcout );
}
else
{
// BOOST_STATIC_ASSERT( false );
BOOST_ASSERT( false );
}
return true;
}
//#pragma warning( pop )
#endif // _UNIT_TEST
} // namespace parser
} // namespace turban
#pragma warning( pop )