|
// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
// "A Multiple Substring Search Algorithm" in the June 2002
// edition of C/C++ Users Journal. This class is based on
// the algorthim therein described, but extended to return
// all strings and use MFC classes.
//
// Modified 24 Apr 2010 by Scot T Brennecke
#include "StdAfx.H"
#include "StringSet.H"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif
// This will dictate one dimension of the vectors, and can dramatically affect memory use
// Using 0x7F will limit to typical ANSI characters
const size_t c_nMaxChar = 0x7F ;
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( DWORD dwInitialWidth /*= 64*/ )
: m_apnFSM( dwInitialWidth, std::vector<SEntry>( c_nMaxChar ) ),
m_dwMaxUsedState( 0 ),
m_strDelims( _T(" \t\n\r,.!#*(){}[]?/\\\"'") ),
m_nCurTextChar( 0 )
{
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
// several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
bool bRetVal = false ;
// Insert the word into the FSM, then add it to the array
if ( ( pszWord != NULL )
&& InsertWord( pszWord, static_cast<DWORD>(m_nSize + 1) )
)
bRetVal = (CStringArray::Add( pszWord ) >= 0) ;
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add a single word from a CString (preserves reference counting)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const CString & rstrWord )
{
bool bRetVal = false ;
// Insert the word into the FSM, then add it to the array
if ( InsertWord( static_cast<LPCTSTR>(rstrWord), static_cast<DWORD>(m_nSize + 1) ) )
bRetVal = (CStringArray::Add( rstrWord ) >= 0) ;
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add multiple words, delimited with chars from pszDelims
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( LPCTSTR pszWords, LPCTSTR pszDelims )
{
size_t nCount = 0 ;
// Start with the entire string
CString strUnparsed( pszWords ),
strToken ;
if ( pszDelims == NULL )
pszDelims = static_cast<LPCTSTR>(m_strDelims) ;
// stop iterating when we find no more words following delimiters
while ( ! strUnparsed.IsEmpty() )
{
// Extract the first token
strToken = strUnparsed.SpanExcluding( pszDelims ) ;
if ( ! strToken.IsEmpty() )
{
if ( Add( strToken ) ) // add it to the FSM and the array
nCount++ ;
else
break ;
}
// Now, move beyond the token just extracted
int nTokenLen = strToken.GetLength() ;
if ( nTokenLen < strUnparsed.GetLength() )
strUnparsed = strUnparsed.Mid( nTokenLen ) ;
else
break ;
// Remove the delimiters
strUnparsed.TrimLeft( pszDelims ) ;
}
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( LPCTSTR pszzWords, size_t nWords )
{
size_t nCount = 0 ;
// Start with the first word in the multi-string
LPCTSTR pszCurWord = pszzWords ;
if ( pszCurWord != NULL )
{
// stop iterating when we find a "word" beginning with null (the extra \0)
while ( pszCurWord[0] != _T('\0') )
{
if ( Add( pszCurWord ) ) // add it to the FSM and the array
{
nCount++ ;
// Position to the next word, one beyond the null terminator
size_t nLen = _tcslen( pszCurWord ) ;
// If this fails, the extra \0 was probably not there, as required
pszCurWord = &pszzWords[nLen+1] ;
}
else
break ;
}
}
#ifdef _DEBUG
if ( ( nWords != nCount )
&& ( nWords > 0 )
)
TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
UNUSED(nWords);
#endif
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a set of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( const std::set<CString> & sstrWords )
{
size_t nWord = 0 ;
// Iterate the string set
std::set<CString>::const_iterator it = sstrWords.begin() ;
while ( it != sstrWords.end() )
{
if ( ! Add( *it++ ) ) // add it to the FSM and the array
break ;
nWord++ ;
}
return nWord ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringArray
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( const CStringArray & astrWords )
{
size_t nWord = 0 ;
size_t nCount = static_cast<size_t>(astrWords.GetSize()) ;
SetSize( m_nSize + nCount ) ;
for ( ; nWord < nCount ; nWord++ )
{
if ( ! Add( astrWords[nWord] ) ) // add it to the FSM and the array
break ;
nWord++ ;
}
return nWord ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringList
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( const CStringList & lstrWords )
{
size_t nWord = 0 ;
size_t nCount = static_cast<size_t>(lstrWords.GetCount()) ;
SetSize( m_nSize + nCount ) ;
// Iterate the string list
POSITION pos = lstrWords.GetHeadPosition() ;
while ( pos != NULL )
{
if ( ! Add( lstrWords.GetNext( pos ) ) ) // add it to the FSM and the array
break ;
nWord++ ;
}
return nWord ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
// put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, DWORD dwIndex )
{
bool bRetVal = false ;
size_t nLen = _tcslen( pszWord ) ;
if ( m_dwMaxUsedState < MAXDWORD ) // if we've got over 4 billion states, something's wrong
{
DWORD dwCurState = 0UL ;
for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
{
size_t nIdxChar = min(static_cast<size_t>(pszWord[nChar]), c_nMaxChar) ;
SEntry & rEntry = m_apnFSM[dwCurState][nIdxChar] ; // the state, and possibly also an index
DWORD dwState = rEntry.m_dwState ;
bool bNew = (dwState == 0UL) ; // no previous words have been here
if ( bNew )
dwState = ++m_dwMaxUsedState ; // use a new state number
// special case - last character of substring
if ( nChar == ( nLen-1 ) )
{
// Put both the state number and the word index in the entry
rEntry.m_dwState = dwState ;
rEntry.m_dwIndex = dwIndex ;
break ;
}
else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
rEntry.m_dwState = dwState ;
dwCurState = dwState ; // prepare for next iteration
if ( m_apnFSM.size() <= dwCurState )
m_apnFSM.resize( ((dwCurState * 2) + 63) & 0xFFC0, std::vector<SEntry>( c_nMaxChar ) ) ;
}
bRetVal = true ;
}
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindFirstIn( CString strText, size_t & rnFirst )
{
// The real work is done by FindNext, but we must initialize the starting
// character index and string buffer first
m_nCurTextChar = 0U ;
m_strSearch = strText ;
return FindNext( rnFirst ) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindNext( size_t & rnNext )
{
DWORD dwCurState = 0UL ; // start in state 0
UINT nIdx ;
UINT nStartChar = MAXDWORD ;
size_t nLen = m_strSearch.GetLength() ;
LPCTSTR pszSearch = static_cast<LPCTSTR>(m_strSearch) ;
// Start with the character following the last one examined
for ( nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
{
// Skip over delimiters
size_t nNextIdx = _tcsspn( &pszSearch[nIdx], m_strDelims ) ;
if ( nNextIdx != 0 )
{
nStartChar = MAXDWORD ;
if ( nIdx + nNextIdx < nLen )
{
dwCurState = 0UL ;
nIdx += nNextIdx ;
}
else
break ;
}
// if we are in state 0, save the index of the starting character,
// in case we must backtrack or for the return value
if ( dwCurState == 0UL )
nStartChar = nIdx ;
// follow up the table states
size_t nIdxChar = min(static_cast<size_t>(pszSearch[nIdx]), c_nMaxChar) ;
const SEntry & rEntry = m_apnFSM[dwCurState][nIdxChar] ;
// if we reach an entry with a an index, we may have found a full match
// if so, return the word we found
if ( ( rEntry.m_dwIndex != 0UL )
&& ( ( nIdx == nLen-1 ) // are we at the end of the search string?
|| ( m_strDelims.Find( pszSearch[nIdx+1] ) != -1 ) // or at a delimiter?
)
)
{
size_t nIndex = rEntry.m_dwIndex - 1 ;
ASSERT(nIndex < static_cast<size_t>(GetSize()));
rnNext = nIndex ;
m_nCurTextChar = nIdx + 1 ; // set the index for the next time
break ;
}
else
dwCurState = rEntry.m_dwState ;
if ( dwCurState == 0 )
{
nStartChar = MAXDWORD ; // in case we're about to terminate the loop
nIdx += _tcscspn( &pszSearch[nIdx], m_strDelims ) ; // advance to the next delimiter
}
}
// return the index of the start of the word, or -1 if no word found
return (( nIdx < nLen ) ? nStartChar : MAXDWORD) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::FindAllIn( CString strText, CWordPosPairList & rlstrWords )
{
rlstrWords.clear() ;
// Iterate through all words, and put them in the list
size_t nWord = static_cast<size_t>(-1) ;
UINT nPos = FindFirstIn( strText, nWord ) ;
if ( nPos != MAXDWORD )
{
do
{
CWordPosPair posWord( nWord, nPos ) ;
rlstrWords.push_back( posWord ) ;
nPos = FindNext( nWord ) ;
} while ( nPos != MAXDWORD ) ;
}
return rlstrWords.size() ;
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.
Scot is a Sr. Escalation Engineer for the Microsoft Developer Support VS & Languages team. He helps software developers who are Microsoft customers find bugs in their own, or Microsoft's, code.
Scot spends most of his time writing, reading, or thinking about C++ software, thereby classifying him as a geek.