// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
// "A Multiple Substring Search Algorithm" in the June 2002
// edition of C/C++ Users Journal. This class is based on
// the algorthim therein described, but extended to return
// all strings and use MFC classes.
#include "StdAfx.H"
#include "StringSet.H"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( WORD wInitialWidth /*= 64*/ )
: m_apnFSM( new DWORD[wInitialWidth][128] ),
m_nCurDim( wInitialWidth ),
m_nUsedCols( 0 ),
m_wMaxUsedState( 0 ),
m_nCurTextChar( 0 )
{
// Initialize all FSM entries to zero
ZeroMemory( m_apnFSM, sizeof(DWORD) * 128 * wInitialWidth ) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
delete [] m_apnFSM ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
// several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
bool bRetVal = false ;
// Insert the word into the FSM, then add it to the array
if ( ( pszWord != NULL )
&& InsertWord( pszWord, (WORD)(size() + 1) )
)
{
std::string strWord(pszWord) ;
push_back( strWord ) ;
bRetVal = true ;
}
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add a string
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const std::string & rstrWord )
{
bool bRetVal = false ;
// Insert the word into the FSM, then add it to the array
if ( InsertWord( rstrWord.c_str(), (WORD)(size() + 1) ) )
{
push_back( rstrWord ) ;
bRetVal = true ;
}
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add multiple words, delimited with chars from pszDelims
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszWords, LPCTSTR pszDelims )
{
int nCount = 0 ;
// Start with the entire string
std::string strUnparsed( pszWords ),
strToken ;
// stop iterating when we find no more words following delimiters
while ( ! strUnparsed.empty() )
{
// Extract the first token
size_t nTokSize = _tcscspn( strUnparsed.c_str(), pszDelims ) ;
strToken = strUnparsed.substr( 0, nTokSize ) ;
if ( ! strToken.empty() )
{
if ( Add( strToken ) ) // add it to the FSM and the array
nCount++ ;
else
break ;
}
// Now, move beyond the token just extracted and the delimiters
int nTokenLen = strToken.length() ;
if ( nTokenLen < strUnparsed.length() )
{
int nDelimLen = _tcsspn( &strUnparsed[nTokenLen], pszDelims ) ;
nTokenLen += nDelimLen ;
if ( nTokenLen < strUnparsed.length() )
strUnparsed = strUnparsed.substr( nTokenLen ) ;
else
break ;
}
else
break ;
}
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszzWords, int nWords )
{
int nCount = 0 ;
// Start with the first word in the multi-string
LPCTSTR pszCurWord = pszzWords ;
if ( pszCurWord != NULL )
{
// stop iterating when we find a "word" beginning with null (the extra \0)
while ( pszCurWord[0] != '\0' )
{
if ( Add( pszCurWord ) ) // add it to the FSM and the array
{
nCount++ ;
// Position to the next word, one beyond the null terminator
size_t nLen = _tcslen( pszCurWord ) ;
// If this fails, the extra \0 was probably not there, as required
pszCurWord = &pszzWords[nLen+1] ;
}
else
break ;
}
}
#ifdef _DEBUG
if ( ( nWords != nCount )
&& ( nWords > 0 )
)
TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
UNUSED(nWords);
#endif
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of an array of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( std::vector<std::string> astrWords )
{
size_t nCount = astrWords.size() ;
reserve( size() + nCount ) ;
for ( UINT nWord = 0 ; nWord < nCount ; nWord++ )
{
if ( ! Add( astrWords[nWord] ) ) // add it to the FSM and the array
break ;
}
return nWord ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a list of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( std::list<std::string> lstrWords )
{
size_t nCount = lstrWords.size() ;
reserve( size() + nCount ) ;
// Iterate the string list
std::list<std::string>::iterator it = lstrWords.begin() ;
while ( it != lstrWords.end() )
{
if ( ! Add( *it++ ) ) // add it to the FSM and the array
break ;
}
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
// put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, WORD wIndex )
{
bool bRetVal = false ;
size_t nLen = _tcslen( pszWord ) ;
if ( ( m_wMaxUsedState < MAXWORD ) // if we've got 65,535 states, something's wrong
&& SetColDim( m_nUsedCols + nLen ) // make sure enough columns are allocated
)
{
WORD wCurState = 0 ;
for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
{
int nIdxChar = (pszWord[nChar] & 0x7F) ; // mask out char values above 127
DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ; // the state, and possibly also an index
WORD wState = LOWORD(dwEntry) ; // extract only the state portion
bool bNew = (wState == 0) ; // no previous words have been here
if ( bNew )
wState = ++m_wMaxUsedState ; // use a new state number
// special case - last character of substring
if ( nChar == ( nLen-1 ) )
{
// Put both the state number and the word index in the entry
m_apnFSM[wCurState][nIdxChar] = (DWORD)MAKELONG(wState, wIndex) ;
break ;
}
else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
m_apnFSM[wCurState][nIdxChar] = wState ;
wCurState = wState ; // prepare for next iteration
}
m_nUsedCols += nLen ;
bRetVal = true ;
}
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// SetColDim
// set the current width to at least nNewDim columns
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::SetColDim( size_t nNewDim )
{
bool bRetVal = false ;
// Determine if we don't yet have enough columns for the newest word
if ( m_nCurDim < nNewDim )
{
int nNewSize = ((nNewDim + 63) & 0xFFC0) ; // set to next higher multiple of 64
if ( nNewSize < MAXWORD ) // we're "limited" to 65,535 columns
{
// reallocate the FSM with the increased column width
DWORD (* apnTemp)[128] = new DWORD[nNewSize][128] ;
ASSERT((int)m_nUsedCols < nNewSize);
// Copy the portion already used, and zero out the rest
CopyMemory( &apnTemp[0], &m_apnFSM[0], sizeof(DWORD) * 128 * m_nUsedCols ) ;
ZeroMemory( &apnTemp[m_nUsedCols], sizeof(DWORD) * 128 * (nNewSize - m_nUsedCols) ) ;
delete [] m_apnFSM ; // Get rid of old FSM buffer,
m_apnFSM = apnTemp ; // and point to new one
m_nCurDim = (WORD)nNewSize ;
bRetVal = true ;
}
}
else
bRetVal = true ;
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindFirstIn( std::string strText, int & rnFirst )
{
// The real work is done by FindNext, but we must initialize the starting
// character index and string buffer first
m_nCurTextChar = 0 ;
m_strSearch = strText ;
return FindNext( rnFirst ) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindNext( int & rnNext )
{
WORD wCurState = 0 ; // start in state 0
UINT nIdx ;
UINT nStartChar = 0xFFFFFFFF ;
size_t nLen = m_strSearch.length() ;
// Start with the character following the last one examined
for ( nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
{
WORD wPrevState = wCurState ;
// if we are in state 0, save the index of the starting character,
// in case we must backtrack or for the return value
if ( wCurState == 0 )
nStartChar = nIdx ;
// follow up the table states
int nIdxChar = (m_strSearch[(int)nIdx] & 0x7F) ; // mask out char values above 127
DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;
// if we reach an entry with a high word (an index), we have found a full match
// return the word we found
if ( dwEntry > MAXWORD )
{
int nIndex = HIWORD(dwEntry) - 1 ;
ASSERT(nIndex < size());
rnNext = nIndex ;
m_nCurTextChar = nIdx + 1 ; // set the index for the next time
break ;
}
else
wCurState = LOWORD(dwEntry) ;
// have we followed a false lead?
// if so, backtrack nIdx to position to continue search, with state reset to zero
// set nIdx to nStartChar, and it will be incremented as the loop re-enters
if ( ( wPrevState != 0 )
&& ( wCurState == 0 )
)
{
nIdx = nStartChar ;
nStartChar = 0xFFFFFFFF ; // in case we're about to terminate the loop
}
}
// return the index of the start of the word, or -1 if no word found
return (( nIdx < nLen ) ? nStartChar : 0xFFFFFFFF) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::FindAllIn( std::string strText, CWordPosPairList & rlstrWords )
{
rlstrWords.clear() ;
// Iterate through all words, and put them in the list
int nWord = -1 ;
UINT nPos = FindFirstIn( strText, nWord ) ;
if ( nPos != 0xFFFFFFFF )
{
do
{
CWordPosPair posWord( nWord, nPos ) ;
rlstrWords.push_back( posWord ) ;
nPos = FindNext( nWord ) ;
} while ( nPos != 0xFFFFFFFF ) ;
}
return rlstrWords.size() ;
}