|
// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
// "A Multiple Substring Search Algorithm" in the June 2002
// edition of C/C++ Users Journal. This class is based on
// the algorthim therein described, but extended to return
// all strings and use MFC classes.
#include "StdAfx.H"
#include "StringSet.H"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( WORD wInitialWidth /*= 64*/ )
: m_apnFSM( new DWORD[wInitialWidth][128] ),
m_nCurDim( wInitialWidth ),
m_nUsedCols( 0 ),
m_wMaxUsedState( 0 ),
m_nCurTextChar( 0 )
{
// Initialize all FSM entries to zero
ZeroMemory( m_apnFSM, sizeof(DWORD) * 128 * wInitialWidth ) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
delete [] m_apnFSM ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
// several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
bool bRetVal = false ;
// Insert the word into the FSM, then add it to the array
if ( ( pszWord != NULL )
&& InsertWord( pszWord, (WORD)(m_nSize + 1) )
)
bRetVal = (CStringArray::Add( pszWord ) >= 0) ;
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add a single word from a CString (preserves reference counting)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const CString & rstrWord )
{
bool bRetVal = false ;
// Insert the word into the FSM, then add it to the array
if ( InsertWord( (LPCTSTR)rstrWord, (WORD)(m_nSize + 1) ) )
bRetVal = (CStringArray::Add( rstrWord ) >= 0) ;
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszzWords, int nWords )
{
int nCount = 0 ;
// Start with the first word in the multi-string
LPCTSTR pszCurWord = pszzWords ;
if ( pszCurWord != NULL )
{
// stop iterating when we find a "word" beginning with null (the extra \0)
while ( pszCurWord[0] != '\0' )
{
if ( Add( pszCurWord ) ) // add it to the FSM and the array
{
nCount++ ;
// Position to the next word, one beyond the null terminator
size_t nLen = _tcslen( pszCurWord ) ;
// If this fails, the extra \0 was probably not there, as required
pszCurWord = &pszzWords[nLen+1] ;
}
else
break ;
}
}
#ifdef _DEBUG
if ( ( nWords != nCount )
&& ( nWords > 0 )
)
TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
UNUSED(nWords);
#endif
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringArray
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( CStringArray astrWords )
{
int nCount = astrWords.GetSize() ;
for ( int nWord = 0 ; nWord < nCount ; nWord++ )
{
if ( ! Add( astrWords[nWord] ) ) // add it to the FSM and the array
break ;
}
return nWord ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringList
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( CStringList lstrWords )
{
int nWords = lstrWords.GetCount() ;
int nCount = 0 ;
// Iterate the string list
POSITION pos = lstrWords.GetHeadPosition() ;
while ( pos != NULL )
{
if ( Add( lstrWords.GetNext( pos ) ) ) // add it to the FSM and the array
nCount++ ;
else
break ;
}
#ifdef _DEBUG
if ( ( nWords != nCount )
&& ( nWords > 0 )
)
TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
UNUSED(nWords);
#endif
return nCount ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
// put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, WORD wIndex )
{
bool bRetVal = false ;
size_t nLen = _tcslen( pszWord ) ;
if ( ( m_wMaxUsedState < MAXWORD ) // if we've got 65,535 states, something's wrong
&& SetColDim( m_nUsedCols + nLen ) // make sure enough columns are allocated
)
{
WORD wCurState = 0 ;
for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
{
int nIdxChar = (pszWord[nChar] & 0x7F) ; // mask out char values above 127
DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ; // the state, and possibly also an index
WORD wState = LOWORD(dwEntry) ; // extract only the state portion
bool bNew = (wState == 0) ; // no previous words have been here
if ( bNew )
wState = ++m_wMaxUsedState ; // use a new state number
// special case - last character of substring
if ( nChar == ( nLen-1 ) )
{
// Put both the state number and the word index in the entry
m_apnFSM[wCurState][nIdxChar] = (DWORD)MAKELONG(wState, wIndex) ;
break ;
}
else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
m_apnFSM[wCurState][nIdxChar] = wState ;
wCurState = wState ; // prepare for next iteration
}
m_nUsedCols += nLen ;
bRetVal = true ;
}
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// SetColDim
// set the current width to at least nNewDim columns
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::SetColDim( size_t nNewDim )
{
bool bRetVal = false ;
// Determine if we don't yet have enough columns for the newest word
if ( m_nCurDim < nNewDim )
{
int nNewSize = ((nNewDim + 63) & 0xFFC0) ; // set to next higher multiple of 64
if ( nNewSize < MAXWORD ) // we're "limited" to 65,535 columns
{
// reallocate the FSM with the increased column width
DWORD (* apnTemp)[128] = new DWORD[nNewSize][128] ;
ASSERT((int)m_nUsedCols < nNewSize);
// Copy the portion already used, and zero out the rest
CopyMemory( &apnTemp[0], &m_apnFSM[0], sizeof(DWORD) * 128 * m_nUsedCols ) ;
ZeroMemory( &apnTemp[m_nUsedCols], sizeof(DWORD) * 128 * (nNewSize - m_nUsedCols) ) ;
delete [] m_apnFSM ; // Get rid of old FSM buffer,
m_apnFSM = apnTemp ; // and point to new one
m_nCurDim = (WORD)nNewSize ;
bRetVal = true ;
}
}
else
bRetVal = true ;
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::FindFirstIn( CString strText, CString & rstrFirst )
{
// The real work is done by FindNext, but we must initialize the starting
// character index and string buffer first
m_nCurTextChar = 0 ;
m_strSearch = strText ;
return FindNext( rstrFirst ) ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::FindNext( CString & rstrNext )
{
bool bRetVal = false ;
WORD wCurState = 0 ; // start in state 0
UINT nStartChar = 0 ;
size_t nLen = m_strSearch.GetLength() ;
// Start with the character following the last one examined
for ( UINT nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
{
WORD wPrevState = wCurState ;
// if we are in state 0, save a pointer to the starting character, in case we must backtrack
if ( wCurState == 0 )
nStartChar = nIdx ;
// follow up the table states
int nIdxChar = (m_strSearch[(int)nIdx] & 0x7F) ; // mask out char values above 127
DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;
// if we reach an entry with a high word (an index), we have found a full match
// return the word we found
if ( dwEntry > MAXWORD )
{
int nIndex = HIWORD(dwEntry) - 1 ;
ASSERT(nIndex < GetSize());
rstrNext = ElementAt( nIndex ) ;
m_nCurTextChar = nIdx + 1 ; // set the index for the next time
bRetVal = true ;
break ;
}
else
wCurState = LOWORD(dwEntry) ;
// have we followed a false lead?
// if so, backtrack nIdx to position to continue search, with state reset to zero
// set nIdx to nStartChar, and it will be incremented as the loop re-enters
if ( ( wPrevState != 0 )
&& ( wCurState == 0 )
)
nIdx = nStartChar ;
}
return bRetVal ;
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::FindAllIn( CString strText, CStringList & rlstrWords )
{
rlstrWords.RemoveAll() ;
// Iterate through all words, and put them in the list
CString strWord ;
if ( FindFirstIn( strText, strWord ) )
{
do
{
rlstrWords.AddTail( strWord ) ;
} while ( FindNext( strWord ) ) ;
}
return rlstrWords.GetCount() ;
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.
Scot is a Sr. Escalation Engineer for the Microsoft Developer Support VS & Languages team. He helps software developers who are Microsoft customers find bugs in their own, or Microsoft's, code.
Scot spends most of his time writing, reading, or thinking about C++ software, thereby classifying him as a geek.