Click here to Skip to main content
15,867,568 members
Articles / Desktop Programming / MFC

A Multiple Substring Search Class: CIVStringSet

Rate me:
Please Sign up or sign in to vote.
4.69/5 (6 votes)
25 Apr 2010CPOL4 min read 152.3K   2.1K   37  
A string array class using MFC or STL that performs very fast multiple string searches
// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
//    "A Multiple Substring Search Algorithm" in the June 2002
//    edition of C/C++ Users Journal.  This class is based on
//    the algorthim therein described, but extended to return
//    all strings and use MFC classes.


#include "StdAfx.H"
#include "StringSet.H"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//                       C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( WORD wInitialWidth /*= 64*/ )
    : m_apnFSM( new DWORD[wInitialWidth][128] ),
      m_nCurDim( wInitialWidth ),
      m_nUsedCols( 0 ),
      m_wMaxUsedState( 0 ),
      m_nCurTextChar( 0 )
{
    // Initialize all FSM entries to zero
    ZeroMemory( m_apnFSM, sizeof(DWORD) * 128 * wInitialWidth ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
    delete [] m_apnFSM ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
//    several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( ( pszWord != NULL )
      && InsertWord( pszWord, (WORD)(size() + 1) )
       )
    {
        std::string strWord(pszWord) ;
        push_back( strWord ) ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add a string
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const std::string & rstrWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( InsertWord( rstrWord.c_str(), (WORD)(size() + 1) ) )
    {
        push_back( rstrWord ) ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add multiple words, delimited with chars from pszDelims
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszWords, LPCTSTR pszDelims )
{
    int nCount = 0 ;

    // Start with the entire string
    std::string strUnparsed( pszWords ),
                strToken ;

    // stop iterating when we find no more words following delimiters
    while ( ! strUnparsed.empty() )
    {
        // Extract the first token
        size_t nTokSize = _tcscspn( strUnparsed.c_str(), pszDelims ) ;
        strToken = strUnparsed.substr( 0, nTokSize ) ;
        if ( ! strToken.empty() )
        {
            if ( Add( strToken ) )  // add it to the FSM and the array
                nCount++ ;
            else
                break ;
        }

        // Now, move beyond the token just extracted and the delimiters
        int nTokenLen = strToken.length() ;
        if ( nTokenLen < strUnparsed.length() )
        {
            int nDelimLen = _tcsspn( &strUnparsed[nTokenLen], pszDelims ) ;
            nTokenLen += nDelimLen ;
            if ( nTokenLen < strUnparsed.length() )
                strUnparsed = strUnparsed.substr( nTokenLen ) ;
            else
                break ;
        }
        else
            break ;
    }

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszzWords, int nWords )
{
    int nCount = 0 ;

    // Start with the first word in the multi-string
    LPCTSTR pszCurWord = pszzWords ;
    if ( pszCurWord != NULL )
    {
        // stop iterating when we find a "word" beginning with null (the extra \0)
        while ( pszCurWord[0] != '\0' )
        {
            if ( Add( pszCurWord ) )  // add it to the FSM and the array
            {
                nCount++ ;

                // Position to the next word, one beyond the null terminator
                size_t nLen = _tcslen( pszCurWord ) ;
                // If this fails, the extra \0 was probably not there, as required
                pszCurWord = &pszzWords[nLen+1] ;
            }
            else
                break ;
        }
    }

#ifdef _DEBUG
    if ( ( nWords != nCount )
      && ( nWords > 0 )
       )
        TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
    UNUSED(nWords);
#endif

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of an array of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( std::vector<std::string> astrWords )
{
    size_t nCount = astrWords.size() ;
    reserve( size() + nCount ) ;

    for ( UINT nWord = 0 ; nWord < nCount ; nWord++ )
    {
        if ( ! Add( astrWords[nWord] ) )  // add it to the FSM and the array
            break ;
    }

    return nWord ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a list of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( std::list<std::string> lstrWords )
{
    size_t nCount = lstrWords.size() ;
    reserve( size() + nCount ) ;

    // Iterate the string list
    std::list<std::string>::iterator it = lstrWords.begin() ;
    while ( it != lstrWords.end() )
    {
        if ( ! Add( *it++ ) )  // add it to the FSM and the array
            break ;
    }

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
//          put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, WORD wIndex )
{
    bool bRetVal = false ;

    size_t nLen = _tcslen( pszWord ) ;
    if ( ( m_wMaxUsedState < MAXWORD )    // if we've got 65,535 states, something's wrong
      && SetColDim( m_nUsedCols + nLen )  // make sure enough columns are allocated
       )
    {
        WORD wCurState = 0 ;
        for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
        {
            int nIdxChar = (pszWord[nChar] & 0x7F) ;  // mask out char values above 127
            DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;  // the state, and possibly also an index
            WORD wState = LOWORD(dwEntry) ;  // extract only the state portion
            bool bNew = (wState == 0) ;      // no previous words have been here

            if ( bNew )
                wState = ++m_wMaxUsedState ;  // use a new state number

            // special case - last character of substring
            if ( nChar == ( nLen-1 ) )
            {
                // Put both the state number and the word index in the entry
                m_apnFSM[wCurState][nIdxChar] = (DWORD)MAKELONG(wState, wIndex) ;
                break ;
            }
            else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
                m_apnFSM[wCurState][nIdxChar] = wState ;

            wCurState = wState ;  // prepare for next iteration
        }

        m_nUsedCols += nLen ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// SetColDim
//          set the current width to at least nNewDim columns
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::SetColDim( size_t nNewDim )
{
    bool bRetVal = false ;

    // Determine if we don't yet have enough columns for the newest word 
    if ( m_nCurDim < nNewDim )
    {
        int nNewSize = ((nNewDim + 63) & 0xFFC0) ;  // set to next higher multiple of 64
        if ( nNewSize < MAXWORD )  // we're "limited" to 65,535 columns
        {
            // reallocate the FSM with the increased column width
            DWORD (* apnTemp)[128] = new DWORD[nNewSize][128] ;
            ASSERT((int)m_nUsedCols < nNewSize);
            // Copy the portion already used, and zero out the rest
            CopyMemory( &apnTemp[0], &m_apnFSM[0], sizeof(DWORD) * 128 * m_nUsedCols ) ;
            ZeroMemory( &apnTemp[m_nUsedCols], sizeof(DWORD) * 128 * (nNewSize - m_nUsedCols) ) ;
            delete [] m_apnFSM ; // Get rid of old FSM buffer,
            m_apnFSM = apnTemp ; // and point to new one
            m_nCurDim = (WORD)nNewSize ;

            bRetVal = true ;
        }
    }
    else
        bRetVal = true ;

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindFirstIn( std::string strText, int & rnFirst )
{
    // The real work is done by FindNext, but we must initialize the starting
    //    character index and string buffer first
    m_nCurTextChar = 0 ;
    m_strSearch = strText ;

    return FindNext( rnFirst ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindNext( int & rnNext )
{
    WORD wCurState = 0 ; // start in state 0
    UINT nIdx ;
    UINT nStartChar = 0xFFFFFFFF ;
    size_t nLen = m_strSearch.length() ;

    // Start with the character following the last one examined
    for ( nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
    {
        WORD wPrevState = wCurState ;

        // if we are in state 0, save the index of the starting character,
        //   in case we must backtrack or for the return value
        if ( wCurState == 0 )
            nStartChar = nIdx ;

        // follow up the table states
        int nIdxChar = (m_strSearch[(int)nIdx] & 0x7F) ;  // mask out char values above 127
        DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;

        // if we reach an entry with a high word (an index), we have found a full match
        //    return the word we found
        if ( dwEntry > MAXWORD )
        {
            int nIndex = HIWORD(dwEntry) - 1 ;
            ASSERT(nIndex < size());
            rnNext = nIndex ;
            m_nCurTextChar = nIdx + 1 ;  // set the index for the next time
            break ;
        }
        else
            wCurState = LOWORD(dwEntry) ;

        // have we followed a false lead?
        // if so, backtrack nIdx to position to continue search, with state reset to zero
        // set nIdx to nStartChar, and it will be incremented as the loop re-enters
        if ( ( wPrevState != 0 )
          && ( wCurState == 0 )
           )
        {
            nIdx = nStartChar ;
            nStartChar = 0xFFFFFFFF ;  // in case we're about to terminate the loop
        }
    }

    // return the index of the start of the word, or -1 if no word found
    return (( nIdx < nLen ) ? nStartChar : 0xFFFFFFFF) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::FindAllIn( std::string strText, CWordPosPairList & rlstrWords )
{
    rlstrWords.clear() ;

    // Iterate through all words, and put them in the list
    int  nWord = -1 ;
    UINT nPos = FindFirstIn( strText, nWord ) ;
    if ( nPos != 0xFFFFFFFF )
    {
        do
        {
            CWordPosPair posWord( nWord, nPos ) ;
            rlstrWords.push_back( posWord ) ;
            nPos = FindNext( nWord ) ;
        } while ( nPos != 0xFFFFFFFF ) ;
    }

    return rlstrWords.size() ;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer (Senior) Microsoft
United States United States
Scot is a Sr. Escalation Engineer for the Microsoft Developer Support VS & Languages team. He helps software developers who are Microsoft customers find bugs in their own, or Microsoft's, code.

Scot spends most of his time writing, reading, or thinking about C++ software, thereby classifying him as a geek.

Comments and Discussions