A Multiple Substring Search Class: CIVStringSet

Scot Brennecke

Rate me:

4.69/5 (6 votes)

25 Apr 2010CPOL4 min read

152.8K

2.1K

A string array class using MFC or STL that performs very fast multiple string searches

civstringset_stl_source.zip
- StringSet.Cpp
- StringSet.H
civstringset_demo.zip
- StringSetSample
  - res
    - StringSetSample.ico
    - StringSetSample.rc2
  - Resource.H
  - StdAFX.Cpp
  - StdAFX.H
  - StringSetSample.Cpp
  - StringSetSample.H
  - StringSetSample.rc
  - StringSetSample.sln
  - StringSetSample.vcproj
  - StringSetSampleDlg.Cpp
  - StringSetSampleDlg.H
  - TargetVer.H
civstringset_mfc_source.zip
- StringSet.Cpp
- StringSet.H
civstringset_source.zip
- IVCode
  - IVStringSet
    - MFC
      - StringSet.Cpp
      - StringSet.H
    - STL
      - StringSet.Cpp
      - StringSet.H
civstringset.zip
- Write Triangle
  - Test
    - QueryBuilder
      - StringSet.Cpp
      - StringSet.H

// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
//    "A Multiple Substring Search Algorithm" in the June 2002
//    edition of C/C++ Users Journal.  This class is based on
//    the algorthim therein described, but extended to return
//    all strings and use MFC classes.


#include "StdAfx.H"
#include "StringSet.H"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//                       C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( WORD wInitialWidth /*= 64*/ )
    : m_apnFSM( new DWORD[wInitialWidth][128] ),
      m_nCurDim( wInitialWidth ),
      m_nUsedCols( 0 ),
      m_wMaxUsedState( 0 ),
      m_nCurTextChar( 0 )
{
    // Initialize all FSM entries to zero
    ZeroMemory( m_apnFSM, sizeof(DWORD) * 128 * wInitialWidth ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
    delete [] m_apnFSM ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
//    several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( ( pszWord != NULL )
      && InsertWord( pszWord, (WORD)(m_nSize + 1) )
       )
        bRetVal = (CStringArray::Add( pszWord ) >= 0) ;

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add a single word from a CString (preserves reference counting)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const CString & rstrWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( InsertWord( (LPCTSTR)rstrWord, (WORD)(m_nSize + 1) ) )
        bRetVal = (CStringArray::Add( rstrWord ) >= 0) ;

    return bRetVal ;
}


//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( LPCTSTR pszzWords, int nWords )
{
    int nCount = 0 ;

    // Start with the first word in the multi-string
    LPCTSTR pszCurWord = pszzWords ;
    if ( pszCurWord != NULL )
    {
        // stop iterating when we find a "word" beginning with null (the extra \0)
        while ( pszCurWord[0] != '\0' )
        {
            if ( Add( pszCurWord ) )  // add it to the FSM and the array
            {
                nCount++ ;

                // Position to the next word, one beyond the null terminator
                size_t nLen = _tcslen( pszCurWord ) ;
                // If this fails, the extra \0 was probably not there, as required
                pszCurWord = &pszzWords[nLen+1] ;
            }
            else
                break ;
        }
    }

#ifdef _DEBUG
    if ( ( nWords != nCount )
      && ( nWords > 0 )
       )
        TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
    UNUSED(nWords);
#endif

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringArray
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( CStringArray astrWords )
{
    int nCount = astrWords.GetSize() ;

    for ( int nWord = 0 ; nWord < nCount ; nWord++ )
    {
        if ( ! Add( astrWords[nWord] ) )  // add it to the FSM and the array
            break ;
    }

    return nWord ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringList
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::Add( CStringList lstrWords )
{
    int nWords = lstrWords.GetCount() ;
    int nCount = 0 ;

    // Iterate the string list
    POSITION pos = lstrWords.GetHeadPosition() ;
    while ( pos != NULL )
    {
        if ( Add( lstrWords.GetNext( pos ) ) )  // add it to the FSM and the array
            nCount++ ;
        else
            break ;
    }

#ifdef _DEBUG
    if ( ( nWords != nCount )
      && ( nWords > 0 )
       )
        TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
    UNUSED(nWords);
#endif

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
//          put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, WORD wIndex )
{
    bool bRetVal = false ;

    size_t nLen = _tcslen( pszWord ) ;
    if ( ( m_wMaxUsedState < MAXWORD )    // if we've got 65,535 states, something's wrong
      && SetColDim( m_nUsedCols + nLen )  // make sure enough columns are allocated
       )
    {
        WORD wCurState = 0 ;
        for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
        {
            int nIdxChar = (pszWord[nChar] & 0x7F) ;  // mask out char values above 127
            DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;  // the state, and possibly also an index
            WORD wState = LOWORD(dwEntry) ;  // extract only the state portion
            bool bNew = (wState == 0) ;      // no previous words have been here

            if ( bNew )
                wState = ++m_wMaxUsedState ;  // use a new state number

            // special case - last character of substring
            if ( nChar == ( nLen-1 ) )
            {
                // Put both the state number and the word index in the entry
                m_apnFSM[wCurState][nIdxChar] = (DWORD)MAKELONG(wState, wIndex) ;
                break ;
            }
            else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
                m_apnFSM[wCurState][nIdxChar] = wState ;

            wCurState = wState ;  // prepare for next iteration
        }

        m_nUsedCols += nLen ;
        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// SetColDim
//          set the current width to at least nNewDim columns
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::SetColDim( size_t nNewDim )
{
    bool bRetVal = false ;

    // Determine if we don't yet have enough columns for the newest word 
    if ( m_nCurDim < nNewDim )
    {
        int nNewSize = ((nNewDim + 63) & 0xFFC0) ;  // set to next higher multiple of 64
        if ( nNewSize < MAXWORD )  // we're "limited" to 65,535 columns
        {
            // reallocate the FSM with the increased column width
            DWORD (* apnTemp)[128] = new DWORD[nNewSize][128] ;
            ASSERT((int)m_nUsedCols < nNewSize);
            // Copy the portion already used, and zero out the rest
            CopyMemory( &apnTemp[0], &m_apnFSM[0], sizeof(DWORD) * 128 * m_nUsedCols ) ;
            ZeroMemory( &apnTemp[m_nUsedCols], sizeof(DWORD) * 128 * (nNewSize - m_nUsedCols) ) ;
            delete [] m_apnFSM ; // Get rid of old FSM buffer,
            m_apnFSM = apnTemp ; // and point to new one
            m_nCurDim = (WORD)nNewSize ;

            bRetVal = true ;
        }
    }
    else
        bRetVal = true ;

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::FindFirstIn( CString strText, CString & rstrFirst )
{
    // The real work is done by FindNext, but we must initialize the starting
    //    character index and string buffer first
    m_nCurTextChar = 0 ;
    m_strSearch = strText ;

    return FindNext( rstrFirst ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::FindNext( CString & rstrNext )
{
    bool bRetVal = false ;

    WORD wCurState = 0 ; // start in state 0
    UINT nStartChar = 0 ;
    size_t nLen = m_strSearch.GetLength() ;

    // Start with the character following the last one examined
    for ( UINT nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
    {
        WORD wPrevState = wCurState ;

        // if we are in state 0, save a pointer to the starting character, in case we must backtrack
        if ( wCurState == 0 )
            nStartChar = nIdx ;

        // follow up the table states
        int nIdxChar = (m_strSearch[(int)nIdx] & 0x7F) ;  // mask out char values above 127
        DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;

        // if we reach an entry with a high word (an index), we have found a full match
        //    return the word we found
        if ( dwEntry > MAXWORD )
        {
            int nIndex = HIWORD(dwEntry) - 1 ;
            ASSERT(nIndex < GetSize());
            rstrNext = ElementAt( nIndex ) ;
            m_nCurTextChar = nIdx + 1 ;  // set the index for the next time
            bRetVal = true ;
            break ;
        }
        else
            wCurState = LOWORD(dwEntry) ;

        // have we followed a false lead?
        // if so, backtrack nIdx to position to continue search, with state reset to zero
        // set nIdx to nStartChar, and it will be incremented as the loop re-enters
        if ( ( wPrevState != 0 )
          && ( wCurState == 0 )
           )
            nIdx = nStartChar ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int CIVStringSet::FindAllIn( CString strText, CStringList & rlstrWords )
{
    rlstrWords.RemoveAll() ;

    // Iterate through all words, and put them in the list
    CString strWord ;
    if ( FindFirstIn( strText, strWord ) )
    {
        do
        {
            rlstrWords.AddTail( strWord ) ;
        } while ( FindNext( strWord ) ) ;
    }

    return rlstrWords.GetCount() ;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Written By

Scot Brennecke

Software Developer (Senior) Microsoft

United States

Scot is a Sr. Escalation Engineer for the Microsoft Developer Support VS & Languages team. He helps software developers who are Microsoft customers find bugs in their own, or Microsoft's, code.

Scot spends most of his time writing, reading, or thinking about C++ software, thereby classifying him as a geek.

A Multiple Substring Search Class: CIVStringSet

License

Comments and Discussions