A Multiple Substring Search Class: CIVStringSet

Scot Brennecke

Rate me:

4.69/5 (6 votes)

25 Apr 2010CPOL4 min read

153.3K

2.1K

A string array class using MFC or STL that performs very fast multiple string searches

civstringset_stl_source.zip
- StringSet.Cpp
- StringSet.H
civstringset_demo.zip
- StringSetSample
  - res
    - StringSetSample.ico
    - StringSetSample.rc2
  - Resource.H
  - StdAFX.Cpp
  - StdAFX.H
  - StringSetSample.Cpp
  - StringSetSample.H
  - StringSetSample.rc
  - StringSetSample.sln
  - StringSetSample.vcproj
  - StringSetSampleDlg.Cpp
  - StringSetSampleDlg.H
  - TargetVer.H
civstringset_mfc_source.zip
- StringSet.Cpp
- StringSet.H
civstringset_source.zip
- IVCode
  - IVStringSet
    - MFC
      - StringSet.Cpp
      - StringSet.H
    - STL
      - StringSet.Cpp
      - StringSet.H
civstringset.zip
- Write Triangle
  - Test
    - QueryBuilder
      - StringSet.Cpp
      - StringSet.H

// StringSet.Cpp: implementation of the CIVStringSet class.
//
// Written 12 June 2002 by Scot T Brennecke
// Thanks to Moishe Halibard and Moshe Rubin for their article,
//    "A Multiple Substring Search Algorithm" in the June 2002
//    edition of C/C++ Users Journal.  This class is based on
//    the algorthim therein described, but extended to return
//    all strings and use MFC classes.
//
// Modified 24 Apr 2010 by Scot T Brennecke

#include "StdAfx.H"
#include "StringSet.H"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__ ;
#define new DEBUG_NEW
#endif

// This will dictate one dimension of the vectors, and can dramatically affect memory use
// Using 0x7F will limit to typical ANSI characters
const size_t c_nMaxChar = 0x7F ;

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
//                       C I V S t r i n g S e t
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::CIVStringSet( DWORD dwInitialWidth /*= 64*/ )
  : m_apnFSM( dwInitialWidth, std::vector<SEntry>( c_nMaxChar ) ),
    m_dwMaxUsedState( 0 ),
    m_strDelims( _T(" \t\n\r,.!#*(){}[]?/\\\"'") ),
    m_nCurTextChar( 0 )
{
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~CIVStringSet
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CIVStringSet::~CIVStringSet()
{
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add
//    several variations, the first being the most basic
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( LPCTSTR pszWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( ( pszWord != NULL )
      && InsertWord( pszWord, static_cast<DWORD>(m_nSize + 1) )
       )
        bRetVal = (CStringArray::Add( pszWord ) >= 0) ;

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add a single word from a CString (preserves reference counting)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::Add( const CString & rstrWord )
{
    bool bRetVal = false ;

    // Insert the word into the FSM, then add it to the array
    if ( InsertWord( static_cast<LPCTSTR>(rstrWord), static_cast<DWORD>(m_nSize + 1) ) )
        bRetVal = (CStringArray::Add( rstrWord ) >= 0) ;

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Add multiple words, delimited with chars from pszDelims
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( LPCTSTR pszWords, LPCTSTR pszDelims )
{
    size_t nCount = 0 ;

    // Start with the entire string
    CString strUnparsed( pszWords ),
            strToken ;

    if ( pszDelims == NULL )
        pszDelims = static_cast<LPCTSTR>(m_strDelims) ;

    // stop iterating when we find no more words following delimiters
    while ( ! strUnparsed.IsEmpty() )
    {
        // Extract the first token
        strToken = strUnparsed.SpanExcluding( pszDelims ) ;
        if ( ! strToken.IsEmpty() )
        {
            if ( Add( strToken ) )  // add it to the FSM and the array
                nCount++ ;
            else
                break ;
        }

        // Now, move beyond the token just extracted
        int nTokenLen = strToken.GetLength() ;
        if ( nTokenLen < strUnparsed.GetLength() )
            strUnparsed = strUnparsed.Mid( nTokenLen ) ;
        else
            break ;

        // Remove the delimiters
        strUnparsed.TrimLeft( pszDelims ) ;
    }

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add nWords words, each \0 term'd, with extra \0 at end
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( LPCTSTR pszzWords, size_t nWords )
{
    size_t nCount = 0 ;

    // Start with the first word in the multi-string
    LPCTSTR pszCurWord = pszzWords ;
    if ( pszCurWord != NULL )
    {
        // stop iterating when we find a "word" beginning with null (the extra \0)
        while ( pszCurWord[0] != _T('\0') )
        {
            if ( Add( pszCurWord ) )  // add it to the FSM and the array
            {
                nCount++ ;

                // Position to the next word, one beyond the null terminator
                size_t nLen = _tcslen( pszCurWord ) ;
                // If this fails, the extra \0 was probably not there, as required
                pszCurWord = &pszzWords[nLen+1] ;
            }
            else
                break ;
        }
    }

#ifdef _DEBUG
    if ( ( nWords != nCount )
      && ( nWords > 0 )
       )
        TRACE("Expected %d words, but found %d!\n", nWords, nCount);
#else
    UNUSED(nWords);
#endif

    return nCount ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a set of strings
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( const std::set<CString> & sstrWords )
{
    size_t nWord = 0 ;

    // Iterate the string set
    std::set<CString>::const_iterator it = sstrWords.begin() ;
    while ( it != sstrWords.end() )
    {
        if ( ! Add( *it++ ) )  // add it to the FSM and the array
            break ;
        nWord++ ;
    }

    return nWord ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringArray
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( const CStringArray & astrWords )
{
    size_t nWord = 0 ;

    size_t nCount = static_cast<size_t>(astrWords.GetSize()) ;
    SetSize( m_nSize + nCount ) ;

    for ( ; nWord < nCount ; nWord++ )
    {
        if ( ! Add( astrWords[nWord] ) )  // add it to the FSM and the array
            break ;
        nWord++ ;
    }

    return nWord ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// add all the elements of a CStringList
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::Add( const CStringList & lstrWords )
{
    size_t nWord = 0 ;

    size_t nCount = static_cast<size_t>(lstrWords.GetCount()) ;
    SetSize( m_nSize + nCount ) ;

    // Iterate the string list
    POSITION pos = lstrWords.GetHeadPosition() ;
    while ( pos != NULL )
    {
        if ( ! Add( lstrWords.GetNext( pos ) ) )  // add it to the FSM and the array
            break ;
        nWord++ ;
    }

    return nWord ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// InsertWord
//          put the new word into the FSM
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
bool CIVStringSet::InsertWord( LPCTSTR pszWord, DWORD dwIndex )
{
    bool bRetVal = false ;

    size_t nLen = _tcslen( pszWord ) ;
    if ( m_dwMaxUsedState < MAXDWORD )  // if we've got over 4 billion states, something's wrong
    {
        DWORD dwCurState = 0UL ;
        for ( UINT nChar = 0 ; nChar < nLen ; nChar++ )
        {
            size_t   nIdxChar = min(static_cast<size_t>(pszWord[nChar]), c_nMaxChar) ;
            SEntry & rEntry   = m_apnFSM[dwCurState][nIdxChar] ;  // the state, and possibly also an index
            DWORD    dwState  = rEntry.m_dwState ;
            bool     bNew     = (dwState == 0UL) ;                // no previous words have been here

            if ( bNew )
                dwState = ++m_dwMaxUsedState ;  // use a new state number

            // special case - last character of substring
            if ( nChar == ( nLen-1 ) )
            {
                // Put both the state number and the word index in the entry
                rEntry.m_dwState = dwState ;
                rEntry.m_dwIndex = dwIndex ;
                break ;
            }
            else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM
                rEntry.m_dwState = dwState ;

            dwCurState = dwState ;  // prepare for next iteration
            if ( m_apnFSM.size() <= dwCurState )
                m_apnFSM.resize( ((dwCurState * 2) + 63) & 0xFFC0, std::vector<SEntry>( c_nMaxChar ) ) ;
        }

        bRetVal = true ;
    }

    return bRetVal ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindFirstIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindFirstIn( CString strText, size_t & rnFirst )
{
    // The real work is done by FindNext, but we must initialize the starting
    //    character index and string buffer first
    m_nCurTextChar = 0U ;
    m_strSearch = strText ;

    return FindNext( rnFirst ) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindNext
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
UINT CIVStringSet::FindNext( size_t & rnNext )
{
    DWORD dwCurState = 0UL ; // start in state 0
    UINT nIdx ;
    UINT nStartChar = MAXDWORD ;
    size_t nLen = m_strSearch.GetLength() ;
    LPCTSTR pszSearch = static_cast<LPCTSTR>(m_strSearch) ;

    // Start with the character following the last one examined
    for ( nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ )
    {
        // Skip over delimiters
        size_t nNextIdx = _tcsspn( &pszSearch[nIdx], m_strDelims ) ;
        if ( nNextIdx != 0 )
        {
            nStartChar = MAXDWORD ;
            if ( nIdx + nNextIdx < nLen )
            {
                dwCurState = 0UL ;
                nIdx += nNextIdx ;
            }
            else
                break ;
        }

        // if we are in state 0, save the index of the starting character,
        //   in case we must backtrack or for the return value
        if ( dwCurState == 0UL )
            nStartChar = nIdx ;

        // follow up the table states
        size_t nIdxChar = min(static_cast<size_t>(pszSearch[nIdx]), c_nMaxChar) ;
        const SEntry & rEntry = m_apnFSM[dwCurState][nIdxChar] ;

        // if we reach an entry with a an index, we may have found a full match
        //    if so, return the word we found
        if ( ( rEntry.m_dwIndex != 0UL )
          && ( ( nIdx == nLen-1 )  // are we at the end of the search string?
            || ( m_strDelims.Find( pszSearch[nIdx+1] ) != -1 ) // or at a delimiter?
             )
           )
        {
            size_t nIndex = rEntry.m_dwIndex - 1 ;
            ASSERT(nIndex < static_cast<size_t>(GetSize()));
            rnNext = nIndex ;
            m_nCurTextChar = nIdx + 1 ;  // set the index for the next time
            break ;
        }
        else
            dwCurState = rEntry.m_dwState ;

        if ( dwCurState == 0 )
        {
            nStartChar = MAXDWORD ;  // in case we're about to terminate the loop
            nIdx += _tcscspn( &pszSearch[nIdx], m_strDelims ) ; // advance to the next delimiter
        }
    }

    // return the index of the start of the word, or -1 if no word found
    return (( nIdx < nLen ) ? nStartChar : MAXDWORD) ;
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// FindAllIn
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t CIVStringSet::FindAllIn( CString strText, CWordPosPairList & rlstrWords )
{
    rlstrWords.clear() ;

    // Iterate through all words, and put them in the list
    size_t nWord = static_cast<size_t>(-1) ;
    UINT nPos = FindFirstIn( strText, nWord ) ;
    if ( nPos != MAXDWORD )
    {
        do
        {
            CWordPosPair posWord( nWord, nPos ) ;
            rlstrWords.push_back( posWord ) ;
            nPos = FindNext( nWord ) ;
        } while ( nPos != MAXDWORD ) ;
    }

    return rlstrWords.size() ;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Written By

Scot Brennecke

Software Developer (Senior) Microsoft

United States

Scot is a Sr. Escalation Engineer for the Microsoft Developer Support VS & Languages team. He helps software developers who are Microsoft customers find bugs in their own, or Microsoft's, code.

Scot spends most of his time writing, reading, or thinking about C++ software, thereby classifying him as a geek.

A Multiple Substring Search Class: CIVStringSet

License

Comments and Discussions