Click here to Skip to main content
15,886,724 members
Articles / Desktop Programming / MFC

Spell Checker Dictionary Engine

Rate me:
Please Sign up or sign in to vote.
4.86/5 (34 votes)
20 May 2003CPOL6 min read 118.6K   4.6K   70  
A dictionary engine for use in applications requiring spell checking
// Dictionary.cpp: implementation of the Dictionary classes
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "Dictionary.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

#define NODE_HAS_NEXT   0x01
#define NODE_HAS_ALTERN 0x02

#define REMOVEDWORD_TERMINATOR  char(-1)

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  CNode                                                                   //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : CNode                                                        //
//  Purpose  : Constructor                                                  //
//  Params   : c -  Character this node is to represent                     //
//  Returns  : N/A                                                          //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

CNode::CNode(char c)
{
    m_Character = c;
    m_pAlternative = NULL;
    m_pNext = NULL;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : ~CNode                                                       //
//  Purpose  : Destructor                                                   //
//  Params   : None                                                         //
//  Returns  : N/A                                                          //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

CNode::~CNode()
{
    delete m_pAlternative;
    delete m_pNext;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : InsertWord                                                   //
//  Purpose  : Insert the given word into the tree                          //
//  Params   : szWord - Word to insert into the tree                        //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CNode::InsertWord(LPCSTR szWord)
{
    if (m_Character == szWord[0])
    {
        // This is the right node for this character!
        if (szWord[0])
        {
            // Only carry on if we have not reached the end of the word!
            if (m_pNext == NULL)
            {
                // Create a new "Next" node
                m_pNext = new CNode(szWord[1]);
            }
            m_pNext->InsertWord(&szWord[1]);
        }
    }
    else
    {
        // This is not the right node, so we need to see if we have an alternative
        if (m_pAlternative == NULL)
        {
            // Create a new alternative node
            m_pAlternative = new CNode(szWord[0]);
        }
        m_pAlternative->InsertWord(szWord);
    }
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : IsWordListed                                                 //
//  Purpose  : Determine if a word appears in the tree                      //
//  Params   : szWord - Word to check                                       //
//             bMatchCase - true if case must be matched, else false        //
//             match - Enumeration showing if word was found                //
//             pFinalNode - Pointer to the final node encountered           //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CNode::IsWordListed(LPCSTR szWord, bool bMatchCase, WordMatch & match, bool bIsFirstNode, CNode ** pFinalNode)
{
    // If we are collecting nodes encountered, add ourselves to the array
    if (pFinalNode) *pFinalNode = this;

    // Start out by assuming that we have a perfect match (we may be proved wrong later!)
    if (bIsFirstNode) match = eMatchPerfect;

    if (m_Character == szWord[0])
    {
        // Unless we have reached the end, continue matching the rest of the word
        if (m_Character != '\0')
        {
            if (m_pNext)
            {
                m_pNext->IsWordListed(&szWord[1], bMatchCase, match, false, pFinalNode);

                if ((match != eMatchPerfect) && (m_Character >= 'A') && (m_Character <= 'Z') && m_pAlternative && !bMatchCase)
                {
                    // Could be a case thing - see if we have an alternative
                    WordMatch matchTemp;
                    m_pAlternative->IsWordListed(szWord, bMatchCase, matchTemp, bIsFirstNode, pFinalNode);
                    match = CDictionary::GetBestMatch(match, matchTemp);
                }
            }
            else
            {
                // Something not right here - if the character is not '\0', we should always have a next pointer
                ASSERT(0);
                match = eMatchInternalError;
            }
        }
    }
    else if ((tolower(m_Character) == tolower(szWord[0]) && !bMatchCase))
    {
        // We have found a match, but with the wrong case. Now, we could have the instance where
        // there are two words in the dictionary, with different capitalisation (for example "Bob" and "bob")
        // First off, if we still have a perfect match, see if we can get a perfect match from an alternative
        if (match == eMatchPerfect)
        {
            if (m_pAlternative)
            {
                // Check to see what we can find
                m_pAlternative->IsWordListed(szWord, bMatchCase, match, bIsFirstNode, pFinalNode);
            }
            else
            {
                // No alternative, hence no match
                match = eMatchNone;
            }
        }

        switch (match)
        {
        case eMatchNone:
        case eMatchMixedCase:
        case eMatchCapitalisedFirst:
            // Carry on with this node, but with an imperfect match
            if (m_pNext)
            {
                // Set the match according to whether or not we are the first node
                match = (bIsFirstNode && m_Character == tolower(szWord[0])) ? eMatchCapitalisedFirst : eMatchMixedCase;

                // And carry on trying to find a match
                m_pNext->IsWordListed(&szWord[1], bMatchCase, match, false, pFinalNode);
            }
            else
            {
                // No next node, which is not right
                ASSERT(0);
                match = eMatchInternalError;
            }
            break;
        default:
            // Either we found a perfect match, or the next best thing - a word with only a capitalised first letter
            // Or of course we could have hit an internal error. However, basically, we finish here, as we have the
            // best result we are going to get.
            break;
        }
    }
    else if (m_pAlternative)
    {
        // This character doesn't match - see if any of the alternatives do
        m_pAlternative->IsWordListed(szWord, bMatchCase, match, bIsFirstNode, pFinalNode);
    }
    else
    {
        // We have no match for this character - hence, we have no match
        match = eMatchNone;
    }
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : GetPatternMatchingWords                                      //
//  Purpose  : Get words in the dictionary that match the given pattern     //
//  Params   : szWord - Pattern to match (? represents a single char)       //
//             straSuggestions - Reference to array for results             //
//             strWordSoFar - Word built so far by parent items             //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CNode::GetPatternMatchingWords(LPCSTR szWord, CStringArray & straSuggestions, CString strWordSoFar)
{
    if ((m_Character == szWord[0]) || (tolower(m_Character) == tolower(szWord[0])) || (szWord[0] == '?'))
    {
        // We have a match for this character
        if (m_Character == '\0' && szWord[0] == m_Character)
        {
            // We have reached the end of the word, add it to the suggestions
            straSuggestions.Add(strWordSoFar);
        }
        else 
        {
            if (m_pNext)
            {
                // We have not reached the end, and we have more node, so continue checking
                m_pNext->GetPatternMatchingWords(&szWord[1], straSuggestions, strWordSoFar + m_Character);
            }
            if (m_pAlternative)
            {
                // We have not reached the end, and we have more node, so continue checking
                m_pAlternative->GetPatternMatchingWords(szWord, straSuggestions, strWordSoFar);
            }
        }
    }
    else
    {
        // Does not match - see if one of the alternatives match
        if (m_pAlternative)
            m_pAlternative->GetPatternMatchingWords(szWord, straSuggestions, strWordSoFar);
    }
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : Serialise                                                    //
//  Purpose  : Serialise the node into or out of a file                     //
//  Params   : ar - Reference to archive to / from which we transfer data   //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CNode::Serialise(CArchive & ar)
{
    if (ar.IsLoading())
    {
        // Loading - get the character
        ar >> m_Character;

        // Get the bitmask indicating what children we have
        BYTE byBitmask;
        ar >> byBitmask;

        // And load the children according to the bitmask
        if (byBitmask & NODE_HAS_NEXT)
        {
            m_pNext = new CNode;
            m_pNext->Serialise(ar);
        }
        if (byBitmask & NODE_HAS_ALTERN)
        {
            m_pAlternative = new CNode;
            m_pAlternative->Serialise(ar);
        }
    }
    else
    {
        // Saving - save the character
        ar << m_Character;

        // Create and save a bitmask indicating which children we have
        BYTE byBitmask = 0;
        if (m_pNext) byBitmask |= NODE_HAS_NEXT;
        if (m_pAlternative) byBitmask |= NODE_HAS_ALTERN;

        ar << byBitmask;

        // Serialise our children
        if (m_pNext) m_pNext->Serialise(ar);
        if (m_pAlternative) m_pAlternative->Serialise(ar);
    }
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : GetWordCount                                                 //
//  Purpose  : Get the word count in the tree                               //
//  Params   : nCount - Reference to the count variable                     //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CNode::GetWordCount(int & nCount)
{
    if (m_Character == '\0') nCount++;
    if (m_pNext) m_pNext->GetWordCount(nCount);
    if (m_pAlternative) m_pAlternative->GetWordCount(nCount);
}


//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  CDictionary                                                             //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : CDictionary                                                  //
//  Purpose  : Constructor                                                  //
//  Params   : None                                                         //
//  Returns  : N/A                                                          //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

CDictionary::CDictionary()
{
    m_pRootNode = NULL;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : ~CDictionary                                                 //
//  Purpose  : Destructor                                                   //
//  Params   : None                                                         //
//  Returns  : N/A                                                          //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

CDictionary::~CDictionary()
{
    delete m_pRootNode;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : InsertWord                                                   //
//  Purpose  : Insert a word into the dictionary                            //
//  Params   : szWord - Word to be inserted                                 //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CDictionary::InsertWord(LPCSTR szWord)
{
    ASSERT(szWord);
    if (m_pRootNode == NULL)
    {
        // We have no root node, so start the tree off with the first character of the word
        m_pRootNode = new CNode(szWord[0]);
    }

    // Insert the word into the tree
    m_pRootNode->InsertWord(szWord);
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : IsWordListed                                                 //
//  Purpose  : Determines if a word is listed in the dictionary             //
//  Params   : szWord - Word to check                                       //
//             bMatchCase - true if case must be matched, else false        //
//  Returns  : Word match enumeration showing if word was found             //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

WordMatch CDictionary::IsWordListed(LPCSTR szWord)
{
    if (m_pRootNode == NULL) return eMatchNone;

    // First off, see if we have an exact case match
    WordMatch match;
    m_pRootNode->IsWordListed(szWord, true, match, true);

    if (match != eMatchPerfect)
    {
        // Nope, see if we have anything remotely close
        m_pRootNode->IsWordListed(szWord, false, match, true);
    }

    return match;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : GetSuggestions                                               //
//  Purpose  : Get suggestions for a given word                             //
//  Params   : szWord - Word to suggest for                                 //
//             straSuggestions - Reference to array for results             //
//             bCaseSuggestionsOnly - true to return only suggestions that  //
//                 are the same as the word, but with differing case        //
//  Returns  : Number of suggestions found                                  //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

int CDictionary::GetSuggestions(LPCSTR szWord, CStringArray & straSuggestions, bool bCaseSuggestionsOnly)
{
    if (m_pRootNode == NULL) return 0;

    // Check that the word is not listed - it is silly to suggest for a valid word!
    ASSERT(IsWordListed(szWord) != eMatchPerfect);
    if (IsWordListed(szWord) != eMatchPerfect)
    {
        // Get the suggestions that match the word but with difference cases
        m_pRootNode->GetPatternMatchingWords(szWord, straSuggestions);

        if (!bCaseSuggestionsOnly)
        {
            int nWordLength = strlen(szWord), nCount;
            char * szBuffer = new char[nWordLength + 2];

            // Get the substitution suggestions
            // For each character, we consider that it may be wrong and see if there is anything else matching
            for (nCount = 0; nCount < nWordLength; nCount++)
            {
                strcpy(szBuffer, szWord);
                szBuffer[nCount] = '?';

                m_pRootNode->GetPatternMatchingWords(szBuffer, straSuggestions);
            }

            // Get the deletion suggestions
            // For each character position, we assume that there is a character missing
            for (nCount = 0; nCount <= nWordLength; nCount++)
            {
                memset(szBuffer, 0, nWordLength + 2);
                strncpy(szBuffer, szWord, nCount);           
                strcat(szBuffer, "?");
                strcat(szBuffer, &szWord[nCount]);

                m_pRootNode->GetPatternMatchingWords(szBuffer, straSuggestions);
            }

            // Get the insertion suggestions
            // For each character , assume that the character is extraneous
            for (nCount = 0; nCount < nWordLength; nCount++)
            {
                memset(szBuffer, 0, nWordLength + 2);
                strncpy(szBuffer, szWord, nCount);
                strcat(szBuffer, &szWord[nCount + 1]);

                if (IsWordListed(szBuffer) != eMatchNone)
                    straSuggestions.Add(szBuffer);
            }

            // Get the transposition suggestions
            // For each pair of characters, do a swap and see if we can exactly match the string
            for (nCount = 0; nCount < nWordLength - 1; nCount++)
            {
                strcpy(szBuffer, szWord);
                char cTemp = szBuffer[nCount];
                szBuffer[nCount] = szBuffer[nCount + 1];
                szBuffer[nCount + 1] = cTemp;

                if (IsWordListed(szBuffer) != eMatchNone)
                    straSuggestions.Add(szBuffer);
            }

            delete [] szBuffer;
        }
    }

    // Tidy up the suggestions list
    SortSuggestions(straSuggestions);
    RemoveDuplicates(straSuggestions);

    return straSuggestions.GetSize();
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : PatternMatchWord                                             //
//  Purpose  : Pattern match a given word (? represents any single char)    //
//  Params   : szWord - Pattern to match                                    //
//             straSuggestions - Reference to array for results             //
//  Returns  : Number of suggestions found                                  //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

int CDictionary::PatternMatchWord(LPCSTR szWord, CStringArray & straSuggestions)
{
    straSuggestions.RemoveAll();

    if (m_pRootNode == NULL) return 0;

    m_pRootNode->GetPatternMatchingWords(szWord, straSuggestions);

    return straSuggestions.GetSize();
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : LoadDictionary                                               //
//  Purpose  : Load a previously saved dictionary file                      //
//  Params   : strFilename - Filename                                       //
//  Returns  : true if load was successful, else false                      //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

bool CDictionary::LoadDictionary(const CString & strFilename)
{
    bool bLoaded = false;

    if (m_pRootNode)
    {
        // Remove the existing dictionary tree
        delete m_pRootNode;
        m_pRootNode = NULL;
    }

    try
    {
        // Open the file
        CFile fileDictionary(strFilename, CFile::modeRead | CFile::shareDenyWrite);
        CArchive archive(&fileDictionary, CArchive::load);

        // And load the tree
        m_pRootNode = new CNode;
        m_pRootNode->Serialise(archive);

        bLoaded = true;
    }
    catch (CFileException * e)
    {
        e->Delete();
    }
    catch (CArchiveException * e)
    {
        e->Delete();
    }


    return bLoaded;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : SaveDictionary                                               //
//  Purpose  : Save the dictionary to file                                  //
//  Params   : strFilename - Filename                                       //
//  Returns  : true if saved successfully, else false                       //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

bool CDictionary::SaveDictionary(const CString & strFilename)
{
    bool bSaved = false;

    if (m_pRootNode)
    {
        try
        {
            // Open the file
            CFile fileDictionary(strFilename, CFile::modeCreate | CFile::modeWrite | CFile::shareDenyWrite);
            CArchive archive(&fileDictionary, CArchive::store);

            // And save the tree
            m_pRootNode->Serialise(archive);

            bSaved = true;
        }
        catch (CFileException * e)
        {
            e->Delete();
        }
        catch (CArchiveException * e)
        {
            e->Delete();
        }
    }

    return bSaved;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : CreateFromList                                               //
//  Purpose  : Create a dictionary from a text file (one word per line)     //
//  Params   : strFilename - Filename                                       //
//             bAppend - true to append to current dictionary, false to     //
//                       start afresh                                       //
//  Returns  : true if loaded successfully, else false                      //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

bool CDictionary::CreateFromList(const CString & strFilename, bool bAppend)
{
    bool bCreated = false;
    
    if (!bAppend)
    {
        // We are not appending the list, so delete any content we currently have
        delete m_pRootNode;
    }

    try
    {
        // Open the file
        CStdioFile fileList(strFilename, CFile::modeRead | CFile::shareDenyWrite);
        CString strWord;

        // And treat each line as a word
        while(fileList.ReadString(strWord))
        {
            InsertWord(strWord);
        }

        bCreated = true;
    }
    catch (CFileException * e)
    {
        e->Delete();
    }

    return bCreated;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : SortSuggestions                                              //
//  Purpose  : Sorts the words in the given array                           //
//  Params   : straSuggestions - Reference to CStringArray to sort          //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CDictionary::SortSuggestions(CStringArray & straSuggestions)
{
	bool bChanged=true;
	while(bChanged)
	{
		bChanged=false;
		for(int nItem = 1; nItem < straSuggestions.GetSize(); nItem++)
		{
			if(straSuggestions.GetAt(nItem - 1) > straSuggestions.GetAt(nItem))
			{
                CString strTemp = straSuggestions.GetAt(nItem - 1);
				straSuggestions.SetAt(nItem - 1, straSuggestions.GetAt(nItem));
				straSuggestions.SetAt(nItem, strTemp);
				bChanged=true;
			}
		}
	}
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : RemoveDuplicates                                             //
//  Purpose  : Removed duplicates from the given array                      //
//  Params   : straSuggestions - Reference to CStringArray to process       //
//  Returns  : None                                                         //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

void CDictionary::RemoveDuplicates(CStringArray & straSuggestions)
{
    for (int nCount = straSuggestions.GetUpperBound(); nCount > 0; nCount--)
    {
        if (straSuggestions.GetAt(nCount) == straSuggestions.GetAt(nCount - 1))
            straSuggestions.RemoveAt(nCount);
    }
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : GetWordCount                                                 //
//  Purpose  : Count how many words are contained in the dictionary         //
//  Params   : None                                                         //
//  Returns  : Word in dictionary                                           //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

int CDictionary::GetWordCount()
{
    int nWordCount = 0;
    if (m_pRootNode)
        m_pRootNode->GetWordCount(nWordCount);

    return nWordCount;
}

//////////////////////////////////////////////////////////////////////////////
//                                                                          //
//  Function : RemoveWord                                                   //
//  Purpose  : Marks a word as deleted from the dictionary                  //
//  Params   : szWord - Word to remove                                      //
//  Returns  : true if the word is removed, false otherwise                 //
//  Throws   : None                                                         //
//                                                                          //
//////////////////////////////////////////////////////////////////////////////

bool CDictionary::RemoveWord(LPCSTR szWord)
{
    bool bRemoved = false;

    if (m_pRootNode)
    {
        // First off, go through the tree to find the word, getting the terminating node
        // Note that we only remove the word that is an exact case match for the given word
        WordMatch match;
        CNode * pFinalNode = NULL;

        m_pRootNode->IsWordListed(szWord, true, match, true, &pFinalNode);
        if (match == eMatchPerfect && pFinalNode)
        {
            // We found the final node - change it's character to something other than '\0'
            pFinalNode->m_Character = REMOVEDWORD_TERMINATOR;
            bRemoved = true;
        }
    }

    return bRemoved;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer
United Kingdom United Kingdom
I started computer programming on the Spectrum (writing nothing more complicated than "Hello World" and a few programs that tunelessly Beeped ad infinitum) but then progressed to slightly more serious programming on the Amiga.

After A-Levels in Maths, Physics and Chemistry, I went to the University of East Anglia, Norwich, and studied beer, women and Computing Science.
Some years after graduating, I still have an appreciation of Computing Science, but as I am now married, my other studies are frowned upon.

Since graduating, I have worked on many diverse projects in areas including call centres, logistics, architecture and engineering, and heritage.

Comments and Discussions