Click here to Skip to main content
15,881,803 members
Articles / Programming Languages / C++

A Spell Checking Engine

Rate me:
Please Sign up or sign in to vote.
4.88/5 (16 votes)
5 Feb 2001 265.3K   7K   108  
A free spell checking engine for use in your C++ applications. Includes the current US English dictionary
/*

  Copyright:		2000
  Author:			Matthew T Gullett
  Email:			gullettm@yahoo.com
  Name:				CFPSSpellCheckEngine
  Part of:			Spell Checking Engine  1 of 1
  Requires:			USMain.dic, [USUser.dic], [USCommon.dic]

  DESCRIPTION
  ----------------------------------------------------------
  This class is designed to implement a phonetic spell
  checking engine with support for US English (and probably
  UK english)  It implements a simple interface for accessing
  spell checking functions.  It is designed to be very fast
  with support for a user dictionary and a common misspelling
  dictionary to improve performance and user perception.  


  INFO:
  ----------------------------------------------------------
  This class is provided -as is-.  No warranty as to the
  function or performance of this class is provided either 
  written or implied.  
  
  You may freely use this code and modify it as necessary,
  as long as this header is unmodified and credit is given
  to the author in the application(s) in which it is
  incorporated.

*/

#include "stdafx.h"
#include "FPSSpellCheckEngine.h"

#include "io.h"
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

CFPSSpellCheckEngine::CFPSSpellCheckEngine()
{
	// initialize internal variables
	m_iMainHandle = 0;
	m_pMainBuffer = NULL;
	m_pWordCache = NULL;
	m_lMainLen = 0;
	m_lCacheStartPos = 0;
}

CFPSSpellCheckEngine::~CFPSSpellCheckEngine()
{
	// cleanup memory and file handles
	if (m_iMainHandle)
		_close(m_iMainHandle);
	if (m_pMainBuffer)
		free (m_pMainBuffer);
	if (m_pWordCache)
		free (m_pWordCache);
}

// use this function to specify the location of the dictionaries
void CFPSSpellCheckEngine::SetDictionaries(LPCSTR lpszMain, LPCSTR lpszUser, LPCSTR lpszCommonUse)
{
	m_strMainDic = lpszMain;
	m_strUserDic = lpszUser;
	m_strCmnUseDic = lpszCommonUse;
}

// initialize the spell checking engine
// this function caches the user and commonly mispelled
// dictionaries, and opens the main dictionary and caches
// information on the number of words in the dictionary
// and the location of the letter breaks within the file
BOOL CFPSSpellCheckEngine::InitEngine()
{
	if (m_iMainHandle || m_pMainBuffer)
		return TRUE;

	// cache secondary dictionaries
	CacheUserDic();
	CacheCommonUseWords();

	// open main data dictionary
	m_iMainHandle = _open(m_strMainDic, _O_BINARY);
	if (!m_iMainHandle)
		return FALSE;
	m_pMainBuffer = (char*)malloc(FPSSPELL_DEFAULT_CACHE_SIZE);

	// get file length
	lseek(m_iMainHandle, 0, SEEK_END);
	m_lMainLen = tell(m_iMainHandle);
	lseek(m_iMainHandle, 0, SEEK_SET);

	// set cache position information
	m_lCacheStartPos = 0;
	m_lCacheLen = _read(m_iMainHandle, m_pMainBuffer, FPSSPELL_DEFAULT_CACHE_SIZE);

	// ************************************************
	// create word cache
	if (m_pWordCache)
		free (m_pWordCache);
	m_pWordCache = (char*)malloc(GetRecordCount() * FPSSPELL_MAIN_WORD_MAX_LEN);

	// cache word locations within main datadictionay
	CacheWordLocations();
	
	return TRUE;
}

// this is the main function used to search for a word in
// the dictionary and returning a list of possible matches
BOOL CFPSSpellCheckEngine::FindWord(LPCSTR lpszWord, CStringList &PossibleMatches)
{
	char szWordToFind[200];

	// convert lpszWord to UPPERCASE and place in szWordToFind
	strcpy(szWordToFind, lpszWord);
	TrimRight(szWordToFind);
	CharUpper(szWordToFind);

	// clear possible matches
	PossibleMatches.RemoveAll();

	// *******************************************************
	// 1. look in recent finds first
	if (m_RecentFinds.Lookup(szWordToFind, m_strTempWord))
		return TRUE;

	// *******************************************************
	// 2. Look in User Data dictionary
	if (m_UserDic.Lookup(szWordToFind, m_strTempWord))
		return TRUE;

	// *******************************************************
	// 3. look in common misspells
	if (m_CmnUseDic.Lookup(szWordToFind, m_strTempWord))
	{
		PossibleMatches.AddTail(m_strTempWord);
		return FALSE;
	}

	// *******************************************************
	// 4. look in main dictionary
	if (FindWordMain(szWordToFind))
	{
		m_RecentFinds.SetAt(szWordToFind, lpszWord);
		return TRUE;
	}

	// find the possible matches for this word
	FindMatchingWords(szWordToFind, PossibleMatches);

	return FALSE;
}

// cache the USERDD file
void CFPSSpellCheckEngine::CacheUserDic()
{
	CStdioFile fp;
	CString strOldWord;
	CString strNewWord;

	if (fp.Open(m_strUserDic, CFile::modeRead | CFile::shareExclusive))
	{
		while (fp.ReadString(strOldWord))
		{
			strOldWord.TrimLeft();
			strOldWord.TrimRight();
			strNewWord = strOldWord;
			strNewWord.MakeLower();

			m_UserDic.SetAt(strNewWord, strOldWord);
		}
		fp.Close();
	}
}

// cache the common mispelled dictionary
void CFPSSpellCheckEngine::CacheCommonUseWords()
{
	return;
	CStdioFile fp;
	CString strLine;
	CString strIncorrect;
	CString strCorrect;
	int iPos = 0;

	if (fp.Open(m_strCmnUseDic, CFile::modeRead | CFile::shareExclusive))
	{
		while (fp.ReadString(strLine))
		{
			strLine.TrimLeft();
			strLine.TrimRight();

			iPos = strLine.Find('=');
			if (iPos > 0)
			{
				strIncorrect = strLine.Left(iPos);
				strCorrect = strLine.Mid(iPos+1);

				strIncorrect.TrimLeft();
				strIncorrect.TrimRight();
				strIncorrect.MakeUpper();

				strCorrect.TrimLeft();
				strCorrect.TrimRight();

				m_CmnUseDic.SetAt(strIncorrect, strCorrect);
			}
		}

		fp.Close();
	}
}

// determine if a character is a volel (A,E,I,O,Y)
BOOL CFPSSpellCheckEngine::IsVowel(char szChar)
{
	switch (szChar)
	{
	case 'A':
	case 'E':
	case 'I':
	case 'O':
	case 'U':
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
		{
			return TRUE;
			break;
		}
	}
	
	return FALSE;
}

// determines the file offset where a record exists in the
// main dictionary
long CFPSSpellCheckEngine::GetRecordPos(long lRecord)
{
	return ((lRecord - 1) * FPSSPELL_MAIN_RECORD_LEN);
}

// determine the number of records in the main dictionary
long CFPSSpellCheckEngine::GetRecordCount()
{
	return (m_lMainLen / FPSSPELL_MAIN_RECORD_LEN);
}

// adds a word to the recent finds map so the next time
// a findword is done, it will find it here and ignore
// it
void CFPSSpellCheckEngine::IgnoreWord(LPCSTR lpszWord)
{
	char szUpperWord[200];

	// make word upper
	strcpy(szUpperWord, lpszWord);
	CharUpper(szUpperWord);

	// add word to recent finds
	m_RecentFinds.SetAt(szUpperWord, lpszWord);
}

// adds a word to the USER DD
void CFPSSpellCheckEngine::AddWord(LPCSTR lpszWord)
{
	char szUpperWord[200];

	strcpy(szUpperWord, lpszWord);
	CharUpper(szUpperWord);

	// add word to memory map of user data dictionary
	m_UserDic.SetAt(szUpperWord, lpszWord);

	// physically add word to user data dictionary file
	CStdioFile fp;
	if (fp.Open(m_strUserDic, CFile::modeRead | CFile::modeWrite | CFile::shareExclusive))
	{
		fp.Seek(0, CFile::end);
		fp.WriteString(lpszWord);
		fp.WriteString("\n");
		fp.Close();
	}
	else
	{
		if (fp.Open(m_strUserDic, CFile::modeCreate | CFile::modeWrite | CFile::shareExclusive))
		{
			fp.Seek(0, CFile::end);
			fp.WriteString(lpszWord);
			fp.WriteString("\n");
			fp.Close();
		}
	}
}

// finds a word in the main dictionary
BOOL CFPSSpellCheckEngine::FindWordMain(LPCSTR lpszWordToFind)
{
	BOOL bFound = FALSE;
	long lRecord;
	char szTest[50];

	strcpy(szTest, lpszWordToFind);
	CharUpper(szTest);

	if (lpszWordToFind[0] != '\0')
	{
		if (m_LocationCache[lpszWordToFind[0]].bPresent)
		{
			long lStartRecord = m_LocationCache[lpszWordToFind[0]].lFirstRecord;
			long lLastRecord = m_LocationCache[lpszWordToFind[0]].lLastRecord;

			lRecord = lStartRecord;
			while (lRecord <= lLastRecord && !bFound)
			{
				GetRecord(lRecord, szTest);

				if (stricmp(szTest, lpszWordToFind) == 0)
					bFound = TRUE;
				lRecord++;
			}
		}
	}

	return bFound;
}

// this function is called by FindWord and is used to search
// the main dictionary for possible matches.  This function
// uses several methods to determine if words could match.
void CFPSSpellCheckEngine::FindMatchingWords(LPCSTR lpszWord, CStringList &Matches)
{
	long lRecord;
	char szMetaphone[20];
	char szWord[50];
	char szParsedWord[50];
	int iWordSylableCount = GetSylableCount((char*)lpszWord);
	int iDicSylableCount = 0;
	int iWordLen = lstrlen(lpszWord);
	int iRecordFirstVowelPos;
	int iRecordWordLen = 0;
	int iFirstVowelPos;
	char cFirstVowel = GetFirstVowel((char*)lpszWord, iFirstVowelPos);
	char cRecordFirstVowel = 0;
	long lRecordCount = GetRecordCount();
	int iMetaphoneLen = 0;
	char szPhoneticWord[200];
	char szPhoneticRecord[200];
	char szOutWord[200];
	int iVowelCount = GetVowelCount((char*)lpszWord);
	char szTemp[50];

	memset(szTemp, 0, 50);

	m_lWordsSearched = 0;
	m_lComparisonsMade = 0;

	// do not perform lookup if word is blank
	if (iWordLen == 0)
		return;

	// if no vowels are in the non-found word, use this method to look for possible matches
	if (iVowelCount == 0)
	{
		int iMaxVowelCount = 1;

		if (lpszWord[iWordLen-1] == 'R' || lpszWord[iWordLen-1] == 'S' || lpszWord[iWordLen-1] == 'D')
			iMaxVowelCount = 2;

		if (m_LocationCache[lpszWord[0]].bPresent)
		{
			long lStartRecord = m_LocationCache[lpszWord[0]].lFirstRecord;
			long lLastRecord = m_LocationCache[lpszWord[0]].lLastRecord;

			RemoveVowelsAndDups((char*)lpszWord, szTemp);

			lRecord = lStartRecord;
			while (lRecord <= lLastRecord)
			{
				m_lWordsSearched++;

				GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
				iRecordWordLen = lstrlen(szWord);

				if (IsWordMatch((char*)lpszWord, iWordLen, szWord, iRecordWordLen))
				{
					Matches.AddTail(szWord);
				}
				//else if (iDicSylableCount <= iMaxVowelCount)
				else /*if (iDicSylableCount <= iMaxVowelCount)*/
				{
					RemoveVowelsAndDups(szWord, szOutWord);

					if (stricmp(szOutWord, szTemp) == 0)
					{
						if (abs(iWordLen - iRecordWordLen) < 3)
							Matches.AddTail(szWord);
					}
				}

				lRecord++;
			}
		}
	}
	else if (iWordLen == 2)
	{
		char szTempRecordWord[200];
		char szTempInWord[200];
		long lStartRecord = m_LocationCache[lpszWord[0]].lFirstRecord;
		long lLastRecord = m_LocationCache[lpszWord[0]].lLastRecord;

		// make upper case of input word
		strcpy(szTempInWord, lpszWord);
		CharUpper(szTempInWord);

		lRecord = lStartRecord;
		while (lRecord <= lLastRecord)
		{
			m_lWordsSearched++;

			GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
			//if (iDicSylableCount == 1 && lstrlen(szWord) <= 3)
			{
				strcpy(szTempRecordWord, szWord);
				CharUpper(szTempRecordWord);
				
				if (szTempRecordWord[1] == szTempInWord[1] || szTempRecordWord[2] == szTempInWord[1])
				{
					Matches.AddTail(szWord);
				}
				else if (IsWordMatch((char*)lpszWord, iWordLen, szWord, iRecordWordLen))
				{
					Matches.AddTail(szWord);
				}
			}
			lRecord++;
		}
	}
	/*
	else if (iWordLen == 3)
	{
		char szTempRecordWord[200];
		char szTempInWord[200];
		char cWordChar1;
		char cWordChar2;
		char cWordChar3;
		char cRecordChar1;
		char cRecordChar2;
		char cRecordChar3;

		// make upper case of input word
		strcpy(szTempInWord, lpszWord);
		CharUpper(szTempInWord);
		cWordChar1 = szTempInWord[0];
		cWordChar2 = szTempInWord[1];
		cWordChar3 = szTempInWord[2];

		// make metaphone out of input word
		MetaphoneEx((char*)lpszWord, szMetaphone);
		iMetaphoneLen = lstrlen(szMetaphone);
		BOOL bInclude = FALSE;

		// phonetic word
		PhoneticWord((char*)lpszWord, szPhoneticWord);

		// find record
		for (lRecord = 1; lRecord <= lRecordCount; lRecord++)
		{
			m_lWordsSearched++;

			GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
			szParsedWord[5] = 0;
			iRecordWordLen = lstrlen(szWord);
			cRecordFirstVowel = GetFirstVowel(szWord, iRecordFirstVowelPos);

			bInclude = FALSE;

			if (iRecordWordLen <= 5)
			{
				if ((IsSylableCountsOK(iWordSylableCount, iDicSylableCount)) && (IsWordLenOK(iWordLen, iRecordWordLen)) && (cFirstVowel == cRecordFirstVowel))
				{
					if (stricmp(szParsedWord, szMetaphone) == 0)
					{
						Matches.AddTail(szWord);
						bInclude = TRUE;
					}
					else
					{
						if (strcmp(szPhoneticRecord, szPhoneticWord) == 0)
						{
							if (szParsedWord[0] == szMetaphone[0] && szParsedWord[1] == szMetaphone[1] && szParsedWord[2] == szMetaphone[2])
							{
								Matches.AddTail(szWord);
								bInclude = TRUE;
							}
						}
					}
				}
			}

			if (!bInclude && iRecordWordLen == 3)
			{
				strcpy(szTempRecordWord, szWord);
				CharUpper(szTempRecordWord);
				cRecordChar1 = szTempRecordWord[0];
				cRecordChar2 = szTempRecordWord[1];
				cRecordChar3 = szTempRecordWord[2];
				
				if ((cRecordChar1 == cWordChar1 || cRecordChar1 == cWordChar2 || cRecordChar1 == cWordChar3) &&
					(cRecordChar2 == cWordChar1 || cRecordChar2 == cWordChar2 || cRecordChar2 == cWordChar3) &&
					(cRecordChar3 == cWordChar1 || cRecordChar3 == cWordChar2 || cRecordChar3 == cWordChar3))
				{
					Matches.AddTail(szWord);
				}
			}
		}
	}
	*/
	else
	{
		// make metaphone out of input word
		MetaphoneEx((char*)lpszWord, szMetaphone);
		iMetaphoneLen = lstrlen(szMetaphone);

		// phonetic word
		PhoneticWord((char*)lpszWord, szPhoneticWord);

		// find record
		for (lRecord = 1; lRecord <= lRecordCount; lRecord++)
		{
			m_lWordsSearched++;

			GetRecord(lRecord, szWord, szParsedWord, iDicSylableCount, szPhoneticRecord);
			szParsedWord[6] = 0;
			iRecordWordLen = lstrlen(szWord);
			cRecordFirstVowel = GetFirstVowel(szWord, iRecordFirstVowelPos);

			if (abs(iWordLen - iRecordWordLen) < 4)
			{
				//if ((IsSylableCountsOK(iWordSylableCount, iDicSylableCount)) && (IsWordLenOK(iWordLen, iRecordWordLen)) && (cFirstVowel == cRecordFirstVowel || iFirstVowelPos > 2))
				//if ((IsWordLenOK(iWordLen, iRecordWordLen)) && (cFirstVowel == cRecordFirstVowel || iFirstVowelPos > 2))
				{
					if (stricmp(szParsedWord, szMetaphone) == 0)
					{
						if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
							Matches.AddTail(szWord);
					}
					else if (IsMetaphoneMatch(szParsedWord, szMetaphone))
					{
						if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
							Matches.AddTail(szWord);
					}
					else if (IsWordMatch((char*)lpszWord, iWordLen, szWord, iRecordWordLen))
					{
						if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
							Matches.AddTail(szWord);
					}
					else
					{
						/*
						if (strcmp(szPhoneticRecord, szPhoneticWord) == 0)
						{
							if (szParsedWord[0] == szMetaphone[0] && szParsedWord[1] == szMetaphone[1] && szParsedWord[2] == szMetaphone[2])
							{
								if (CheckEndings(lpszWord, iWordLen, szWord, iRecordWordLen))
									Matches.AddTail(szWord);
							}
						}
						*/
					}
				}
			}
		}
	}
}

// caches the file locations for the word breaks in the
// main dictionary
void CFPSSpellCheckEngine::CacheWordLocations()
{
	long lRecord = 0;
	char szWord[50];
	char szMetaphone[50];
	char szPhonetic[50];
	char cLastChar = '\0';
	char cThisChar = '\0';
	int iDicSylableCount;
	long lWordCachePos = 0;

	// set default values for cache locations
	for (int x = 0; x < 255; x++)
	{
		m_LocationCache[x].bPresent = FALSE;
		m_LocationCache[x].lFirstRecord = 0;
		m_LocationCache[x].lLastRecord = 0;
		m_LocationCache[x].lLen = 0;
		m_LocationCache[x].lStartPos = 0;
	}

	// look through all records and find letter breaks
	for (lRecord = 1; lRecord <= GetRecordCount(); lRecord++)
	{
		GetRecord(lRecord, szWord, szMetaphone, iDicSylableCount, szPhonetic);
		memcpy(&m_pWordCache[lWordCachePos], szWord, 18);
		lWordCachePos += 18;

		switch(szWord[0])
		{
		case 'a': cThisChar = 'A'; break;
		case 'b': cThisChar = 'B'; break;
		case 'c': cThisChar = 'C'; break;
		case 'd': cThisChar = 'D'; break;
		case 'e': cThisChar = 'E'; break;
		case 'f': cThisChar = 'F'; break;
		case 'g': cThisChar = 'G'; break;
		case 'h': cThisChar = 'H'; break;
		case 'i': cThisChar = 'I'; break;
		case 'j': cThisChar = 'J'; break;
		case 'k': cThisChar = 'K'; break;
		case 'l': cThisChar = 'L'; break;
		case 'm': cThisChar = 'M'; break;
		case 'n': cThisChar = 'N'; break;
		case 'o': cThisChar = 'O'; break;
		case 'p': cThisChar = 'P'; break;
		case 'q': cThisChar = 'Q'; break;
		case 'r': cThisChar = 'R'; break;
		case 's': cThisChar = 'S'; break;
		case 't': cThisChar = 'T'; break;
		case 'u': cThisChar = 'U'; break;
		case 'v': cThisChar = 'V'; break;
		case 'w': cThisChar = 'W'; break;
		case 'x': cThisChar = 'X'; break;
		case 'y': cThisChar = 'Y'; break;
		case 'z': cThisChar = 'Z'; break;
		}

		if (cLastChar != cThisChar)
		{
			m_LocationCache[cThisChar].bPresent = TRUE;
			m_LocationCache[cThisChar].lFirstRecord = lRecord;
			m_LocationCache[cThisChar].lStartPos = GetRecordPos(lRecord);

			if (cLastChar != '\0')
			{
				m_LocationCache[cLastChar].lLastRecord = lRecord-1;
				m_LocationCache[cLastChar].lLen = GetRecordPos(lRecord) - m_LocationCache[cLastChar].lStartPos;
			}
		}

		cLastChar = cThisChar;
	}
}

// retrieves a dictionary record from the main dictionary
BOOL CFPSSpellCheckEngine::GetRecord(long lRecord, char *szWord, char *szMetaphon, int & iSylableCount, char* szPhonetic)
{
	char szTemp[20];
	char szRecord[50];
	long lFilePos = GetRecordPos(lRecord);

	// see if the desired record falls within the
	if (lFilePos >= m_lCacheStartPos && lFilePos < m_lCacheStartPos + m_lCacheLen)
	{
		long lCachePos = lFilePos - m_lCacheStartPos;
		memcpy(szRecord, &m_pMainBuffer[lCachePos], FPSSPELL_MAIN_RECORD_LEN_NOCRLF);
	}
	else
	{
		// if the record is not in the cache, then go cache the
		// requirested record and the cache size thereafter
		_lseek(m_iMainHandle, lFilePos, SEEK_SET);
		m_lCacheStartPos = tell(m_iMainHandle);
		m_lCacheLen = _read(m_iMainHandle, m_pMainBuffer, FPSSPELL_DEFAULT_CACHE_SIZE);

		long lCachePos = lFilePos - m_lCacheStartPos;
		memcpy(szRecord, &m_pMainBuffer[lCachePos], FPSSPELL_MAIN_RECORD_LEN_NOCRLF);
	}

	szWord[0] = 0;
	szMetaphon[0] = 0;

	memcpy(szWord, &szRecord[0], 18);
	memcpy(szMetaphon, &szRecord[18], 6);
	memcpy(szTemp, &szRecord[24], 2);
	iSylableCount  = atoi(szTemp);
	memcpy(szPhonetic, &szRecord[26], 8);

	szWord[18] = 0;
	szMetaphon[6] = 0;
	szPhonetic[8] = 0;

	return TRUE;
}

// this function uses a modified version of the Metaphone algorithm to generate
// a quasi phonetic representation of a word.
void CFPSSpellCheckEngine::MetaphoneEx(char *szInput, char *szOutput, int iMaxLen)
{
	char szLastChar = '\0';
	char szThisChar = '\0';
	char szOutChar = '\0';
	char szTemp[200];
	int iInputPos = 0;
	int iOutputPos = 0;
	int iLen = 0;

	// copy input into temp
	memset(szTemp, 0, 200);
	strcpy(szTemp, szInput);
	TrimRight(szTemp);
	CharUpper(szTemp);
	iLen = lstrlen(szTemp);

	szOutput[0] = szTemp[0];
	iOutputPos++;

	while (iInputPos < iLen)
	{
		szOutChar = '\0';
		szThisChar = szTemp[iInputPos];

		if (szThisChar != szLastChar)
		{
			switch (szThisChar)
			{
			case 'A':
				{
					szOutChar = '\0';
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'E')
					{
						szOutChar = 'E';
						iInputPos++;
					}
					break;
				}
			case 'B': 
				{
					szOutChar = 'B';
					if (szTemp[iInputPos+1] == 'R')
						szOutChar = '\0';
					break;
				}
			case 'C':
				{
					szOutChar = 'K';
					if (szTemp[iInputPos+1] == 'H')
						szOutChar = 'X';
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
						szOutChar = 'X';
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] != 'A')
						szOutChar = 'S';
					//if (szTemp[iInputPos+1] == 'E')
					//	szOutChar = 'S';
					if (szTemp[iInputPos+1] == 'Y')
						szOutChar = 'S';
					break;
				}
			case 'D':
				{
					szOutChar = 'D';
					if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'E')
						szOutChar = 'J';
					if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'Y')
						szOutChar = 'J';
					if (szTemp[iInputPos+1] == 'G' && szTemp[iInputPos+1] == 'I')
						szOutChar = 'J';
					if (iInputPos == iLen-1)
						szOutChar = '\0';
					break;
				}
			case 'E':
				{
					szOutChar = 0;
					//if (szTemp[iInputPos+1] == 'A')
					//{
					//	szOutChar = 'A';
					//	iInputPos++;
					//}
					break;
				}
			case 'F':
				{
					szOutChar = 'F';
					break;
				}
			case 'G':
				{
					szOutChar = 'J';
					if (szTemp[iInputPos+1] == 'H' && szTemp[iInputPos+2] == 'T')
					{
						szOutChar = 'T';
						iInputPos += 2;
					}
					else
					{
						if (szTemp[iInputPos+1] == 'H')
						{
							if (szTemp[iInputPos+2] != '\0' && !IsVowel(szTemp[iInputPos+2]))
								szOutChar = '\0';
						} 
						if (szTemp[iInputPos+1] == 'I' || szTemp[iInputPos+1] == 'E' || szTemp[iInputPos+1] == 'Y')
							szOutChar = 'J';
						if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
						{
							szOutChar = 'N';
							iInputPos++;
						}
					}
					break;
				}
			case 'H':
				{
					szOutChar = 'H';
					if (iInputPos > 0)
					{
						if (IsVowel(szTemp[iInputPos-1]) && !IsVowel(szTemp[iInputPos+1]))
							szOutChar = '\0';
					}
					break;
				}
			case 'J':
				{
					szOutChar = 'J';
					break;
				}
			case 'K':
				{
					szOutChar = 'K';
					if (iInputPos > 0)
					{
						if (szTemp[iInputPos-1] == 'C')
							szOutChar = '\0';
					}
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
					{
						szOutChar = 'N';
						iInputPos++;
					}
					if (szOutChar == 'K' && szTemp[iInputPos+1] == 'R')
						iInputPos++;
					if (szTemp[iInputPos+1] == 'C')
						iInputPos++;
					break;
				}
			case 'L':
				{
					szOutChar = 'L';
					if (szTemp[iInputPos+1] == 'Y' && szTemp[iInputPos+2] == 0)
					{
						szOutChar = 0;
						iInputPos++;
					}
					break;
				}
			case 'M':
				{
					szOutChar = 'M';
					break;
				}
			case 'N':
				{
					szOutChar = 'N';
					if (szTemp[iInputPos+1] == 'D')
					{
						szOutChar = 'D';
						iInputPos++;
					}
					break;
				}
			case 'P':
				{
					szOutChar = 'P';
					if (szTemp[iInputPos+1] == 'H')
					{
						szOutChar = 'F';
						iInputPos++;
					}
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'N')
					{
						szOutChar = 'N';
						iInputPos++;
					}
					break;
				}
			case 'Q':
				{
					szOutChar = 'K';
					break;
				}
			case 'R':
				{
					szOutChar = 'R';
					if (iInputPos == iLen-1)
						szOutChar = '\0';
					if (szTemp[iInputPos+1] == 'K')
					{
						szOutChar = 'K';
						iInputPos++;
					}
					break;
				}
			case 'S':
				{
					szOutChar = 'S';
					if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
					{
						szOutput[iOutputPos] = szOutChar;
						iOutputPos++;

						szOutChar = 'K';
						iInputPos+=2;
					}
					else
					{
						if (szTemp[iInputPos+1] == 'H')
							szOutChar = 'X';
						if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
							szOutChar = 'X';
						if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'O')
							szOutChar = 'X';
						if (iInputPos == iLen-1 || (iInputPos == iLen-2 && szTemp[iInputPos+1] == 'S'))
							szOutChar = '\0';
					}
					break;
				}
			case 'T':
				{
					szOutChar = 'T';
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'A')
						szOutChar = 'X';
					if (szTemp[iInputPos+1] == 'I' && szTemp[iInputPos+2] == 'O')
						szOutChar = 'X';
					if (szTemp[iInputPos+1] == 'H')
						szOutChar = '0';
					if (szTemp[iInputPos+1] == 'C' && szTemp[iInputPos+2] == 'H')
						szOutChar = '\0';
					if (iInputPos > 0 && szTemp[iInputPos-1] == 'F')
						szOutChar = '\0';
					break;
				}
			case 'V':
				{
					szOutChar = 'F';
					break;
				}
			case 'W':
				{
					szOutChar = '\0';
					if (IsVowel(szTemp[iInputPos+1]))
						szOutChar = 'X';
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'R')
					{
						szOutChar = 'R';
						iInputPos++;
					}
					if (iInputPos == 0 && szTemp[iInputPos+1] == 'H')
					{
						szOutChar = 'W';
						iInputPos++;
					}
					break;
				}
			case 'X':
				{
					szOutput[iOutputPos] = 'K';
					iOutputPos++;
					szOutChar = 'S';

					if (iInputPos == 0)
						szOutChar = 'S';
					break;
				}
			case 'Y':
				{
					szOutChar = '\0';
					if (IsVowel(szTemp[iInputPos+1]))
						szOutChar = 'Y';
					if (iInputPos == iLen-1)
						szOutChar = '\0';
					break;
				}
			case 'Z':
				{
					szOutChar = 'S';
					break;
				}
			}
		}

		if (szOutChar != '\0')
		{
			szOutput[iOutputPos] = szOutChar;
			iOutputPos++;
		}
		
		szLastChar = szThisChar;
		iInputPos++;
	}

	if (IsVowel(szOutput[0]))
		szOutput[0] = 'A';

	// truncate output if necessary
	TrimRight(szOutput);
	szOutput[iOutputPos] = '\0';
	szOutput[iMaxLen] = '\0';
}

void CFPSSpellCheckEngine::TrimRight(char *szText)
{
	int iLen = lstrlen(szText);
	int iPos = 0;
	BOOL bFound = FALSE;

	while (iPos < iLen && !bFound)
	{
		if (szText[iPos] == ' ')
			szText[iPos] = '\0';
		iPos++;
	}
}

// this function counts the sylables in a word.   It is not a truly
// accurate count, but works well for the spell checking engine
// purposes.
int CFPSSpellCheckEngine::GetSylableCount(char *szWord)
{
	int iCount = 0;
	int iLen = lstrlen(szWord);

	for (int iPos = 0; iPos < iLen; iPos++)
	{
		switch (szWord[iPos])
		{
		case 'A':
		case 'a':
		case 'I':
		case 'i':
		case 'O':
		case 'o':
		case 'U':
		case 'u':
			{
				if (iPos != 0)
				{
					if (!IsVowel(szWord[iPos-1]))
						iCount++;
					break;
				}
			}
		case 'E':
		case 'e':
			{
				if (iPos != 0 && iPos != iLen-1)
				{
					if (!IsVowel(szWord[iPos-1]))
						iCount++;
					break;
				}
			}
		}
	}

	if (iCount == 0)
		iCount = 1;

	return iCount;
}

// used by the FindMatchingWords function to determine if 2 words
// could possibly match.  
BOOL CFPSSpellCheckEngine::IsWordLenOK(int iWordLen, int iRecordWordLen)
{
	if (iWordLen - iRecordWordLen > FPSSPELL_MAX_WORD_LEN_DIFF)
		return FALSE;
	if (iRecordWordLen - iWordLen > FPSSPELL_MAX_WORD_LEN_DIFF)
		return FALSE;

	if (iWordLen == 4 && iRecordWordLen < FPSSPELL_MAX_WORD_LEN_DIFF)
		return FALSE;

	return TRUE;
}

// extracts the first volel from a word and then translates it from
// 5 possible vowels (A,E,I,O,U) to 3 possible vowels (A,I,O)
char CFPSSpellCheckEngine::GetFirstVowel(char* szWord, int& iVowelPos)
{
	int iPos = 1;
	BOOL bFound = FALSE;
	int iLen = lstrlen(szWord);
	char cReturn = 0;

	iVowelPos = 0;

	if (IsVowel(szWord[0]))
	{
		cReturn = szWord[0];
		if ((cReturn == 'e' || cReturn == 'E') && (szWord[1] == 'A' || szWord[1] == 'a')&& (szWord[2] == 'U' || szWord[2] == 'u'))
		{
			cReturn = 'U';
		}
		else if ((cReturn == 'e' || cReturn == 'E') && (szWord[1] == 'A' || szWord[1] == 'a'))
		{
			cReturn = 'A';
		}
		else if ((cReturn == 'a' || cReturn == 'A') && (szWord[1] == 'U' || szWord[1] == 'u'))
		{
			cReturn = 'O';
		}
	}
	else
	{
		while (iPos < iLen && !bFound)
		{
			if (!IsVowel(szWord[iPos-1]) && IsVowel(szWord[iPos]))
			{
				iVowelPos = iPos;
				cReturn = szWord[iPos];

				if ((cReturn == 'e' || cReturn == 'E') && (szWord[iPos+1] == 'A' || szWord[iPos+1] == 'a') && (szWord[iPos+2] == 'U' || szWord[iPos+2] == 'u'))
				{
					cReturn = 'U';
				}
				else if ((cReturn == 'e' || cReturn == 'E') && (szWord[iPos+1] == 'A' || szWord[iPos+1] == 'a'))
				{
					cReturn = 'A';
				}
				else if ((cReturn == 'a' || cReturn == 'A') && (szWord[iPos+1] == 'U' || szWord[iPos+1] == 'u'))
				{
					cReturn = 'O';
				}
				bFound = TRUE;
			}

			iPos++;
		}
	}

	switch (cReturn)
	{
	case 'a': cReturn = 'A';break;
	case 'e': cReturn = 'E';break;
	case 'i': cReturn = 'I';break;
	case 'o': cReturn = 'O';break;
	case 'u': cReturn = 'U';break;
	case 'y': cReturn = 'Y';break;
	}

	switch (cReturn)
	{
	case 'A': cReturn = 'A';break;
	case 'E': cReturn = 'I';break;
	case 'I': cReturn = 'I';break;
	case 'O': cReturn = 'O';break;
	case 'U': cReturn = 'O';break;
	case 'Y': cReturn = 'I';break;
	}

	return cReturn;
}

// called by FindMatchingWords to determine if the syllable 
// counts of 2 words are OK for matching purposes
BOOL CFPSSpellCheckEngine::IsSylableCountsOK(int iWord, int iRecord)
{
	if (iWord == iRecord)
		return TRUE;

	if (iWord - iRecord == 1)
		return TRUE;

	if (iRecord - iWord == 1)
		return TRUE;

	return FALSE;
}

// similar to MetaphoneEx, but instead uses a custom algorithm
// to generate a quasi-phonetic representation of a word.  The
// output of this algorithm is used by FindMatchingWords to
// augment the metaphone algorithm
void CFPSSpellCheckEngine::PhoneticWord(char *szInput, char *szOutput)
{
	int iInputPos = 0;
	int iOutputPos = 0;
	int iOutputLen = 0;
	int iLen = lstrlen(szInput);
	char szTemp[200];
	char szTempOut[200];
	char cLastChar = 0;
	char cThisChar = 0;
	char cNextChar = 0;

	// make word upper and clear output
	memset(szOutput, 0, 20);
	memset(szTempOut, 0, 20);
	strcpy(szTemp, szInput);
	CharUpper(szTemp);

	// FIRST, PARSE INPUT FOR PHONETICS
	while (iInputPos < iLen && iOutputPos < 8)
	{
		cLastChar = cThisChar;
		cThisChar = szTemp[iInputPos];
		cNextChar = szTemp[iInputPos+1];

		if (cThisChar != cLastChar)
		{
			switch (cThisChar)
			{
			// PROCESS ALL VOWELS FIRST
			case 'A':
			case 'E':
			case 'I':
			case 'O':
			case 'U':
			case 'Y':
				{
					if (iOutputPos == 0)
					//if (!IsVowel(cLastChar))
					{
						szTempOut[iOutputPos] = cThisChar;
						iOutputPos++;
					}
					break;
				}
			// B,D,P,F,T,V can all be followed by 'R'.
			// in this condition, only use the R
			case 'B':
			case 'D':
				{
					if (cNextChar != 'R')
					{
						szTempOut[iOutputPos] = 'D';
						iOutputPos++;
					}
					break;
				}
			case 'P':
			case 'F':
				{
					if (cNextChar != 'R')
					{
						szTempOut[iOutputPos] = 'F';
						iOutputPos++;
					}
					break;
				}
			case 'T':
				{
					if (cNextChar != 'R' && cLastChar != 'F')
					{
						szTempOut[iOutputPos] = 'T';
						iOutputPos++;
					}
					break;
				}
			case 'V':
				{
					if (cNextChar != 'R')
					{
						szTempOut[iOutputPos] = 'F';
						iOutputPos++;
					}
					break;
				}
			case 'L':
				{
					szTempOut[iOutputPos] = 'L';
					iOutputPos++;
					break;
				}
			case 'R':
				{
					szTempOut[iOutputPos] = 'R';
					iOutputPos++;
					break;
				}
			// M & N should always be mapped to same char
			case 'M':
				{
					szTempOut[iOutputPos] = 'M';
					iOutputPos++;
					break;
				}
			case 'N':
				{
					if (cNextChar != 'T' && cNextChar != 'G')
					{
						szTempOut[iOutputPos] = 'M';
						iOutputPos++;
					}
					break;
				}
			// C, K, Q, S should always be same char
			case 'C':
			case 'K':
			case 'Q':
			case 'S':
				{
					if (cLastChar != 'C' && cLastChar != 'K' && cLastChar != 'S')
					{
						szTempOut[iOutputPos] = 'C';
						iOutputPos++;
					}
					break;
				}
			// G and J should always be G (with no duplicate chars)
			case 'G':
			case 'J':
				{
					if (cLastChar != 'G' && cLastChar != 'J')
					{
						szTempOut[iOutputPos] = 'G';
						iOutputPos++;
					}
					break;
				}
			// CHARS WHICH SHOULD ALL BE SILENT, except when first
			case 'W':
			case 'X':
			case 'Z':
			case 'H':
				{
					if (iInputPos == 0)
					{
						szTempOut[iOutputPos] = cThisChar;
						iOutputPos++;
						break;
					}
					break;
				}
			}
		}

		iInputPos++;
	}

	// NOW, BUILD FINAL OUTPUT - THIS SHOULD REMOVE DUPLICATES
	iOutputLen = iOutputPos;
	iInputPos = 0;
	iOutputPos = 0;
	cLastChar = 0;
	cThisChar = 0;
	while (iInputPos < iOutputLen)
	{
		cLastChar = cThisChar;
		cThisChar = szTempOut[iInputPos];

		if (cLastChar != cThisChar)
		{
			szOutput[iOutputPos] = cThisChar;
			iOutputPos++;
		}

		iInputPos++;
	}
}

// this function is used to rebuild the main dictionary file
// from a word file.  The word file should already be sorted 
// in ascending order
void CFPSSpellCheckEngine::BuildMainDIC(LPCSTR lpszWordFile, LPCSTR lpszDICFile)
{
	CWaitCursor cursor;
	CStdioFile InFile;
	CStdioFile OutFile;
	CString strWord;
	char szOutWord[200];
	char szOut[50];
	char szPhonetic[50];
	char szTemp[20];

	if (InFile.Open(lpszWordFile, CFile::modeRead | CFile::shareExclusive))
	{
		if (OutFile.Open(lpszDICFile, CFile::modeCreate | CFile::modeWrite | CFile::shareExclusive))
		{
			while (InFile.ReadString(strWord))
			{
				memset(szOutWord, 0, 6);
				memset(szPhonetic, 0, 10);
				strWord.TrimLeft();
				strWord.TrimRight();

				PhoneticWord(strWord.GetBuffer(strWord.GetLength()), szPhonetic);
				MetaphoneEx(strWord.GetBuffer(strWord.GetLength()), szOutWord);
				_itoa(GetSylableCount(strWord.GetBuffer(strWord.GetLength())), szTemp, 10);

				strWord = strWord.Left(18);

				memset(szOut, 0, 50);
				memcpy(szOut, strWord.GetBuffer(strWord.GetLength()), strWord.GetLength());
				memcpy(&szOut[18], szOutWord, 6);
				memcpy(&szOut[24], szTemp, 2);
				memcpy(&szOut[26], szPhonetic, 8);

				OutFile.Write(szOut, 34);
				OutFile.WriteString("\n");
			}

			OutFile.Close();
		}

		InFile.Close();
	}
}

// this is an alternate version of the GetRecord function 
// which only returns the word from the dictionary, and not
// the metaphone, length and phonetic representations
BOOL CFPSSpellCheckEngine::GetRecord(long lRecord, char *szWord)
{
	long lCachePos = (lRecord - 1) * 18;

	memset(szWord, 0, 19);
	memcpy(szWord, &m_pWordCache[lCachePos], 18);

	return TRUE;
}

// returns the number of vowels in a word
int CFPSSpellCheckEngine::GetVowelCount(char* szWord)
{
	int iCount = 0;
	int iLen = lstrlen(szWord);

	for (int iPos = 0; iPos < iLen; iPos++)
	{
		if (IsVowel(szWord[iPos]))
			iCount++;
	}

	return iCount;
}

// creates an output based on an input word, but removes
// all vowels
void CFPSSpellCheckEngine::RemoveVowels(char *szInput, char *szOutput)
{
	int iInLen = lstrlen(szInput);
	int iInPos = 0;
	int iOutPos = 0;

	memset(szOutput, 0, iInLen);

	for (iInPos = 0; iInPos < iInLen; iInPos++)
	{
		if (!IsVowel(szInput[iInPos]))
		{
			szOutput[iOutPos] = szInput[iInPos];
			iOutPos++;
		}
	}
}

// caches the words in the main dictionary
void CFPSSpellCheckEngine::CacheWords()
{
	long lRecord = 0;
	long lCachePos = 0;
	char szWord[50];
	char szMetaphone[50];
	int iSylables;
	char szPhonetic[50];

	if (m_pWordCache)
		free (m_pWordCache);

	m_pWordCache = (char*)malloc(GetRecordCount() * 18);
	if (!m_pWordCache)
		return;

	for (lRecord = 1; lRecord <= GetRecordCount(); lRecord++)
	{
		GetRecord(lRecord, szWord, szMetaphone, iSylables, szPhonetic);

		memcpy(&m_pWordCache[lCachePos], szWord, 18);
		lCachePos += 18;
	}
}

// this function is called by the FindMatchingWords function
// to determine if 2 metaphone representations of a word
// match based on the spell checking engines requirements
BOOL CFPSSpellCheckEngine::IsMetaphoneMatch(char *szMeta1, char *szMeta2)
{
	BOOL bReturn = FALSE;

	// if the first letter does not match, but the remainder matches, OK
	if (strcmp(&szMeta1[1], &szMeta2[1]) == 0)
	{
		// various combinations of letters which when
		// first, and remainder of metaphone matches,
		// match=OK
		if (szMeta1[0] == 'K' && szMeta2[0] == 'C')
			bReturn = TRUE;
		if (szMeta1[0] == 'C' && szMeta2[0] == 'K')
			bReturn = TRUE;
		if (szMeta1[0] == 'P' && szMeta2[0] == 'F')
			bReturn = TRUE;
		if (szMeta1[0] == 'F' && szMeta2[0] == 'P')
			bReturn = TRUE;
		if (szMeta1[0] == 'S' && szMeta2[0] == 'C')
			bReturn = TRUE;
		if (szMeta1[0] == 'C' && szMeta2[0] == 'S')
			bReturn = TRUE;
	}

	return bReturn;
}

// checks 2 words to determine if they match enough
// to quialify for the spell checking engines
// requirements
BOOL CFPSSpellCheckEngine::IsWordMatch(char *szWord1, int iWord1Len, char *szWord2, int iWord2Len)
{
	BOOL bReturn = FALSE;
	char szCopy2[50];

	strcpy(szCopy2, szWord2);
	CharUpper(szCopy2);

	// check for one letter differences on words with same len more than 4 chars
	if ((!bReturn) && (iWord1Len == iWord2Len) && (iWord1Len > 2))
	{
		int iDiffCount = 0;
		int iPos = 0;

		while (iPos < iWord1Len && iDiffCount < 2)
		{
			if (szWord1[iPos] != szCopy2[iPos])
				iDiffCount++;
			
			iPos++;
		}

		if (iDiffCount == 1)
			bReturn = TRUE;
	}

	// if not same len, check for missing 'l' in 'ly' ending
	if (szCopy2[iWord2Len-1] == 'Y' && szWord1[iWord1Len-2] != 'L')
	{
		//if (memcmp(szWord1, szCopy2, iWord2Len-2) == 0)
		//	bReturn = TRUE;
	}

	return bReturn;
}

BOOL CFPSSpellCheckEngine::SaveWordFile(LPCSTR lpszFileName)
{
	CStdioFile fp;
	BOOL bReturn = TRUE;
	char szWord[50];
	long lRecordCount = GetRecordCount();

	try
	{
		if (fp.Open(lpszFileName, CFile::modeWrite | CFile::modeCreate | CFile::shareExclusive))
		{
			for (long lRecord = 1; lRecord <= lRecordCount; lRecord++)
			{
				GetRecord(lRecord, szWord);

				fp.WriteString(CString(szWord));
				fp.WriteString("\n");
			}


			fp.Close();
		}
		else
		{
			bReturn = FALSE;
		}
	}
	catch(...)
	{
		bReturn = FALSE;
	}

	return bReturn;

}

BOOL CFPSSpellCheckEngine::CheckEndings(LPCSTR lpszW1, int iLen1, LPCSTR lpszW2, int iLen2)
{
	BOOL bReturn = TRUE;

	if (lpszW1[iLen1-1] == 'S' || lpszW1[iLen1-1] == 's')
	{
		if (lpszW2[iLen2-1] != 'S' && lpszW2[iLen2-1] != 's' && lpszW2[iLen2-1] != 'D' && lpszW2[iLen2-1] != 'd')
			bReturn = FALSE;
	}
	else if (lpszW1[iLen1-1] == 'D' || lpszW1[iLen1-1] == 'd')
	{
		if (lpszW2[iLen2-1] != 'D' && lpszW2[iLen2-1] != 'd' && lpszW2[iLen2-1] != 'S' && lpszW2[iLen2-1] != 's')
			bReturn = FALSE;
	}

	return bReturn;
}

void CFPSSpellCheckEngine::RemoveVowelsAndDups(char *szInput, char *szOutput)
{
	int iInLen = lstrlen(szInput);
	int iInPos = 0;
	int iOutPos = 0;
	char cLastChar = 0;

	memset(szOutput, 0, iInLen);

	for (iInPos = 0; iInPos < iInLen; iInPos++)
	{
		if (!IsVowel(szInput[iInPos]))
		{
			if (cLastChar != szInput[iInPos])
			{
				szOutput[iOutPos] = szInput[iInPos];
				iOutPos++;
			}
		}

		cLastChar = szInput[iInPos];
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions